# Training configuration for RTX 4090 (24GB VRAM) with IMAGE-LEVEL splits
#
# Combines RTX 4090 hardware optimizations with image-level splitting and
# gentler contrastive learning for better generalization.
#
# Usage:
#   python train_clip_logo.py --config configs/cloud_rtx4090_image_split.yaml
#
# Estimated training time: 5-7 hours (more epochs than logo-level)
# Estimated cost on RunPod: ~$4
---
# Base model
base_model: "openai/clip-vit-large-patch14"

# Dataset paths
dataset_dir: "LogoDet-3K"
reference_dir: "reference_logos"
db_path: "test_data_mapping.db"

# Data split configuration - IMAGE LEVEL
# Each logo brand will have images in all splits, allowing the model
# to see some examples of each brand during training.
# The three fractions below sum to 1.0 (0.7 + 0.15 + 0.15).
split_level: "image"
train_split: 0.7
val_split: 0.15
test_split: 0.15

# Larger batches for faster training on 24GB VRAM
# (review) logos_per_batch x samples_per_logo = 32 x 4 = 128 samples,
# while batch_size is 32 - confirm which of the two the sampler uses.
batch_size: 32
logos_per_batch: 32
samples_per_logo: 4
gradient_accumulation_steps: 4  # Effective batch = 128
num_workers: 8

# Model architecture
lora_r: 16
lora_alpha: 32  # alpha / r = 2
lora_dropout: 0.1
freeze_layers: 12
use_gradient_checkpointing: true

# Training - GENTLER settings for better generalization
# Keep the decimal point and signed exponent in learning_rate: YAML 1.1
# loaders such as PyYAML resolve a bare `5e-6` as a string, not a float.
learning_rate: 5.0e-6  # Reduced from 1e-5
weight_decay: 0.01
warmup_steps: 500
max_epochs: 30  # More epochs with slower learning
mixed_precision: true

# Loss - HIGHER temperature for softer contrastive learning
# (review) triplet_margin is presumably ignored while loss_type is
# "infonce" - confirm against the training script.
temperature: 0.15  # Increased from 0.07
loss_type: "infonce"
triplet_margin: 0.2  # Reduced from 0.3

# Early stopping - more patience with gentler learning
patience: 7
min_delta: 0.001

# Output - separate directory for image-split model
checkpoint_dir: "checkpoints_image_split"
output_dir: "models/logo_detection/clip_finetuned_image_split"
save_every_n_epochs: 2  # Save frequently for cloud

# Logging
log_every_n_steps: 10
eval_every_n_epochs: 1

# Seed and sampling/augmentation toggles
seed: 42
use_hard_negatives: false
use_augmentation: true
augmentation_strength: "medium"