Implement contrastive learning with LoRA to fine-tune CLIP's vision encoder on LogoDet-3K dataset for improved logo embedding similarity. New training module (training/): - config.py: TrainingConfig dataclass with all hyperparameters - dataset.py: LogoContrastiveDataset with logo-level splits - model.py: LogoFineTunedCLIP wrapper with LoRA support - losses.py: InfoNCE, TripletLoss, SupConLoss implementations - trainer.py: Training loop with mixed precision and checkpointing - evaluation.py: EmbeddingEvaluator for validation metrics New scripts: - train_clip_logo.py: Main training entry point - export_model.py: Export to HuggingFace-compatible format Configurations: - configs/jetson_orin.yaml: Optimized for Jetson Orin AGX - configs/cloud_rtx4090.yaml: Optimized for 24GB cloud GPUs - configs/cloud_a100.yaml: Optimized for 80GB cloud GPUs Documentation: - CLIP_FINETUNING.md: Training guide and usage instructions - CLOUD_TRAINING.md: Cloud GPU recommendations and cost estimates Modified: - logo_detection_detr.py: Add fine-tuned model loading support - pyproject.toml: Add peft, pyyaml, torchvision dependencies
65 lines
1.3 KiB
YAML
65 lines
1.3 KiB
YAML
# Training configuration optimized for cloud A100 / H100 (80GB VRAM)
|
|
#
|
|
# Usage:
|
|
# python train_clip_logo.py --config configs/cloud_a100.yaml
|
|
#
|
|
# Estimated training time: 1.5-3 hours
|
|
# Estimated cost on RunPod: ~$3-6
|
|
|
|
# Base model
|
|
base_model: "openai/clip-vit-large-patch14"
|
|
|
|
# Dataset paths
|
|
dataset_dir: "LogoDet-3K"
|
|
reference_dir: "reference_logos"
|
|
db_path: "test_data_mapping.db"
|
|
|
|
# Data splits
|
|
train_split: 0.7
|
|
val_split: 0.15
|
|
test_split: 0.15
|
|
|
|
# Maximum batch sizes for 80GB VRAM
|
|
batch_size: 64
|
|
logos_per_batch: 32
|
|
samples_per_logo: 4
|
|
gradient_accumulation_steps: 2 # Effective batch = 128
|
|
num_workers: 8
|
|
|
|
# Model architecture (no gradient checkpointing needed with 80GB)
|
|
lora_r: 16
|
|
lora_alpha: 32
|
|
lora_dropout: 0.1
|
|
freeze_layers: 12
|
|
use_gradient_checkpointing: false
|
|
|
|
# Training
|
|
learning_rate: 1.0e-5
|
|
weight_decay: 0.01
|
|
warmup_steps: 500
|
|
max_epochs: 20
|
|
mixed_precision: true
|
|
|
|
# Loss
|
|
temperature: 0.07
|
|
loss_type: "infonce"
|
|
triplet_margin: 0.3
|
|
|
|
# Early stopping
|
|
patience: 5
|
|
min_delta: 0.001
|
|
|
|
# Output
|
|
checkpoint_dir: "checkpoints"
|
|
output_dir: "models/logo_detection/clip_finetuned"
|
|
save_every_n_epochs: 2 # Save more frequently for cloud
|
|
|
|
# Logging
|
|
log_every_n_steps: 10
|
|
eval_every_n_epochs: 1
|
|
|
|
seed: 42
|
|
use_hard_negatives: false
|
|
use_augmentation: true
|
|
augmentation_strength: "medium"
|