Add CLIP fine-tuning pipeline for logo recognition
Implement contrastive learning with LoRA to fine-tune CLIP's vision encoder on LogoDet-3K dataset for improved logo embedding similarity. New training module (training/): - config.py: TrainingConfig dataclass with all hyperparameters - dataset.py: LogoContrastiveDataset with logo-level splits - model.py: LogoFineTunedCLIP wrapper with LoRA support - losses.py: InfoNCE, TripletLoss, SupConLoss implementations - trainer.py: Training loop with mixed precision and checkpointing - evaluation.py: EmbeddingEvaluator for validation metrics New scripts: - train_clip_logo.py: Main training entry point - export_model.py: Export to HuggingFace-compatible format Configurations: - configs/jetson_orin.yaml: Optimized for Jetson Orin AGX - configs/cloud_rtx4090.yaml: Optimized for 24GB cloud GPUs - configs/cloud_a100.yaml: Optimized for 80GB cloud GPUs Documentation: - CLIP_FINETUNING.md: Training guide and usage instructions - CLOUD_TRAINING.md: Cloud GPU recommendations and cost estimates Modified: - logo_detection_detr.py: Add fine-tuned model loading support - pyproject.toml: Add peft, pyyaml, torchvision dependencies
This commit is contained in:
76
configs/jetson_orin.yaml
Normal file
76
configs/jetson_orin.yaml
Normal file
@ -0,0 +1,76 @@
|
||||
# Training configuration optimized for Jetson Orin AGX (~64GB shared memory)
|
||||
#
|
||||
# Usage:
|
||||
# uv run python train_clip_logo.py --config configs/jetson_orin.yaml
|
||||
|
||||
# Base model
|
||||
base_model: "openai/clip-vit-large-patch14"
|
||||
|
||||
# Dataset paths (relative to project root)
|
||||
dataset_dir: "LogoDet-3K"
|
||||
reference_dir: "reference_logos"
|
||||
db_path: "test_data_mapping.db"
|
||||
|
||||
# Data split ratios (logo-level split for generalization testing)
|
||||
train_split: 0.7
|
||||
val_split: 0.15
|
||||
test_split: 0.15
|
||||
|
||||
# Batch construction
|
||||
# - batch_size: Number of batches loaded at once (keep low for memory)
|
||||
# - logos_per_batch: Different logo classes per contrastive batch
|
||||
# - samples_per_logo: Samples of each logo (creates positive pairs)
|
||||
# - Effective samples per step = logos_per_batch * samples_per_logo = 128
|
||||
batch_size: 16
|
||||
logos_per_batch: 32
|
||||
samples_per_logo: 4
|
||||
gradient_accumulation_steps: 8 # Effective batch = 128
|
||||
num_workers: 4
|
||||
|
||||
# Model architecture
|
||||
# LoRA enables memory-efficient fine-tuning by training low-rank adapters
|
||||
# instead of full model weights
|
||||
lora_r: 16 # LoRA rank (0 to disable)
|
||||
lora_alpha: 32 # LoRA scaling factor
|
||||
lora_dropout: 0.1 # Dropout in LoRA layers
|
||||
freeze_layers: 12 # Freeze first 12 of 24 transformer layers
|
||||
use_gradient_checkpointing: true # Trade compute for memory
|
||||
|
||||
# Training hyperparameters
|
||||
learning_rate: 1.0e-5 # Conservative LR for fine-tuning
|
||||
weight_decay: 0.01 # L2 regularization
|
||||
warmup_steps: 500 # LR warmup steps
|
||||
max_epochs: 20 # Maximum training epochs
|
||||
mixed_precision: true # FP16 training for memory efficiency
|
||||
|
||||
# Loss function
|
||||
# InfoNCE is the contrastive loss used in CLIP training
|
||||
temperature: 0.07 # Similarity scaling (0.05-0.1 typical)
|
||||
loss_type: "infonce" # Options: infonce, supcon, triplet, combined
|
||||
triplet_margin: 0.3 # Only used if loss_type is triplet
|
||||
|
||||
# Early stopping
|
||||
patience: 5 # Stop if no improvement for N epochs
|
||||
min_delta: 0.001 # Minimum improvement threshold
|
||||
|
||||
# Checkpoints and output
|
||||
checkpoint_dir: "checkpoints"
|
||||
output_dir: "models/logo_detection/clip_finetuned"
|
||||
save_every_n_epochs: 5
|
||||
|
||||
# Logging
|
||||
log_every_n_steps: 10
|
||||
eval_every_n_epochs: 1
|
||||
|
||||
# Reproducibility
|
||||
seed: 42
|
||||
|
||||
# Hard negative mining (advanced)
|
||||
# Enable after initial training epochs for harder examples
|
||||
use_hard_negatives: false
|
||||
hard_negative_start_epoch: 5
|
||||
hard_negatives_per_logo: 10
|
||||
|
||||
# Data augmentation
|
||||
use_augmentation: true
|
||||
augmentation_strength: "medium" # light, medium, or strong
|
||||
Reference in New Issue
Block a user