Add CLIP fine-tuning pipeline for logo recognition
Implement contrastive learning with LoRA to fine-tune CLIP's vision encoder on the LogoDet-3K dataset for improved logo embedding similarity.

New training module (training/):
- config.py: TrainingConfig dataclass with all hyperparameters
- dataset.py: LogoContrastiveDataset with logo-level splits
- model.py: LogoFineTunedCLIP wrapper with LoRA support
- losses.py: InfoNCE, TripletLoss, SupConLoss implementations
- trainer.py: Training loop with mixed precision and checkpointing
- evaluation.py: EmbeddingEvaluator for validation metrics

New scripts:
- train_clip_logo.py: Main training entry point
- export_model.py: Export to HuggingFace-compatible format

Configurations:
- configs/jetson_orin.yaml: Optimized for Jetson Orin AGX
- configs/cloud_rtx4090.yaml: Optimized for 24GB cloud GPUs
- configs/cloud_a100.yaml: Optimized for 80GB cloud GPUs

Documentation:
- CLIP_FINETUNING.md: Training guide and usage instructions
- CLOUD_TRAINING.md: Cloud GPU recommendations and cost estimates

Modified:
- logo_detection_detr.py: Add fine-tuned model loading support
- pyproject.toml: Add peft, pyyaml, torchvision dependencies
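The LoRA knobs in the config below (lora_r, lora_alpha, lora_dropout) are consumed via the new peft dependency. The committed model.py (LogoFineTunedCLIP) is not shown in this excerpt, so the following is only a rough sketch of how such adapters typically attach to CLIP's vision encoder with peft; the target_modules choice and the wiring are assumptions, not the committed implementation:

    # Hypothetical sketch: attaching LoRA adapters to CLIP's vision encoder.
    # Not the committed model.py; target_modules is an assumed choice.
    from peft import LoraConfig, get_peft_model
    from transformers import CLIPVisionModel

    vision = CLIPVisionModel.from_pretrained("openai/clip-vit-large-patch14")
    lora_cfg = LoraConfig(
        r=16,            # corresponds to TrainingConfig.lora_r
        lora_alpha=32,   # corresponds to TrainingConfig.lora_alpha
        lora_dropout=0.1,
        target_modules=["q_proj", "v_proj"],  # attention projections (assumed)
    )
    vision = get_peft_model(vision, lora_cfg)
    vision.print_trainable_parameters()  # only the low-rank adapters are trainable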
training/config.py | +141 (new file)
@@ -0,0 +1,141 @@
"""
Training configuration for CLIP fine-tuning.
"""

from dataclasses import dataclass, field
from pathlib import Path
from typing import List, Optional

import yaml


@dataclass
class TrainingConfig:
    """Configuration for CLIP logo fine-tuning."""

    # Base model
    base_model: str = "openai/clip-vit-large-patch14"

    # Dataset paths
    dataset_dir: str = "LogoDet-3K"
    reference_dir: str = "reference_logos"
    db_path: str = "test_data_mapping.db"

    # Data split ratios
    train_split: float = 0.7
    val_split: float = 0.15
    test_split: float = 0.15

    # Batch construction
    batch_size: int = 16
    logos_per_batch: int = 32
    samples_per_logo: int = 4
    gradient_accumulation_steps: int = 8
    num_workers: int = 4

    # Model architecture
    lora_r: int = 16
    lora_alpha: int = 32
    lora_dropout: float = 0.1
    freeze_layers: int = 12
    use_gradient_checkpointing: bool = True

    # Training hyperparameters
    learning_rate: float = 1e-5
    weight_decay: float = 0.01
    warmup_steps: int = 500
    max_epochs: int = 20
    mixed_precision: bool = True

    # Loss function
    temperature: float = 0.07
    loss_type: str = "infonce"  # "infonce" or "triplet"
    triplet_margin: float = 0.3

    # Early stopping
    patience: int = 5
    min_delta: float = 0.001

    # Checkpoints and output
    checkpoint_dir: str = "checkpoints"
    output_dir: str = "models/logo_detection/clip_finetuned"
    save_every_n_epochs: int = 5

    # Logging
    log_every_n_steps: int = 10
    eval_every_n_epochs: int = 1

    # Random seed for reproducibility
    seed: int = 42

    # Hard negative mining
    use_hard_negatives: bool = False
    hard_negative_start_epoch: int = 5
    hard_negatives_per_logo: int = 10

    # Data augmentation
    use_augmentation: bool = True
    augmentation_strength: str = "medium"  # "light", "medium", "strong"

    @classmethod
    def from_yaml(cls, yaml_path: str) -> "TrainingConfig":
        """Load configuration from YAML file."""
        with open(yaml_path, "r") as f:
            config_dict = yaml.safe_load(f)
        return cls(**config_dict)

    def to_yaml(self, yaml_path: str) -> None:
        """Save configuration to YAML file."""
        Path(yaml_path).parent.mkdir(parents=True, exist_ok=True)
        with open(yaml_path, "w") as f:
            yaml.dump(self.__dict__, f, default_flow_style=False, sort_keys=False)

    def validate(self) -> List[str]:
        """Validate configuration and return list of warnings."""
        warnings = []

        # Check split ratios
        total_split = self.train_split + self.val_split + self.test_split
        if abs(total_split - 1.0) > 0.01:
            warnings.append(
                f"Split ratios sum to {total_split}, expected 1.0"
            )

        # Check batch construction
        effective_batch = self.batch_size * self.gradient_accumulation_steps
        if effective_batch < 64:
            warnings.append(
                f"Effective batch size ({effective_batch}) is small for contrastive learning. "
                "Consider increasing batch_size or gradient_accumulation_steps."
            )

        # Check LoRA config
        if self.lora_r > 0 and self.lora_alpha < self.lora_r:
            warnings.append(
                f"lora_alpha ({self.lora_alpha}) < lora_r ({self.lora_r}). "
                "This may reduce LoRA effectiveness."
            )

        # Check freeze layers
        if self.freeze_layers < 0:
            warnings.append("freeze_layers should be >= 0")

        # Check temperature
        if self.temperature <= 0:
            warnings.append("temperature must be positive")
        elif self.temperature > 1.0:
            warnings.append(
                f"temperature ({self.temperature}) is high. "
                "Typical values are 0.05-0.1."
            )

        return warnings

    @property
    def effective_batch_size(self) -> int:
        """Calculate effective batch size with gradient accumulation."""
        return self.batch_size * self.gradient_accumulation_steps

    @property
    def samples_per_batch(self) -> int:
        """Total samples in one batch (logos_per_batch * samples_per_logo)."""
        return self.logos_per_batch * self.samples_per_logo
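For reference, a quick usage sketch of the class above; the YAML and checkpoint paths here are illustrative, though configs/cloud_rtx4090.yaml is named in the commit message:

    # Usage sketch for TrainingConfig (paths are illustrative).
    config = TrainingConfig.from_yaml("configs/cloud_rtx4090.yaml")

    for warning in config.validate():
        print(f"[config] {warning}")

    # With the defaults above: 16 * 8 = 128
    print(config.effective_batch_size)

    # Snapshot the resolved config alongside checkpoints for reproducibility
    config.to_yaml("checkpoints/run_001/config.yaml")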
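The temperature and loss_type fields feed the contrastive objective in training/losses.py, which is not part of this diff. As a minimal sketch of what an InfoNCE loss over paired logo embeddings typically looks like (an illustration of the role of temperature, not the committed implementation):

    # Minimal InfoNCE sketch over paired embeddings; illustrates
    # TrainingConfig.temperature. Not the committed training/losses.py.
    import torch
    import torch.nn.functional as F

    def info_nce(anchors: torch.Tensor, positives: torch.Tensor,
                 temperature: float = 0.07) -> torch.Tensor:
        """anchors/positives: (N, D) embeddings; row i of each is the same logo."""
        anchors = F.normalize(anchors, dim=-1)
        positives = F.normalize(positives, dim=-1)
        # (N, N) cosine similarities, sharpened by the temperature
        logits = anchors @ positives.t() / temperature
        # Diagonal entries are the matching pairs (the positives)
        targets = torch.arange(anchors.size(0), device=anchors.device)
        return F.cross_entropy(logits, targets)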