Add CLIP fine-tuning pipeline for logo recognition
Implement contrastive learning with LoRA to fine-tune CLIP's vision encoder on LogoDet-3K dataset for improved logo embedding similarity. New training module (training/): - config.py: TrainingConfig dataclass with all hyperparameters - dataset.py: LogoContrastiveDataset with logo-level splits - model.py: LogoFineTunedCLIP wrapper with LoRA support - losses.py: InfoNCE, TripletLoss, SupConLoss implementations - trainer.py: Training loop with mixed precision and checkpointing - evaluation.py: EmbeddingEvaluator for validation metrics New scripts: - train_clip_logo.py: Main training entry point - export_model.py: Export to HuggingFace-compatible format Configurations: - configs/jetson_orin.yaml: Optimized for Jetson Orin AGX - configs/cloud_rtx4090.yaml: Optimized for 24GB cloud GPUs - configs/cloud_a100.yaml: Optimized for 80GB cloud GPUs Documentation: - CLIP_FINETUNING.md: Training guide and usage instructions - CLOUD_TRAINING.md: Cloud GPU recommendations and cost estimates Modified: - logo_detection_detr.py: Add fine-tuned model loading support - pyproject.toml: Add peft, pyyaml, torchvision dependencies
This commit is contained in:
335
training/model.py
Normal file
335
training/model.py
Normal file
@ -0,0 +1,335 @@
|
||||
"""
|
||||
Fine-tunable CLIP model wrapper with LoRA support.
|
||||
"""
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from transformers import CLIPModel, CLIPProcessor
|
||||
|
||||
# Check if peft is available for LoRA
|
||||
try:
    from peft import LoraConfig, PeftModel, get_peft_model
except ImportError:
    # peft is optional: without it, LoRA fine-tuning is disabled and the
    # names are stubbed out so attribute access stays well-defined.
    PEFT_AVAILABLE = False
    LoraConfig = get_peft_model = PeftModel = None
else:
    PEFT_AVAILABLE = True
|
||||
|
||||
|
||||
class LogoFineTunedCLIP(nn.Module):
    """
    CLIP vision encoder fine-tuned for logo similarity.

    Preserves the embedding interface expected by DetectLogosDETR:
    - Same embedding dimensionality (768 for ViT-L/14)
    - L2-normalized outputs
    - Works with the existing get_image_features() pattern

    Supports:
    - LoRA adapters for memory-efficient fine-tuning (requires peft)
    - Freezing lower transformer layers for transfer learning
    - Gradient checkpointing for memory optimization
    """

    def __init__(
        self,
        vision_model: nn.Module,
        lora_r: int = 16,
        lora_alpha: int = 32,
        lora_dropout: float = 0.1,
        freeze_layers: int = 12,
        use_gradient_checkpointing: bool = True,
        add_projection_head: bool = True,
    ):
        """
        Initialize the fine-tunable CLIP wrapper.

        Args:
            vision_model: CLIP vision model (CLIPVisionModel)
            lora_r: Rank of LoRA low-rank matrices (0 to disable)
            lora_alpha: LoRA scaling factor
            lora_dropout: Dropout for LoRA layers
            freeze_layers: Number of transformer layers to freeze (from bottom)
            use_gradient_checkpointing: Enable gradient checkpointing
            add_projection_head: Add trainable projection head
        """
        super().__init__()

        self.vision_model = vision_model
        self.embedding_dim = vision_model.config.hidden_size
        self.freeze_layers = freeze_layers
        self.lora_r = lora_r
        self.lora_alpha = lora_alpha

        # Enable gradient checkpointing for memory efficiency (no-op when
        # the backbone does not expose the hook).
        if use_gradient_checkpointing:
            if hasattr(self.vision_model, "gradient_checkpointing_enable"):
                self.vision_model.gradient_checkpointing_enable()

        # Freeze embeddings and the bottom `freeze_layers` encoder layers.
        self._freeze_layers(freeze_layers)

        # Apply LoRA to attention projections in the upper (unfrozen) blocks.
        self.peft_applied = False
        if PEFT_AVAILABLE and lora_r > 0:
            self._apply_lora(lora_r, lora_alpha, lora_dropout)
            self.peft_applied = True
        elif lora_r > 0 and not PEFT_AVAILABLE:
            print(
                "Warning: peft not installed. LoRA disabled. "
                "Install with: pip install peft"
            )

        # Optional trainable projection head on top of the (mostly frozen) trunk.
        self.add_projection_head = add_projection_head
        if add_projection_head:
            self.projection = nn.Sequential(
                nn.Linear(self.embedding_dim, self.embedding_dim),
                nn.LayerNorm(self.embedding_dim),
            )
        else:
            self.projection = nn.Identity()

    def _freeze_layers(self, num_layers: int) -> None:
        """Freeze the first `num_layers` transformer layers and the embeddings."""
        if num_layers <= 0:
            return

        # Patch/position embeddings never need gradients in transfer mode.
        if hasattr(self.vision_model, "embeddings"):
            for param in self.vision_model.embeddings.parameters():
                param.requires_grad = False

        # Freeze the specified number of encoder layers, counted from the bottom.
        if hasattr(self.vision_model, "encoder"):
            for i, layer in enumerate(self.vision_model.encoder.layers):
                if i < num_layers:
                    for param in layer.parameters():
                        param.requires_grad = False

    def _apply_lora(
        self,
        r: int,
        alpha: int,
        dropout: float,
    ) -> None:
        """Apply LoRA adapters to the q/v attention projections."""
        if not PEFT_AVAILABLE:
            return

        # Configure LoRA for the vision transformer.
        lora_config = LoraConfig(
            r=r,
            lora_alpha=alpha,
            lora_dropout=dropout,
            target_modules=["q_proj", "v_proj"],
            bias="none",
            modules_to_save=[],  # Don't save any full modules
        )

        self.vision_model = get_peft_model(self.vision_model, lora_config)

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        """
        Extract normalized embeddings for logo images.

        Args:
            pixel_values: [batch, 3, 224, 224] preprocessed images

        Returns:
            embeddings: [batch, embedding_dim] L2-normalized
        """
        outputs = self.vision_model(pixel_values=pixel_values)

        # Prefer the pooled CLS representation; fall back to the raw CLS
        # token when the backbone exposes no pooler.
        if hasattr(outputs, "pooler_output") and outputs.pooler_output is not None:
            features = outputs.pooler_output
        else:
            features = outputs.last_hidden_state[:, 0, :]

        features = self.projection(features)

        # L2-normalize so downstream cosine similarity is a plain dot product.
        features = F.normalize(features, dim=-1)

        return features

    def get_image_features(self, **kwargs) -> torch.Tensor:
        """
        Compatibility method matching CLIP's interface.

        Used by DetectLogosDETR._get_embedding_pil(); expects a
        `pixel_values` keyword argument.
        """
        return self.forward(kwargs["pixel_values"])

    def get_trainable_parameters(self) -> List[torch.nn.Parameter]:
        """Return the list of parameters with requires_grad=True."""
        return [p for p in self.parameters() if p.requires_grad]

    def get_parameter_count(self) -> Dict[str, int]:
        """Return counts of trainable, frozen, and total parameters."""
        total = sum(p.numel() for p in self.parameters())
        trainable = sum(p.numel() for p in self.parameters() if p.requires_grad)
        return {
            "total": total,
            "trainable": trainable,
            "frozen": total - trainable,
            "trainable_percent": 100 * trainable / total if total > 0 else 0,
        }

    def save_pretrained(self, output_dir: str) -> None:
        """
        Save model in HuggingFace-compatible format.

        Args:
            output_dir: Directory to save model files
        """
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)

        if self.peft_applied and PEFT_AVAILABLE:
            # LoRA adapters and projection head are saved separately so the
            # large frozen base model is never duplicated on disk.
            self.vision_model.save_pretrained(output_path / "vision_lora")
            torch.save(
                self.projection.state_dict(),
                output_path / "projection_head.bin",
            )
        else:
            # Save full model state.
            torch.save(self.state_dict(), output_path / "pytorch_model.bin")

        # Persist the hyperparameters from_pretrained() needs to rebuild
        # an architecturally identical wrapper.
        config = {
            "model_type": "clip_logo_finetuned",
            "embedding_dim": self.embedding_dim,
            "lora_r": self.lora_r,
            "lora_alpha": self.lora_alpha,
            "freeze_layers": self.freeze_layers,
            "add_projection_head": self.add_projection_head,
            "peft_applied": self.peft_applied,
        }

        with open(output_path / "config.json", "w") as f:
            json.dump(config, f, indent=2)

    @classmethod
    def from_pretrained(
        cls,
        model_path: str,
        base_model: str = "openai/clip-vit-large-patch14",
        device: Optional[torch.device] = None,
    ) -> "LogoFineTunedCLIP":
        """
        Load a fine-tuned model from saved weights.

        Args:
            model_path: Path to saved model directory
            base_model: Base CLIP model name (for architecture)
            device: Device to load model on

        Returns:
            Loaded LogoFineTunedCLIP model
        """
        model_path = Path(model_path)

        # Load config.
        with open(model_path / "config.json", "r") as f:
            config = json.load(f)

        # Load base CLIP model for the architecture.
        clip_model = CLIPModel.from_pretrained(base_model)

        peft_applied = config.get("peft_applied", False) and PEFT_AVAILABLE

        # BUGFIX: when trained LoRA weights exist, the wrapper must be built
        # WITHOUT fresh adapters (lora_r=0). Instantiating with lora_r > 0
        # would apply new, untrained adapters in __init__ and then
        # PeftModel.from_pretrained below would wrap the backbone a second
        # time, corrupting the loaded model.
        model = cls(
            vision_model=clip_model.vision_model,
            lora_r=0 if peft_applied else config.get("lora_r", 0),
            lora_alpha=config.get("lora_alpha", 1),
            freeze_layers=config.get("freeze_layers", 12),
            add_projection_head=config.get("add_projection_head", True),
            use_gradient_checkpointing=False,  # Not needed for inference
        )

        if peft_applied:
            # Attach the trained LoRA adapters.
            lora_path = model_path / "vision_lora"
            if lora_path.exists():
                model.vision_model = PeftModel.from_pretrained(
                    model.vision_model, lora_path
                )
                model.peft_applied = True
            # Restore recorded LoRA hyperparameters so a later
            # save_pretrained() round-trips the config faithfully.
            model.lora_r = config.get("lora_r", 0)
            model.lora_alpha = config.get("lora_alpha", 1)
            # Load projection head; map_location keeps GPU-saved
            # checkpoints loadable on CPU-only hosts.
            proj_path = model_path / "projection_head.bin"
            if proj_path.exists():
                model.projection.load_state_dict(
                    torch.load(proj_path, map_location="cpu")
                )
        else:
            # Load full model state (CPU first; moved to `device` below).
            weights_path = model_path / "pytorch_model.bin"
            if weights_path.exists():
                model.load_state_dict(
                    torch.load(weights_path, map_location="cpu")
                )

        if device is not None:
            model = model.to(device)

        return model
|
||||
|
||||
|
||||
def create_model(
    base_model: str = "openai/clip-vit-large-patch14",
    lora_r: int = 16,
    lora_alpha: int = 32,
    lora_dropout: float = 0.1,
    freeze_layers: int = 12,
    use_gradient_checkpointing: bool = True,
    device: Optional[torch.device] = None,
) -> Tuple[LogoFineTunedCLIP, CLIPProcessor]:
    """
    Create a fine-tunable CLIP model and processor.

    Args:
        base_model: HuggingFace model name or path
        lora_r: LoRA rank (0 to disable)
        lora_alpha: LoRA scaling factor
        lora_dropout: LoRA dropout
        freeze_layers: Number of layers to freeze
        use_gradient_checkpointing: Enable gradient checkpointing
        device: Device to load model on

    Returns:
        Tuple of (model, processor)
    """
    # Load base CLIP model and its matching preprocessor.
    clip_model = CLIPModel.from_pretrained(base_model)
    processor = CLIPProcessor.from_pretrained(base_model)

    # Wrap only the vision tower; the text tower is not needed for
    # logo-embedding fine-tuning.
    model = LogoFineTunedCLIP(
        vision_model=clip_model.vision_model,
        lora_r=lora_r,
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        freeze_layers=freeze_layers,
        use_gradient_checkpointing=use_gradient_checkpointing,
    )

    if device is not None:
        model = model.to(device)

    # Print parameter info so training logs record the trainable footprint.
    param_info = model.get_parameter_count()
    print("Model created:")
    print(f"  Total parameters: {param_info['total']:,}")
    print(f"  Trainable: {param_info['trainable']:,} ({param_info['trainable_percent']:.2f}%)")
    print(f"  Frozen: {param_info['frozen']:,}")

    return model, processor
|
||||
Reference in New Issue
Block a user