Add CLIP fine-tuning pipeline for logo recognition
Implement contrastive learning with LoRA to fine-tune CLIP's vision encoder
on the LogoDet-3K dataset for improved logo embedding similarity.

New training module (training/):
- config.py: TrainingConfig dataclass with all hyperparameters
- dataset.py: LogoContrastiveDataset with logo-level splits
- model.py: LogoFineTunedCLIP wrapper with LoRA support
- losses.py: InfoNCE, TripletLoss, SupConLoss implementations
- trainer.py: Training loop with mixed precision and checkpointing
- evaluation.py: EmbeddingEvaluator for validation metrics

New scripts:
- train_clip_logo.py: Main training entry point
- export_model.py: Export to HuggingFace-compatible format

Configurations:
- configs/jetson_orin.yaml: Optimized for Jetson Orin AGX
- configs/cloud_rtx4090.yaml: Optimized for 24GB cloud GPUs
- configs/cloud_a100.yaml: Optimized for 80GB cloud GPUs

Documentation:
- CLIP_FINETUNING.md: Training guide and usage instructions
- CLOUD_TRAINING.md: Cloud GPU recommendations and cost estimates

Modified:
- logo_detection_detr.py: Add fine-tuned model loading support
- pyproject.toml: Add peft, pyyaml, torchvision dependencies
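For orientation, here is a minimal sketch of the two core ingredients the commit message names: attaching LoRA adapters to CLIP's vision encoder via peft, and an InfoNCE contrastive objective like the one losses.py lists. The hyperparameters, target modules, and function signatures below are illustrative assumptions, not the repo's actual training/ code.

# Illustrative sketch only -- the real implementations live in
# training/model.py and training/losses.py; names and values are assumed.
import torch
import torch.nn.functional as F
from peft import LoraConfig, get_peft_model
from transformers import CLIPVisionModel

# Attach low-rank adapters to the attention projections of the vision
# encoder; the base weights stay frozen and only the adapters train.
vision = CLIPVisionModel.from_pretrained("openai/clip-vit-large-patch14")
lora_cfg = LoraConfig(r=16, lora_alpha=32, lora_dropout=0.1,
                      target_modules=["q_proj", "v_proj"])
vision = get_peft_model(vision, lora_cfg)
vision.print_trainable_parameters()  # adapters are a tiny fraction of CLIP

def info_nce(anchor: torch.Tensor, positive: torch.Tensor,
             temperature: float = 0.07) -> torch.Tensor:
    """InfoNCE over a batch of embedding pairs: row i of `positive` is the
    positive for row i of `anchor`; every other row acts as a negative."""
    anchor = F.normalize(anchor, dim=-1)
    positive = F.normalize(positive, dim=-1)
    logits = anchor @ positive.t() / temperature  # (B, B) cosine similarities
    targets = torch.arange(anchor.size(0), device=anchor.device)
    return F.cross_entropy(logits, targets)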
@@ -13,6 +13,7 @@ Supported embedding models:
 - DINOv2 models (facebook/dinov2-*): Self-supervised, excellent for visual similarity
 """

+import json
 import os
 import torch
 import torch.nn.functional as F
@@ -100,16 +101,20 @@ class DetectLogosDETR:
             embedding_model, default_embedding_dir, "Embedding"
         )

-        # Detect model type and initialize accordingly
-        self.model_type = self._detect_model_type(embedding_model)
-        self.logger.info(f"Loading {self.model_type} embedding model: {embedding_model_path}")
+        # Check if this is a fine-tuned model
+        if self._is_finetuned_model(embedding_model_path):
+            self._load_finetuned_embedding_model(embedding_model_path)
+        else:
+            # Detect model type and initialize accordingly
+            self.model_type = self._detect_model_type(embedding_model)
+            self.logger.info(f"Loading {self.model_type} embedding model: {embedding_model_path}")

-        if self.model_type == "clip":
-            self.embedding_model = CLIPModel.from_pretrained(embedding_model_path).to(self.device)
-            self.embedding_processor = CLIPProcessor.from_pretrained(embedding_model_path)
-        else:  # dinov2 or other transformer models
-            self.embedding_model = AutoModel.from_pretrained(embedding_model_path).to(self.device)
-            self.embedding_processor = AutoImageProcessor.from_pretrained(embedding_model_path)
+            if self.model_type == "clip":
+                self.embedding_model = CLIPModel.from_pretrained(embedding_model_path).to(self.device)
+                self.embedding_processor = CLIPProcessor.from_pretrained(embedding_model_path)
+            else:  # dinov2 or other transformer models
+                self.embedding_model = AutoModel.from_pretrained(embedding_model_path).to(self.device)
+                self.embedding_processor = AutoImageProcessor.from_pretrained(embedding_model_path)

         self.logger.info("DetectLogosDETR initialization complete")
@@ -124,6 +129,62 @@ class DetectLogosDETR:
         # Default to generic transformer for unknown models
         return "transformer"

+    def _is_finetuned_model(self, model_path: str) -> bool:
+        """Check if a model path points to a fine-tuned CLIP model."""
+        config_path = Path(model_path) / "config.json"
+        if config_path.exists():
+            try:
+                with open(config_path, "r") as f:
+                    config = json.load(f)
+                return config.get("model_type") == "clip_logo_finetuned"
+            except (json.JSONDecodeError, IOError):
+                pass
+        return False
+
+    def _load_finetuned_embedding_model(self, model_path: str) -> None:
+        """
+        Load a fine-tuned CLIP model from the training module.
+
+        Args:
+            model_path: Path to the fine-tuned model directory
+        """
+        # Import the fine-tuned model class
+        try:
+            from training.model import LogoFineTunedCLIP
+        except ImportError as e:
+            self.logger.error(
+                f"Cannot import training.model for fine-tuned model: {e}"
+            )
+            raise ImportError(
+                "Fine-tuned model requires the training module. "
+                "Ensure the training/ directory is in your Python path."
+            ) from e
+
+        # Load config
+        config_path = Path(model_path) / "config.json"
+        with open(config_path, "r") as f:
+            config = json.load(f)
+
+        base_model = config.get("base_model", "openai/clip-vit-large-patch14")
+
+        self.logger.info(f"Loading fine-tuned CLIP model from: {model_path}")
+        self.logger.info(f"  Base model: {base_model}")
+
+        # Load model using the from_pretrained method
+        self.embedding_model = LogoFineTunedCLIP.from_pretrained(
+            model_path,
+            base_model=base_model,
+            device=self.device,
+        )
+        self.embedding_model.eval()
+
+        # Load processor from base model
+        self.embedding_processor = CLIPProcessor.from_pretrained(base_model)
+
+        # Set model type for embedding extraction
+        self.model_type = "clip_finetuned"
+        self.logger.info("Fine-tuned CLIP model loaded successfully")
+
     def _resolve_model_path(
         self, model_name_or_path: str, default_local_dir: str, model_type: str
     ) -> str:
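The two helpers above define a small on-disk contract: a fine-tuned export directory carries a config.json whose model_type is "clip_logo_finetuned" and which records the base checkpoint. Presumably export_model.py writes this marker; the sketch below reconstructs it from the reader side shown in the diff, so the helper name and any extra keys are assumptions.

# Hedged illustration of the config.json marker that _is_finetuned_model()
# looks for; the real export likely stores more metadata (LoRA rank, etc.).
import json
from pathlib import Path

def write_marker_config(model_dir: str,
                        base_model: str = "openai/clip-vit-large-patch14") -> None:
    config = {
        "model_type": "clip_logo_finetuned",  # sentinel checked by _is_finetuned_model()
        "base_model": base_model,             # read back by _load_finetuned_embedding_model()
    }
    Path(model_dir).mkdir(parents=True, exist_ok=True)
    with open(Path(model_dir) / "config.json", "w") as f:
        json.dump(config, f, indent=2)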
@@ -345,7 +406,7 @@ class DetectLogosDETR:
         """
         Internal method to get embedding from PIL image.

-        Handles both CLIP and DINOv2 model types.
+        Handles CLIP, fine-tuned CLIP, and DINOv2 model types.

         Args:
             pil_image: PIL Image (RGB format)
@@ -360,6 +421,9 @@ class DetectLogosDETR:
         if self.model_type == "clip":
             # CLIP has a dedicated method for image features
             features = self.embedding_model.get_image_features(**inputs)
+        elif self.model_type == "clip_finetuned":
+            # Fine-tuned CLIP uses get_image_features or forward with pixel_values
+            features = self.embedding_model.get_image_features(**inputs)
         else:
             # DINOv2 and other transformers use the CLS token or pooled output
             outputs = self.embedding_model(**inputs)
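training/model.py is not part of this diff, so the exact behavior of LogoFineTunedCLIP.get_image_features() is an assumption; the normalization skip in the next hunk implies it returns unit-length embeddings. A plausible shape, assuming it wraps the base CLIP image features:

# Illustrative only -- this is NOT the repo's LogoFineTunedCLIP, just a
# guess at why callers can skip F.normalize() for "clip_finetuned".
import torch
import torch.nn.functional as F

class FineTunedCLIPSketch(torch.nn.Module):
    def __init__(self, clip_model: torch.nn.Module):
        super().__init__()
        self.clip = clip_model  # base CLIP with LoRA adapters attached

    def get_image_features(self, **inputs) -> torch.Tensor:
        feats = self.clip.get_image_features(**inputs)
        return F.normalize(feats, dim=-1)  # normalized here, not by the caller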
@@ -370,8 +434,9 @@ class DetectLogosDETR:
             # Use CLS token from last_hidden_state
             features = outputs.last_hidden_state[:, 0, :]

-        # Normalize for cosine similarity
-        features = F.normalize(features, dim=-1)
+        # Normalize for cosine similarity (fine-tuned model already normalizes)
+        if self.model_type != "clip_finetuned":
+            features = F.normalize(features, dim=-1)

         return features
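Putting it together, loading a fine-tuned export is intended to be transparent to callers. A hypothetical usage follows; the constructor's full signature is not shown in this diff, so the embedding_model keyword and the local path are assumptions inferred from the hunks above:

from logo_detection_detr import DetectLogosDETR

# Point the detector at a fine-tuned export directory instead of a hub ID.
# _is_finetuned_model() finds the config.json marker, the LoRA-adapted
# weights load via LogoFineTunedCLIP.from_pretrained(), and model_type
# becomes "clip_finetuned" (embeddings come back already normalized).
detector = DetectLogosDETR(embedding_model="models/clip-logo-finetuned")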