Add embedding model selection and comparison test scripts

- Update DetectLogosDETR to support both CLIP and DINOv2 models
  - Rename clip_model parameter to embedding_model
  - Add model type detection to select the appropriate embedding extraction path
  - DINOv2 uses CLS token, CLIP uses get_image_features()
- Add -e/--embedding-model argument to test_logo_detection.py
- Include model name in file output header
- Add run_threshold_tests.sh for testing various threshold/margin values
- Add run_model_comparison.sh for comparing CLIP vs DINOv2 models
Rick McEwen
2026-01-02 12:05:27 -05:00
parent a3008ee57f
commit 94db5bd40b
4 changed files with 312 additions and 30 deletions
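For context, a minimal usage sketch of the updated constructor (the import path and logger setup are assumptions; the class name, the embedding_model parameter, and the model identifiers come from the diff below):

import logging

# Hypothetical import path; adjust to wherever DetectLogosDETR lives in this repo.
from detect_logos_detr import DetectLogosDETR

logger = logging.getLogger("logo_detection")

# CLIP embeddings (default): openai/clip-vit-large-patch14 projects images to 768-d features.
detector = DetectLogosDETR(logger, embedding_model="openai/clip-vit-large-patch14")

# DINOv2 embeddings: facebook/dinov2-base yields a 768-d CLS-token feature.
detector = DetectLogosDETR(logger, embedding_model="facebook/dinov2-base")

Note that different embedding models produce vectors in different feature spaces, so any cached reference embeddings need to be regenerated after switching embedding_model.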


@@ -1,18 +1,22 @@
"""
Logo detection using DETR for object detection and CLIP for feature matching.
Logo detection using DETR for object detection and vision models for feature matching.
This module provides a class for detecting logos in images using:
1. DETR (DEtection TRansformer) for initial logo region detection
2. CLIP (Contrastive Language-Image Pre-training) for feature extraction and matching
2. Vision models (CLIP, DINOv2, etc.) for feature extraction and matching
The class supports caching of embeddings for efficient reprocessing.
The class automatically uses local models if available, otherwise falls back to HuggingFace.
Supported embedding models:
- CLIP models (openai/clip-vit-*): Text-image alignment, good general features
- DINOv2 models (facebook/dinov2-*): Self-supervised, excellent for visual similarity
"""
import os
import torch
import torch.nn.functional as F
from transformers import pipeline, CLIPProcessor, CLIPModel
from transformers import pipeline, CLIPProcessor, CLIPModel, AutoImageProcessor, AutoModel
from PIL import Image
import cv2
import numpy as np
@@ -22,28 +26,31 @@ from typing import List, Tuple, Dict, Optional, Any
class DetectLogosDETR:
"""
Logo detection class using DETR and CLIP models.
Logo detection class using DETR and vision embedding models.
This class detects logos in images by:
1. Using DETR to find potential logo regions (bounding boxes)
2. Extracting CLIP embeddings for each detected region
2. Extracting embeddings for each detected region (CLIP, DINOv2, etc.)
3. Comparing embeddings with reference logos for identification
The class automatically checks for local models before downloading from HuggingFace.
Supported embedding models:
- CLIP models (openai/clip-vit-*): Text-image alignment
- DINOv2 models (facebook/dinov2-*): Self-supervised visual features
"""
def __init__(
self,
logger,
detr_model: str = "Pravallika6/detr-finetuned-logo-detection_v2",
#clip_model: str = "openai/clip-vit-base-patch32",
clip_model: str = "openai/clip-vit-large-patch14",
embedding_model: str = "openai/clip-vit-large-patch14",
detr_threshold: float = 0.5,
min_box_size: int = 20,
nms_iou_threshold: float = 0.5,
):
"""
Initialize DETR and CLIP models.
Initialize DETR and embedding models.
The class will automatically check for local models in the default directories
before downloading from HuggingFace. You can override this by providing absolute
@@ -52,7 +59,7 @@ class DetectLogosDETR:
Args:
logger: Logger instance for logging
detr_model: HuggingFace model name or local path for DETR object detection
clip_model: HuggingFace model name or local path for CLIP embeddings
embedding_model: HuggingFace model name for embeddings (CLIP or DINOv2)
detr_threshold: Confidence threshold for DETR detections (0-1)
min_box_size: Minimum width/height in pixels for detected boxes (filters noise)
nms_iou_threshold: IoU threshold for Non-Maximum Suppression
@@ -61,6 +68,7 @@ class DetectLogosDETR:
self.detr_threshold = detr_threshold
self.min_box_size = min_box_size
self.nms_iou_threshold = nms_iou_threshold
self.embedding_model_name = embedding_model
# Set device
self.device_str = "cuda:0" if torch.cuda.is_available() else "cpu"
@@ -71,7 +79,7 @@ class DetectLogosDETR:
# Get default model directories from environment variables
default_detr_dir = os.environ.get('LOGO_DETR_MODEL_DIR', 'models/logo_detection/detr')
default_clip_dir = os.environ.get('LOGO_CLIP_MODEL_DIR', 'models/logo_detection/clip')
default_embedding_dir = os.environ.get('LOGO_EMBEDDING_MODEL_DIR', 'models/logo_detection/embedding')
# Resolve DETR model path (check local first, then use HuggingFace name)
detr_model_path = self._resolve_model_path(
@@ -87,18 +95,35 @@ class DetectLogosDETR:
use_fast=True,
)
# Resolve CLIP model path (check local first, then use HuggingFace name)
clip_model_path = self._resolve_model_path(
clip_model, default_clip_dir, "CLIP"
# Resolve embedding model path
embedding_model_path = self._resolve_model_path(
embedding_model, default_embedding_dir, "Embedding"
)
# Initialize CLIP model for feature extraction
self.logger.info(f"Loading CLIP model: {clip_model_path}")
self.clip_model = CLIPModel.from_pretrained(clip_model_path).to(self.device)
self.clip_processor = CLIPProcessor.from_pretrained(clip_model_path)
# Detect model type and initialize accordingly
self.model_type = self._detect_model_type(embedding_model)
self.logger.info(f"Loading {self.model_type} embedding model: {embedding_model_path}")
if self.model_type == "clip":
self.embedding_model = CLIPModel.from_pretrained(embedding_model_path).to(self.device)
self.embedding_processor = CLIPProcessor.from_pretrained(embedding_model_path)
else: # dinov2 or other transformer models
self.embedding_model = AutoModel.from_pretrained(embedding_model_path).to(self.device)
self.embedding_processor = AutoImageProcessor.from_pretrained(embedding_model_path)
self.logger.info("DetectLogosDETR initialization complete")
def _detect_model_type(self, model_name: str) -> str:
"""Detect the type of embedding model based on name."""
model_name_lower = model_name.lower()
if "clip" in model_name_lower:
return "clip"
elif "dino" in model_name_lower:
return "dinov2"
else:
# Default to generic transformer for unknown models
return "transformer"
def _resolve_model_path(
self, model_name_or_path: str, default_local_dir: str, model_type: str
) -> str:
@@ -193,8 +218,8 @@ class DetectLogosDETR:
# Extract bounding box region
bbox_crop = pil_image.crop((xmin, ymin, xmax, ymax))
# Get CLIP embedding for this region
embedding = self._get_clip_embedding_pil(bbox_crop)
# Get embedding for this region
embedding = self._get_embedding_pil(bbox_crop)
detections.append(
{
@@ -299,7 +324,7 @@ class DetectLogosDETR:
def get_embedding(self, image: np.ndarray) -> torch.Tensor:
"""
Get CLIP embedding for a reference logo image.
Get embedding for a reference logo image.
This method is used to compute embeddings for reference logos
that will be compared against detected regions.
@@ -308,29 +333,43 @@ class DetectLogosDETR:
image: OpenCV image (BGR format, numpy array)
Returns:
Normalized CLIP feature embedding (torch.Tensor, shape: [1, 512])
Normalized feature embedding (torch.Tensor)
"""
# Convert OpenCV BGR to RGB PIL Image
image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
pil_image = Image.fromarray(image_rgb)
return self._get_clip_embedding_pil(pil_image)
return self._get_embedding_pil(pil_image)
def _get_clip_embedding_pil(self, pil_image: Image.Image) -> torch.Tensor:
def _get_embedding_pil(self, pil_image: Image.Image) -> torch.Tensor:
"""
Internal method to get CLIP embedding from PIL image.
Internal method to get embedding from PIL image.
Handles both CLIP and DINOv2 model types.
Args:
pil_image: PIL Image (RGB format)
Returns:
Normalized CLIP feature embedding (torch.Tensor)
Normalized feature embedding (torch.Tensor)
"""
# Process image through CLIP
inputs = self.clip_processor(images=pil_image, return_tensors="pt").to(self.device)
# Process image through the embedding model
inputs = self.embedding_processor(images=pil_image, return_tensors="pt").to(self.device)
with torch.no_grad():
features = self.clip_model.get_image_features(**inputs)
if self.model_type == "clip":
# CLIP has a dedicated method for image features
features = self.embedding_model.get_image_features(**inputs)
else:
# DINOv2 and other transformers use the CLS token or pooled output
outputs = self.embedding_model(**inputs)
# Use the CLS token (first token) from last hidden state
if hasattr(outputs, 'pooler_output') and outputs.pooler_output is not None:
features = outputs.pooler_output
else:
# Use CLS token from last_hidden_state
features = outputs.last_hidden_state[:, 0, :]
# Normalize for cosine similarity
features = F.normalize(features, dim=-1)
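Because the features are L2-normalized here, the downstream comparison against reference logo embeddings (step 3 in the class docstring, outside this diff) reduces to a dot product. A minimal sketch, assuming BGR numpy-array inputs as expected by get_embedding and illustrative variable names:

# Hypothetical matching step built on the public get_embedding() method above.
ref_emb = detector.get_embedding(reference_logo_bgr)   # shape [1, D], L2-normalized
det_emb = detector.get_embedding(detected_crop_bgr)    # shape [1, D], L2-normalized
cosine_sim = float(ref_emb @ det_emb.T)                # dot product equals cosine similarity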