Add embedding model selection and comparison test scripts

- Update DetectLogosDETR to support both CLIP and DINOv2 models
  - Rename clip_model parameter to embedding_model
  - Add model type detection to select the appropriate embedding extraction path
  - DINOv2 uses CLS token, CLIP uses get_image_features()
- Add -e/--embedding-model argument to test_logo_detection.py
- Include model name in file output header
- Add run_threshold_tests.sh for testing various threshold/margin values
- Add run_model_comparison.sh for comparing CLIP vs DINOv2 models
Rick McEwen
2026-01-02 12:05:27 -05:00
parent a3008ee57f
commit 94db5bd40b
4 changed files with 312 additions and 30 deletions
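For context, a minimal usage sketch of the updated constructor (the import path and logger setup are assumptions; the class name, the embedding_model parameter, and the model identifiers come from the diff below):

import logging

# Hypothetical import path; adjust to wherever DetectLogosDETR lives in this repo.
from detect_logos_detr import DetectLogosDETR

logger = logging.getLogger("logo_detection")

# CLIP embeddings (default): openai/clip-vit-large-patch14 projects images to 768-d features.
detector = DetectLogosDETR(logger, embedding_model="openai/clip-vit-large-patch14")

# DINOv2 embeddings: facebook/dinov2-base yields a 768-d CLS-token feature.
detector = DetectLogosDETR(logger, embedding_model="facebook/dinov2-base")

Note that different embedding models produce vectors in different feature spaces, so any cached reference embeddings need to be regenerated after switching embedding_model.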


@@ -1,18 +1,22 @@
"""
Logo detection using DETR for object detection and CLIP for feature matching.
Logo detection using DETR for object detection and vision models for feature matching.
This module provides a class for detecting logos in images using:
1. DETR (DEtection TRansformer) for initial logo region detection
2. CLIP (Contrastive Language-Image Pre-training) for feature extraction and matching
2. Vision models (CLIP, DINOv2, etc.) for feature extraction and matching
The class supports caching of embeddings for efficient reprocessing.
The class automatically uses local models if available, otherwise falls back to HuggingFace.
Supported embedding models:
- CLIP models (openai/clip-vit-*): Text-image alignment, good general features
- DINOv2 models (facebook/dinov2-*): Self-supervised, excellent for visual similarity
"""
import os
import torch
import torch.nn.functional as F
from transformers import pipeline, CLIPProcessor, CLIPModel
from transformers import pipeline, CLIPProcessor, CLIPModel, AutoImageProcessor, AutoModel
from PIL import Image
import cv2
import numpy as np
@@ -22,28 +26,31 @@ from typing import List, Tuple, Dict, Optional, Any
class DetectLogosDETR:
"""
Logo detection class using DETR and CLIP models.
Logo detection class using DETR and vision embedding models.
This class detects logos in images by:
1. Using DETR to find potential logo regions (bounding boxes)
2. Extracting CLIP embeddings for each detected region
2. Extracting embeddings for each detected region (CLIP, DINOv2, etc.)
3. Comparing embeddings with reference logos for identification
The class automatically checks for local models before downloading from HuggingFace.
Supported embedding models:
- CLIP models (openai/clip-vit-*): Text-image alignment
- DINOv2 models (facebook/dinov2-*): Self-supervised visual features
"""
def __init__(
self,
logger,
detr_model: str = "Pravallika6/detr-finetuned-logo-detection_v2",
#clip_model: str = "openai/clip-vit-base-patch32",
clip_model: str = "openai/clip-vit-large-patch14",
embedding_model: str = "openai/clip-vit-large-patch14",
detr_threshold: float = 0.5,
min_box_size: int = 20,
nms_iou_threshold: float = 0.5,
):
"""
Initialize DETR and CLIP models.
Initialize DETR and embedding models.
The class will automatically check for local models in the default directories
before downloading from HuggingFace. You can override this by providing absolute
@@ -52,7 +59,7 @@ class DetectLogosDETR:
Args:
logger: Logger instance for logging
detr_model: HuggingFace model name or local path for DETR object detection
clip_model: HuggingFace model name or local path for CLIP embeddings
embedding_model: HuggingFace model name for embeddings (CLIP or DINOv2)
detr_threshold: Confidence threshold for DETR detections (0-1)
min_box_size: Minimum width/height in pixels for detected boxes (filters noise)
nms_iou_threshold: IoU threshold for Non-Maximum Suppression
@@ -61,6 +68,7 @@ class DetectLogosDETR:
self.detr_threshold = detr_threshold
self.min_box_size = min_box_size
self.nms_iou_threshold = nms_iou_threshold
self.embedding_model_name = embedding_model
# Set device
self.device_str = "cuda:0" if torch.cuda.is_available() else "cpu"
@@ -71,7 +79,7 @@ class DetectLogosDETR:
# Get default model directories from environment variables
default_detr_dir = os.environ.get('LOGO_DETR_MODEL_DIR', 'models/logo_detection/detr')
default_clip_dir = os.environ.get('LOGO_CLIP_MODEL_DIR', 'models/logo_detection/clip')
default_embedding_dir = os.environ.get('LOGO_EMBEDDING_MODEL_DIR', 'models/logo_detection/embedding')
# Resolve DETR model path (check local first, then use HuggingFace name)
detr_model_path = self._resolve_model_path(
@@ -87,18 +95,35 @@ class DetectLogosDETR:
use_fast=True,
)
# Resolve CLIP model path (check local first, then use HuggingFace name)
clip_model_path = self._resolve_model_path(
clip_model, default_clip_dir, "CLIP"
# Resolve embedding model path
embedding_model_path = self._resolve_model_path(
embedding_model, default_embedding_dir, "Embedding"
)
# Initialize CLIP model for feature extraction
self.logger.info(f"Loading CLIP model: {clip_model_path}")
self.clip_model = CLIPModel.from_pretrained(clip_model_path).to(self.device)
self.clip_processor = CLIPProcessor.from_pretrained(clip_model_path)
# Detect model type and initialize accordingly
self.model_type = self._detect_model_type(embedding_model)
self.logger.info(f"Loading {self.model_type} embedding model: {embedding_model_path}")
if self.model_type == "clip":
self.embedding_model = CLIPModel.from_pretrained(embedding_model_path).to(self.device)
self.embedding_processor = CLIPProcessor.from_pretrained(embedding_model_path)
else: # dinov2 or other transformer models
self.embedding_model = AutoModel.from_pretrained(embedding_model_path).to(self.device)
self.embedding_processor = AutoImageProcessor.from_pretrained(embedding_model_path)
self.logger.info("DetectLogosDETR initialization complete")
def _detect_model_type(self, model_name: str) -> str:
"""Detect the type of embedding model based on name."""
model_name_lower = model_name.lower()
if "clip" in model_name_lower:
return "clip"
elif "dino" in model_name_lower:
return "dinov2"
else:
# Default to generic transformer for unknown models
return "transformer"
def _resolve_model_path(
self, model_name_or_path: str, default_local_dir: str, model_type: str
) -> str:
@@ -193,8 +218,8 @@ class DetectLogosDETR:
# Extract bounding box region
bbox_crop = pil_image.crop((xmin, ymin, xmax, ymax))
# Get CLIP embedding for this region
embedding = self._get_clip_embedding_pil(bbox_crop)
# Get embedding for this region
embedding = self._get_embedding_pil(bbox_crop)
detections.append(
{
@@ -299,7 +324,7 @@ class DetectLogosDETR:
def get_embedding(self, image: np.ndarray) -> torch.Tensor:
"""
Get CLIP embedding for a reference logo image.
Get embedding for a reference logo image.
This method is used to compute embeddings for reference logos
that will be compared against detected regions.
@@ -308,29 +333,43 @@ class DetectLogosDETR:
image: OpenCV image (BGR format, numpy array)
Returns:
Normalized CLIP feature embedding (torch.Tensor, shape: [1, 512])
Normalized feature embedding (torch.Tensor)
"""
# Convert OpenCV BGR to RGB PIL Image
image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
pil_image = Image.fromarray(image_rgb)
return self._get_clip_embedding_pil(pil_image)
return self._get_embedding_pil(pil_image)
def _get_clip_embedding_pil(self, pil_image: Image.Image) -> torch.Tensor:
def _get_embedding_pil(self, pil_image: Image.Image) -> torch.Tensor:
"""
Internal method to get CLIP embedding from PIL image.
Internal method to get embedding from PIL image.
Handles both CLIP and DINOv2 model types.
Args:
pil_image: PIL Image (RGB format)
Returns:
Normalized CLIP feature embedding (torch.Tensor)
Normalized feature embedding (torch.Tensor)
"""
# Process image through CLIP
inputs = self.clip_processor(images=pil_image, return_tensors="pt").to(self.device)
# Process image through the embedding model
inputs = self.embedding_processor(images=pil_image, return_tensors="pt").to(self.device)
with torch.no_grad():
features = self.clip_model.get_image_features(**inputs)
if self.model_type == "clip":
# CLIP has a dedicated method for image features
features = self.embedding_model.get_image_features(**inputs)
else:
# DINOv2 and other transformers use the CLS token or pooled output
outputs = self.embedding_model(**inputs)
# Use the CLS token (first token) from last hidden state
if hasattr(outputs, 'pooler_output') and outputs.pooler_output is not None:
features = outputs.pooler_output
else:
# Use CLS token from last_hidden_state
features = outputs.last_hidden_state[:, 0, :]
# Normalize for cosine similarity
features = F.normalize(features, dim=-1)
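Because the features are L2-normalized here, the downstream comparison against reference logo embeddings (step 3 in the class docstring, outside this diff) reduces to a dot product. A minimal sketch, assuming BGR numpy-array inputs as expected by get_embedding and illustrative variable names:

# Hypothetical matching step built on the public get_embedding() method above.
ref_emb = detector.get_embedding(reference_logo_bgr)   # shape [1, D], L2-normalized
det_emb = detector.get_embedding(detected_crop_bgr)    # shape [1, D], L2-normalized
cosine_sim = float(ref_emb @ det_emb.T)                # dot product equals cosine similarity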