Add embedding model selection and comparison test scripts
- Update DetectLogosDETR to support both CLIP and DINOv2 models
- Rename clip_model parameter to embedding_model
- Add model type detection for different embedding extraction
- DINOv2 uses CLS token, CLIP uses get_image_features()
- Add -e/--embedding-model argument to test_logo_detection.py
- Include model name in file output header
- Add run_threshold_tests.sh for testing various threshold/margin values
- Add run_model_comparison.sh for comparing CLIP vs DINOv2 models
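For context, a minimal usage sketch of the new `embedding_model` parameter. The module path in the import and the concrete DINOv2 checkpoint are assumptions for illustration; only the `DetectLogosDETR` class name, the `embedding_model` keyword, and `get_embedding()` come from this diff.

```python
import logging

import cv2

from detect_logos_detr import DetectLogosDETR  # hypothetical module path

logger = logging.getLogger("logo_detection")
reference = cv2.imread("reference_logo.png")  # BGR image, as expected by get_embedding()

# Compare the two embedding backbones on the same reference logo.
for model_name in ("openai/clip-vit-large-patch14", "facebook/dinov2-base"):
    detector = DetectLogosDETR(logger, embedding_model=model_name)
    emb = detector.get_embedding(reference)  # L2-normalized torch.Tensor
    logger.info("%s -> embedding dim %d", model_name, emb.shape[-1])
```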
@@ -1,18 +1,22 @@
 """
-Logo detection using DETR for object detection and CLIP for feature matching.
+Logo detection using DETR for object detection and vision models for feature matching.
 
 This module provides a class for detecting logos in images using:
 1. DETR (DEtection TRansformer) for initial logo region detection
-2. CLIP (Contrastive Language-Image Pre-training) for feature extraction and matching
+2. Vision models (CLIP, DINOv2, etc.) for feature extraction and matching
 
 The class supports caching of embeddings for efficient reprocessing.
 The class automatically uses local models if available, otherwise falls back to HuggingFace.
+
+Supported embedding models:
+- CLIP models (openai/clip-vit-*): Text-image alignment, good general features
+- DINOv2 models (facebook/dinov2-*): Self-supervised, excellent for visual similarity
 """
 
 import os
 import torch
 import torch.nn.functional as F
-from transformers import pipeline, CLIPProcessor, CLIPModel
+from transformers import pipeline, CLIPProcessor, CLIPModel, AutoImageProcessor, AutoModel
 from PIL import Image
 import cv2
 import numpy as np
@@ -22,28 +26,31 @@ from typing import List, Tuple, Dict, Optional, Any
 
 class DetectLogosDETR:
     """
-    Logo detection class using DETR and CLIP models.
+    Logo detection class using DETR and vision embedding models.
 
     This class detects logos in images by:
     1. Using DETR to find potential logo regions (bounding boxes)
-    2. Extracting CLIP embeddings for each detected region
+    2. Extracting embeddings for each detected region (CLIP, DINOv2, etc.)
     3. Comparing embeddings with reference logos for identification
 
     The class automatically checks for local models before downloading from HuggingFace.
+
+    Supported embedding models:
+    - CLIP models (openai/clip-vit-*): Text-image alignment
+    - DINOv2 models (facebook/dinov2-*): Self-supervised visual features
     """
 
     def __init__(
         self,
         logger,
         detr_model: str = "Pravallika6/detr-finetuned-logo-detection_v2",
-        #clip_model: str = "openai/clip-vit-base-patch32",
-        clip_model: str = "openai/clip-vit-large-patch14",
+        embedding_model: str = "openai/clip-vit-large-patch14",
         detr_threshold: float = 0.5,
         min_box_size: int = 20,
         nms_iou_threshold: float = 0.5,
     ):
         """
-        Initialize DETR and CLIP models.
+        Initialize DETR and embedding models.
 
         The class will automatically check for local models in the default directories
         before downloading from HuggingFace. You can override this by providing absolute
@@ -52,7 +59,7 @@ class DetectLogosDETR:
         Args:
             logger: Logger instance for logging
             detr_model: HuggingFace model name or local path for DETR object detection
-            clip_model: HuggingFace model name or local path for CLIP embeddings
+            embedding_model: HuggingFace model name for embeddings (CLIP or DINOv2)
            detr_threshold: Confidence threshold for DETR detections (0-1)
            min_box_size: Minimum width/height in pixels for detected boxes (filters noise)
            nms_iou_threshold: IoU threshold for Non-Maximum Suppression
@@ -61,6 +68,7 @@ class DetectLogosDETR:
         self.detr_threshold = detr_threshold
         self.min_box_size = min_box_size
         self.nms_iou_threshold = nms_iou_threshold
+        self.embedding_model_name = embedding_model
 
         # Set device
         self.device_str = "cuda:0" if torch.cuda.is_available() else "cpu"
@@ -71,7 +79,7 @@ class DetectLogosDETR:
 
         # Get default model directories from environment variables
         default_detr_dir = os.environ.get('LOGO_DETR_MODEL_DIR', 'models/logo_detection/detr')
-        default_clip_dir = os.environ.get('LOGO_CLIP_MODEL_DIR', 'models/logo_detection/clip')
+        default_embedding_dir = os.environ.get('LOGO_EMBEDDING_MODEL_DIR', 'models/logo_detection/embedding')
 
         # Resolve DETR model path (check local first, then use HuggingFace name)
         detr_model_path = self._resolve_model_path(
@@ -87,18 +95,35 @@
             use_fast=True,
         )
 
-        # Resolve CLIP model path (check local first, then use HuggingFace name)
-        clip_model_path = self._resolve_model_path(
-            clip_model, default_clip_dir, "CLIP"
+        # Resolve embedding model path
+        embedding_model_path = self._resolve_model_path(
+            embedding_model, default_embedding_dir, "Embedding"
         )
 
-        # Initialize CLIP model for feature extraction
-        self.logger.info(f"Loading CLIP model: {clip_model_path}")
-        self.clip_model = CLIPModel.from_pretrained(clip_model_path).to(self.device)
-        self.clip_processor = CLIPProcessor.from_pretrained(clip_model_path)
+        # Detect model type and initialize accordingly
+        self.model_type = self._detect_model_type(embedding_model)
+        self.logger.info(f"Loading {self.model_type} embedding model: {embedding_model_path}")
+
+        if self.model_type == "clip":
+            self.embedding_model = CLIPModel.from_pretrained(embedding_model_path).to(self.device)
+            self.embedding_processor = CLIPProcessor.from_pretrained(embedding_model_path)
+        else:  # dinov2 or other transformer models
+            self.embedding_model = AutoModel.from_pretrained(embedding_model_path).to(self.device)
+            self.embedding_processor = AutoImageProcessor.from_pretrained(embedding_model_path)
 
         self.logger.info("DetectLogosDETR initialization complete")
 
+    def _detect_model_type(self, model_name: str) -> str:
+        """Detect the type of embedding model based on name."""
+        model_name_lower = model_name.lower()
+        if "clip" in model_name_lower:
+            return "clip"
+        elif "dino" in model_name_lower:
+            return "dinov2"
+        else:
+            # Default to generic transformer for unknown models
+            return "transformer"
+
     def _resolve_model_path(
         self, model_name_or_path: str, default_local_dir: str, model_type: str
     ) -> str:
@@ -193,8 +218,8 @@ class DetectLogosDETR:
             # Extract bounding box region
             bbox_crop = pil_image.crop((xmin, ymin, xmax, ymax))
 
-            # Get CLIP embedding for this region
-            embedding = self._get_clip_embedding_pil(bbox_crop)
+            # Get embedding for this region
+            embedding = self._get_embedding_pil(bbox_crop)
 
             detections.append(
                 {
@@ -299,7 +324,7 @@ class DetectLogosDETR:
 
     def get_embedding(self, image: np.ndarray) -> torch.Tensor:
         """
-        Get CLIP embedding for a reference logo image.
+        Get embedding for a reference logo image.
 
         This method is used to compute embeddings for reference logos
         that will be compared against detected regions.
@@ -308,29 +333,43 @@ class DetectLogosDETR:
             image: OpenCV image (BGR format, numpy array)
 
         Returns:
-            Normalized CLIP feature embedding (torch.Tensor, shape: [1, 512])
+            Normalized feature embedding (torch.Tensor)
         """
         # Convert OpenCV BGR to RGB PIL Image
         image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
         pil_image = Image.fromarray(image_rgb)
 
-        return self._get_clip_embedding_pil(pil_image)
+        return self._get_embedding_pil(pil_image)
 
-    def _get_clip_embedding_pil(self, pil_image: Image.Image) -> torch.Tensor:
+    def _get_embedding_pil(self, pil_image: Image.Image) -> torch.Tensor:
         """
-        Internal method to get CLIP embedding from PIL image.
+        Internal method to get embedding from PIL image.
+
+        Handles both CLIP and DINOv2 model types.
 
         Args:
             pil_image: PIL Image (RGB format)
 
         Returns:
-            Normalized CLIP feature embedding (torch.Tensor)
+            Normalized feature embedding (torch.Tensor)
         """
-        # Process image through CLIP
-        inputs = self.clip_processor(images=pil_image, return_tensors="pt").to(self.device)
+        # Process image through the embedding model
+        inputs = self.embedding_processor(images=pil_image, return_tensors="pt").to(self.device)
 
         with torch.no_grad():
-            features = self.clip_model.get_image_features(**inputs)
+            if self.model_type == "clip":
+                # CLIP has a dedicated method for image features
+                features = self.embedding_model.get_image_features(**inputs)
+            else:
+                # DINOv2 and other transformers use the CLS token or pooled output
+                outputs = self.embedding_model(**inputs)
+                # Use the CLS token (first token) from last hidden state
+                if hasattr(outputs, 'pooler_output') and outputs.pooler_output is not None:
+                    features = outputs.pooler_output
+                else:
+                    # Use CLS token from last_hidden_state
+                    features = outputs.last_hidden_state[:, 0, :]
 
         # Normalize for cosine similarity
         features = F.normalize(features, dim=-1)
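The embeddings above are L2-normalized so they can be compared by cosine similarity, but the matching step itself is not part of this diff. Below is a minimal sketch of how the normalized region and reference embeddings could be compared; the function name and the `threshold`/`margin` semantics are assumptions loosely mirroring the values exercised by run_threshold_tests.sh, not code from this commit.

```python
from typing import Optional

import torch


def match_logo(region_emb: torch.Tensor,
               reference_embs: torch.Tensor,
               threshold: float = 0.6,
               margin: float = 0.05) -> Optional[int]:
    """Return the index of the best-matching reference logo, or None.

    Both inputs are assumed to be L2-normalized (as produced by
    get_embedding / _get_embedding_pil), so a dot product equals cosine
    similarity.  region_emb: [1, dim], reference_embs: [N, dim].
    """
    sims = reference_embs @ region_emb.squeeze(0)  # cosine similarities, shape [N]
    best = int(torch.argmax(sims))
    runner_up = torch.topk(sims, k=2).values[-1] if sims.numel() > 1 else torch.tensor(-1.0)
    # Accept only confident matches that beat the runner-up by a margin.
    if sims[best] >= threshold and (sims[best] - runner_up) >= margin:
        return best
    return None
```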