Initial commit: Logo detection test framework
Add DETR+CLIP based logo detection library and test framework:

- DetectLogosDETR class for logo detection and matching
- Test script with margin-based and multi-ref matching methods
- Data preparation script for test database
- Documentation for API usage and test methodology
logo_detection_detr.py (new file, 556 lines)
@@ -0,0 +1,556 @@
"""
Logo detection using DETR for object detection and CLIP for feature matching.

This module provides a class for detecting logos in images using:
1. DETR (DEtection TRansformer) for initial logo region detection
2. CLIP (Contrastive Language-Image Pre-training) for feature extraction and matching

The class supports caching of embeddings for efficient reprocessing, and it
automatically uses local models if available, falling back to HuggingFace otherwise.
"""

import os
from typing import List, Tuple, Dict, Optional, Any

import cv2
import numpy as np
import torch
import torch.nn.functional as F
from PIL import Image
from transformers import pipeline, CLIPProcessor, CLIPModel


class DetectLogosDETR:
    """
    Logo detection class using DETR and CLIP models.

    This class detects logos in images by:
    1. Using DETR to find potential logo regions (bounding boxes)
    2. Extracting CLIP embeddings for each detected region
    3. Comparing embeddings with reference logos for identification

    The class automatically checks for local models before downloading from HuggingFace.
    """

    def __init__(
        self,
        logger,
        detr_model: str = "Pravallika6/detr-finetuned-logo-detection_v2",
        # Smaller/faster alternative: "openai/clip-vit-base-patch32"
        clip_model: str = "openai/clip-vit-large-patch14",
        detr_threshold: float = 0.5,
        min_box_size: int = 20,
        nms_iou_threshold: float = 0.5,
    ):
        """
        Initialize DETR and CLIP models.

        The class will automatically check for local models in the default directories
        before downloading from HuggingFace. You can override this by providing absolute
        paths to local models.

        Args:
            logger: Logger instance for logging
            detr_model: HuggingFace model name or local path for DETR object detection
            clip_model: HuggingFace model name or local path for CLIP embeddings
            detr_threshold: Confidence threshold for DETR detections (0-1)
            min_box_size: Minimum width/height in pixels for detected boxes (filters noise)
            nms_iou_threshold: IoU threshold for Non-Maximum Suppression
        """
        self.logger = logger
        self.detr_threshold = detr_threshold
        self.min_box_size = min_box_size
        self.nms_iou_threshold = nms_iou_threshold

        # Set device (pipeline() takes an index, torch takes a device object)
        self.device_str = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.device_index = 0 if torch.cuda.is_available() else -1
        self.device = torch.device(self.device_str)

        self.logger.info(f"Initializing DetectLogosDETR on device: {self.device_str}")

        # Get default model directories from environment variables
        default_detr_dir = os.environ.get('LOGO_DETR_MODEL_DIR', 'models/logo_detection/detr')
        default_clip_dir = os.environ.get('LOGO_CLIP_MODEL_DIR', 'models/logo_detection/clip')

        # Resolve DETR model path (check local first, then use HuggingFace name)
        detr_model_path = self._resolve_model_path(
            detr_model, default_detr_dir, "DETR"
        )

        # Initialize DETR pipeline for logo detection
        self.logger.info(f"Loading DETR model: {detr_model_path}")
        self.detr_pipe = pipeline(
            task="object-detection",
            model=detr_model_path,
            device=self.device_index,
            use_fast=True,
        )

        # Resolve CLIP model path (check local first, then use HuggingFace name)
        clip_model_path = self._resolve_model_path(
            clip_model, default_clip_dir, "CLIP"
        )

        # Initialize CLIP model for feature extraction
        self.logger.info(f"Loading CLIP model: {clip_model_path}")
        self.clip_model = CLIPModel.from_pretrained(clip_model_path).to(self.device)
        self.clip_processor = CLIPProcessor.from_pretrained(clip_model_path)

        self.logger.info("DetectLogosDETR initialization complete")
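
    # Usage sketch (illustrative): construction with a standard `logging`
    # logger. The environment variables are the ones read above; everything
    # else here is an assumption for the example only.
    #
    #   import logging
    #   logging.basicConfig(level=logging.INFO)
    #   detector = DetectLogosDETR(logging.getLogger("logo-detect"))
    #
    # Local model copies are picked up automatically when LOGO_DETR_MODEL_DIR
    # and LOGO_CLIP_MODEL_DIR point at directories containing a config.json.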

    def _resolve_model_path(
        self, model_name_or_path: str, default_local_dir: str, model_type: str
    ) -> str:
        """
        Resolve model path, checking for local models before using HuggingFace.

        Args:
            model_name_or_path: HuggingFace model name or absolute path
            default_local_dir: Default local directory to check
            model_type: Type of model (for logging, e.g., "DETR" or "CLIP")

        Returns:
            Resolved model path (local path or HuggingFace model name)
        """
        # If it's an absolute path, use it directly
        if os.path.isabs(model_name_or_path):
            if os.path.exists(model_name_or_path):
                self.logger.info(
                    f"{model_type} model: Using local model at {model_name_or_path}"
                )
            else:
                self.logger.warning(
                    f"{model_type} model: Local path {model_name_or_path} does not exist; "
                    f"loading from it will fail"
                )
            return model_name_or_path

        # Check if the default local directory exists
        if os.path.exists(default_local_dir):
            # Verify it's a valid model directory (has config.json)
            config_file = os.path.join(default_local_dir, "config.json")
            if os.path.exists(config_file):
                abs_path = os.path.abspath(default_local_dir)
                self.logger.info(
                    f"{model_type} model: Found local model at {abs_path}"
                )
                return abs_path
            else:
                self.logger.warning(
                    f"{model_type} model: Local directory {default_local_dir} exists but "
                    f"is not a valid model (missing config.json)"
                )

        # Use HuggingFace model name
        self.logger.info(
            f"{model_type} model: No local model found, will download from HuggingFace: "
            f"{model_name_or_path}"
        )
        return model_name_or_path
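
    # Resolution sketch (illustrative) for the default CLIP argument:
    #   - absolute path               -> returned verbatim (warning if missing)
    #   - models/logo_detection/clip containing a config.json
    #                                 -> returned as an absolute path
    #   - otherwise                   -> "openai/clip-vit-large-patch14",
    #                                    downloaded from HuggingFace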

    def detect(self, image: np.ndarray) -> List[Dict[str, Any]]:
        """
        Detect logos in an image and return bounding boxes with CLIP embeddings.

        Args:
            image: OpenCV image (BGR format, numpy array)

        Returns:
            List of dictionaries, each containing:
            - 'box': dict with 'xmin', 'ymin', 'xmax', 'ymax' (pixel coordinates)
            - 'score': DETR confidence score (float 0-1)
            - 'embedding': CLIP feature embedding (torch.Tensor)
            - 'label': DETR predicted label (string)
        """
        # Convert OpenCV BGR to RGB PIL Image
        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        pil_image = Image.fromarray(image_rgb)

        # Run DETR detection
        predictions = self.detr_pipe(pil_image)

        # Filter by threshold and size, then add CLIP embeddings
        detections = []
        for pred in predictions:
            score = pred.get("score", 0.0)
            if score < self.detr_threshold:
                continue

            box = pred.get("box", {})
            xmin = box.get("xmin", 0)
            ymin = box.get("ymin", 0)
            xmax = box.get("xmax", 0)
            ymax = box.get("ymax", 0)

            # Filter out boxes smaller than min_box_size in either dimension
            box_width = xmax - xmin
            box_height = ymax - ymin
            if box_width < self.min_box_size or box_height < self.min_box_size:
                continue

            # Extract the bounding box region
            bbox_crop = pil_image.crop((xmin, ymin, xmax, ymax))

            # Get the CLIP embedding for this region
            embedding = self._get_clip_embedding_pil(bbox_crop)

            detections.append(
                {
                    "box": {"xmin": xmin, "ymin": ymin, "xmax": xmax, "ymax": ymax},
                    "score": score,
                    "embedding": embedding,
                    "label": pred.get("label", "logo"),
                }
            )

        # Apply Non-Maximum Suppression to remove overlapping detections
        detections = self._apply_nms(detections, self.nms_iou_threshold)

        self.logger.debug(f"Detected {len(detections)} logos (threshold: {self.detr_threshold})")
        return detections
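
    # Post-processing sketch (illustrative): drawing the returned boxes with
    # OpenCV onto the same BGR frame passed to detect(); `frame` and
    # `detector` are assumed from the usage sketch above.
    #
    #   for det in detector.detect(frame):
    #       b = det["box"]
    #       cv2.rectangle(frame, (int(b["xmin"]), int(b["ymin"])),
    #                     (int(b["xmax"]), int(b["ymax"])), (0, 255, 0), 2)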

    def _apply_nms(self, predictions: List[Dict], iou_threshold: float) -> List[Dict]:
        """
        Apply Non-Maximum Suppression to remove overlapping detections.

        Args:
            predictions: List of prediction dictionaries with 'box' and 'score'
            iou_threshold: IoU threshold for considering boxes as overlapping

        Returns:
            Filtered list of predictions after NMS
        """
        if len(predictions) == 0:
            return []

        # Extract boxes and scores
        boxes = []
        scores = []
        for pred in predictions:
            box = pred.get("box", {})
            boxes.append([
                box.get("xmin", 0),
                box.get("ymin", 0),
                box.get("xmax", 0),
                box.get("ymax", 0),
            ])
            scores.append(pred.get("score", 0.0))

        # Convert to numpy arrays
        boxes = np.array(boxes, dtype=np.float32)
        scores = np.array(scores, dtype=np.float32)

        # Sort by score (descending)
        sorted_indices = np.argsort(scores)[::-1]

        keep_indices = []
        while len(sorted_indices) > 0:
            # Keep the box with the highest remaining score
            current_idx = sorted_indices[0]
            keep_indices.append(current_idx)

            if len(sorted_indices) == 1:
                break

            # Calculate IoU between the kept box and all remaining boxes
            current_box = boxes[current_idx]
            remaining_boxes = boxes[sorted_indices[1:]]

            ious = self._calculate_iou_batch(current_box, remaining_boxes)

            # Keep only boxes with IoU below the threshold
            mask = ious < iou_threshold
            sorted_indices = sorted_indices[1:][mask]

        # Return predictions for kept indices
        return [predictions[i] for i in keep_indices]
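
    # Worked example (illustrative): two overlapping boxes with IoU 0.6 and
    # scores 0.9 / 0.8. With iou_threshold=0.5 the 0.8 box is suppressed
    # (0.6 < 0.5 is false); with iou_threshold=0.7 both are kept (0.6 < 0.7).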

    def _calculate_iou_batch(self, box: np.ndarray, boxes: np.ndarray) -> np.ndarray:
        """
        Calculate IoU between one box and multiple boxes.

        Args:
            box: Single box [xmin, ymin, xmax, ymax]
            boxes: Multiple boxes [[xmin, ymin, xmax, ymax], ...]

        Returns:
            Array of IoU values
        """
        # Intersection rectangle coordinates
        x1 = np.maximum(box[0], boxes[:, 0])
        y1 = np.maximum(box[1], boxes[:, 1])
        x2 = np.minimum(box[2], boxes[:, 2])
        y2 = np.minimum(box[3], boxes[:, 3])

        # Intersection area (zero when the boxes do not overlap)
        intersection = np.maximum(0, x2 - x1) * np.maximum(0, y2 - y1)

        # Union area
        box_area = (box[2] - box[0]) * (box[3] - box[1])
        boxes_area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
        union = box_area + boxes_area - intersection

        # IoU, with a small epsilon to avoid division by zero
        iou = intersection / (union + 1e-6)

        return iou
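
    # Worked example (illustrative): box [0, 0, 10, 10] vs [5, 5, 15, 15]:
    # intersection 5 * 5 = 25, union 100 + 100 - 25 = 175, IoU = 25/175 ~ 0.143.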

    def get_embedding(self, image: np.ndarray) -> torch.Tensor:
        """
        Get CLIP embedding for a reference logo image.

        This method is used to compute embeddings for reference logos
        that will be compared against detected regions.

        Args:
            image: OpenCV image (BGR format, numpy array)

        Returns:
            Normalized CLIP feature embedding (torch.Tensor, shape [1, 768]
            for the default clip-vit-large-patch14; [1, 512] for base models)
        """
        # Convert OpenCV BGR to RGB PIL Image
        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        pil_image = Image.fromarray(image_rgb)

        return self._get_clip_embedding_pil(pil_image)
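
    # Reference-building sketch (illustrative): precompute one embedding per
    # logo crop on disk. The directory layout and label convention are
    # assumptions for the example only.
    #
    #   import glob
    #   reference_embeddings = []
    #   for path in glob.glob("reference_logos/*.png"):
    #       label = os.path.splitext(os.path.basename(path))[0]
    #       reference_embeddings.append((label, detector.get_embedding(cv2.imread(path))))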

    def _get_clip_embedding_pil(self, pil_image: Image.Image) -> torch.Tensor:
        """
        Internal method to get a CLIP embedding from a PIL image.

        Args:
            pil_image: PIL Image (RGB format)

        Returns:
            Normalized CLIP feature embedding (torch.Tensor)
        """
        # Process the image through the CLIP processor
        inputs = self.clip_processor(images=pil_image, return_tensors="pt").to(self.device)

        with torch.no_grad():
            features = self.clip_model.get_image_features(**inputs)
            # L2-normalize so cosine similarity reduces to a dot product
            features = F.normalize(features, dim=-1)

        return features

    def compare_embeddings(
        self, embedding1: torch.Tensor, embedding2: torch.Tensor
    ) -> float:
        """
        Compute cosine similarity between two CLIP embeddings.

        Args:
            embedding1: First CLIP embedding (torch.Tensor)
            embedding2: Second CLIP embedding (torch.Tensor)

        Returns:
            Cosine similarity score (float, range -1 to 1, typically 0 to 1)
        """
        # Ensure tensors are on the same device
        if embedding1.device != embedding2.device:
            embedding2 = embedding2.to(embedding1.device)

        # Compute cosine similarity
        similarity = F.cosine_similarity(embedding1, embedding2, dim=-1)

        # Return as a Python float
        return similarity.item()
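
    # Equivalence note: because _get_clip_embedding_pil() L2-normalizes its
    # output, the cosine similarity above equals a plain dot product:
    #
    #   sim = (embedding1 * embedding2).sum(dim=-1).item()  # same value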

    def find_best_match(
        self,
        detected_embedding: torch.Tensor,
        reference_embeddings: List[Tuple[str, torch.Tensor]],
        similarity_threshold: float = 0.7,
    ) -> Optional[Tuple[str, float]]:
        """
        Find the best matching reference logo for a detected embedding.

        Args:
            detected_embedding: CLIP embedding from detected logo region
            reference_embeddings: List of (label, embedding) tuples for reference logos
            similarity_threshold: Minimum similarity to consider a match (0-1)

        Returns:
            Tuple of (label, similarity) for the best match, or None if no match
            is above the threshold
        """
        if not reference_embeddings:
            return None

        best_similarity = -1.0
        best_label = None

        for label, ref_embedding in reference_embeddings:
            similarity = self.compare_embeddings(detected_embedding, ref_embedding)

            if similarity > best_similarity:
                best_similarity = similarity
                best_label = label

        if best_similarity >= similarity_threshold:
            return (best_label, best_similarity)
        return None
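
    # Matching sketch (illustrative), reusing `reference_embeddings` from the
    # sketch after get_embedding():
    #
    #   for det in detector.detect(frame):
    #       match = detector.find_best_match(det["embedding"], reference_embeddings)
    #       if match is not None:
    #           label, sim = match
    #           print(f"{label}: {sim:.3f} at {det['box']}")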

    def find_best_match_multi_ref(
        self,
        detected_embedding: torch.Tensor,
        reference_embeddings: Dict[str, List[torch.Tensor]],
        similarity_threshold: float = 0.85,
        min_matching_refs: int = 1,
        use_mean_similarity: bool = True,
    ) -> Optional[Tuple[str, float, int]]:
        """
        Find the best matching reference logo using multiple reference embeddings per logo.

        This method improves accuracy by using multiple reference images for each logo
        and requiring consistency across references.

        Args:
            detected_embedding: CLIP embedding from detected logo region
            reference_embeddings: Dict mapping logo name to a list of embeddings
            similarity_threshold: Minimum similarity to consider a match (0-1)
            min_matching_refs: Minimum number of references that must match above threshold
            use_mean_similarity: If True, use mean similarity across all refs; if False, use max

        Returns:
            Tuple of (label, similarity, num_matching_refs) for the best match,
            or None if no match meets the criteria
        """
        if not reference_embeddings:
            return None

        best_score = -1.0
        best_label = None
        best_num_matches = 0

        for label, ref_embedding_list in reference_embeddings.items():
            if not ref_embedding_list:
                continue

            # Similarity to each reference embedding for this logo
            similarities = [
                self.compare_embeddings(detected_embedding, ref_embedding)
                for ref_embedding in ref_embedding_list
            ]

            # Count how many references match above the threshold
            num_matches = sum(1 for s in similarities if s >= similarity_threshold)

            # Aggregate score across references
            if use_mean_similarity:
                score = sum(similarities) / len(similarities)
            else:
                score = max(similarities)

            # Track the best logo that meets the minimum matching refs requirement
            if num_matches >= min_matching_refs and score > best_score:
                best_score = score
                best_label = label
                best_num_matches = num_matches

        if best_label is not None and best_score >= similarity_threshold:
            return (best_label, best_score, best_num_matches)
        return None
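
    # Multi-reference sketch (illustrative): each logo maps to several
    # embeddings (different scales, backgrounds, crops). Names and values
    # below are assumptions for the example only.
    #
    #   multi_refs = {"acme": [emb_a1, emb_a2, emb_a3], "globex": [emb_g1, emb_g2]}
    #   result = detector.find_best_match_multi_ref(
    #       det["embedding"], multi_refs, similarity_threshold=0.85, min_matching_refs=2
    #   )
    #   # e.g. ("acme", 0.88, 2): mean similarity 0.88, with 2 refs >= 0.85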

    def find_best_match_with_margin(
        self,
        detected_embedding: torch.Tensor,
        reference_embeddings: List[Tuple[str, torch.Tensor]],
        similarity_threshold: float = 0.85,
        margin: float = 0.05,
    ) -> Optional[Tuple[str, float]]:
        """
        Find the best match with a confidence margin over the second-best match.

        This reduces false positives by requiring the best match to be
        significantly better than the alternatives.

        Args:
            detected_embedding: CLIP embedding from detected logo region
            reference_embeddings: List of (label, embedding) tuples for reference logos
            similarity_threshold: Minimum similarity to consider a match (0-1)
            margin: Required margin between the best and second-best match

        Returns:
            Tuple of (label, similarity) for the best match, or None if there is
            no confident match
        """
        if not reference_embeddings:
            return None

        # Calculate all similarities
        similarities = []
        for label, ref_embedding in reference_embeddings:
            sim = self.compare_embeddings(detected_embedding, ref_embedding)
            similarities.append((label, sim))

        # Sort by similarity, descending
        similarities.sort(key=lambda x: x[1], reverse=True)

        best_label, best_sim = similarities[0]

        # Check that the best match is above the threshold
        if best_sim < similarity_threshold:
            return None

        # Check the margin against the second-best match (if one exists)
        if len(similarities) > 1:
            second_best_sim = similarities[1][1]
            if best_sim - second_best_sim < margin:
                return None  # Not confident enough

        return (best_label, best_sim)
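
    # Worked example (illustrative): similarities {"acme": 0.90, "globex": 0.88}
    # with margin=0.05 -> rejected, since 0.90 - 0.88 = 0.02 < 0.05;
    # with margin=0.01 -> ("acme", 0.90) is returned (0.02 >= 0.01).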

    def detect_and_match(
        self,
        image: np.ndarray,
        reference_embeddings: List[Tuple[str, torch.Tensor]],
        similarity_threshold: float = 0.7,
    ) -> List[Dict[str, Any]]:
        """
        Detect logos and match them against reference embeddings in one step.

        This is a convenience method that combines detection and matching.

        Args:
            image: OpenCV image (BGR format, numpy array)
            reference_embeddings: List of (label, embedding) tuples for reference logos
            similarity_threshold: Minimum similarity to consider a match (0-1)

        Returns:
            List of matched detections, each containing:
            - 'box': bounding box coordinates
            - 'detr_score': DETR confidence score
            - 'clip_similarity': CLIP similarity score
            - 'label': matched reference logo label
        """
        # Detect all logos
        detections = self.detect(image)

        # Match each detection against the references
        matched_detections = []
        for detection in detections:
            match_result = self.find_best_match(
                detection["embedding"], reference_embeddings, similarity_threshold
            )

            if match_result is not None:
                label, similarity = match_result
                matched_detections.append(
                    {
                        "box": detection["box"],
                        "detr_score": detection["score"],
                        "clip_similarity": similarity,
                        "label": label,
                    }
                )

        self.logger.debug(
            f"Matched {len(matched_detections)}/{len(detections)} detections "
            f"(threshold: {similarity_threshold})"
        )

        return matched_detections
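
# End-to-end sketch (illustrative): wiring the pieces above together. File
# paths and the logger name are assumptions for the example only.
#
#   import logging
#
#   detector = DetectLogosDETR(logging.getLogger("logo-detect"))
#   refs = [("acme", detector.get_embedding(cv2.imread("reference_logos/acme.png")))]
#   frame = cv2.imread("frame.jpg")
#   for m in detector.detect_and_match(frame, refs, similarity_threshold=0.7):
#       print(m["label"], m["clip_similarity"], m["box"])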