Compare commits

...

2 Commits

Author SHA1 Message Date
ea6fcec9ce Remove hybrid text+CLIP matching approach
The hybrid approach combined OCR text recognition with CLIP embeddings
to improve logo matching accuracy. After extensive testing, the approach
was abandoned because:

1. OCR quality on small logo crops is unreliable
2. Text filtering rejected correct matches as often as wrong ones
3. The best hybrid result (57.1% precision) was only marginally better than the baseline (55.1%)
4. Recall dropped significantly (52.6% vs 59.6% for the baseline)
5. Added complexity (EasyOCR dependency, extra parameters) wasn't justified

Removed:
- Hybrid matching methods from DetectLogosDETR class
- Text extraction and similarity methods
- Hybrid test scripts and text_recognition.py module
- Hybrid-related CLI arguments from test_logo_detection.py

The baseline multi-ref matching with 0.70 threshold remains the
recommended approach for logo detection.
2026-01-08 12:48:39 -05:00
f777b049a3 Fix EasyOCR model path to use script-relative directory 2026-01-07 15:38:23 -05:00
3 changed files with 7 additions and 594 deletions

View File

@@ -23,7 +23,6 @@ import cv2
import numpy as np
from pathlib import Path
from typing import List, Tuple, Dict, Optional, Any
from difflib import SequenceMatcher


class DetectLogosDETR:
@@ -765,311 +764,4 @@ class DetectLogosDETR:
f"(threshold: {similarity_threshold})"
)
return matched_detections
# =========================================================================
# Hybrid Text + CLIP Matching
# =========================================================================
def set_text_detector(self, text_detector) -> None:
"""
Set an optional text detector for hybrid matching.
Args:
text_detector: Instance of DetectText class from text_recognition.py
"""
self.text_detector = text_detector
self.logger.info("Text detector enabled for hybrid matching")
def extract_text(self, image: np.ndarray, min_confidence: float = 0.3) -> List[str]:
"""
Extract text from an image using the text detector.
Args:
image: OpenCV image (BGR format)
min_confidence: Minimum OCR confidence to accept text
Returns:
List of detected text strings (lowercased, stripped)
"""
if not hasattr(self, 'text_detector') or self.text_detector is None:
return []
try:
results, _ = self.text_detector.detect(image)
# Filter by confidence and normalize text
texts = []
for text, confidence in results:
if confidence >= min_confidence:
# Normalize: lowercase, strip whitespace, remove special chars
normalized = text.lower().strip()
if len(normalized) >= 2: # Ignore single characters
texts.append(normalized)
return texts
except Exception as e:
self.logger.warning(f"Text extraction failed: {e}")
return []
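    # Hypothetical illustration of the filtering above: if the detector's
    # result list were [("Nike", 0.91), ("®", 0.40)], this would return
    # ["nike"] - "Nike" is lowercased and kept, while the single-character
    # "®" fails the two-character length check.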
    def extract_text_pil(self, pil_image: Image.Image, min_confidence: float = 0.3) -> List[str]:
        """
        Extract text from a PIL image.

        Args:
            pil_image: PIL Image (RGB format)
            min_confidence: Minimum OCR confidence

        Returns:
            List of detected text strings
        """
        # Convert PIL to OpenCV format
        cv_image = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)
        return self.extract_text(cv_image, min_confidence)

    @staticmethod
    def compute_text_similarity(text1_list: List[str], text2_list: List[str]) -> float:
        """
        Compute fuzzy text similarity between two lists of text strings.

        Uses a combination of exact matches and fuzzy matching to handle
        OCR variations like case differences, spacing, and minor errors.

        Args:
            text1_list: List of text strings from first image
            text2_list: List of text strings from second image

        Returns:
            Similarity score between 0 and 1
        """
        if not text1_list or not text2_list:
            return 0.0
        # Combine all text into single strings for overall comparison
        text1_combined = " ".join(sorted(text1_list))
        text2_combined = " ".join(sorted(text2_list))
        # Method 1: Sequence matching on combined text
        seq_similarity = SequenceMatcher(None, text1_combined, text2_combined).ratio()
        # Method 2: Token overlap (Jaccard-like)
        # Split into tokens
        tokens1 = set(text1_combined.split())
        tokens2 = set(text2_combined.split())
        if tokens1 and tokens2:
            intersection = len(tokens1 & tokens2)
            union = len(tokens1 | tokens2)
            token_similarity = intersection / union if union > 0 else 0
        else:
            token_similarity = 0
        # Method 3: Best pairwise match for each text in list1
        pairwise_scores = []
        for t1 in text1_list:
            best_match = 0
            for t2 in text2_list:
                score = SequenceMatcher(None, t1, t2).ratio()
                best_match = max(best_match, score)
            pairwise_scores.append(best_match)
        pairwise_similarity = sum(pairwise_scores) / len(pairwise_scores) if pairwise_scores else 0
        # Combine methods (weighted average)
        combined = (seq_similarity * 0.3 + token_similarity * 0.3 + pairwise_similarity * 0.4)
        return combined
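    # Worked example (hypothetical inputs, not from the original tests): for
    # ["nike"] vs ["nike air"], SequenceMatcher on the combined strings gives
    # 2*4/12 ≈ 0.667, token overlap gives 1/2 = 0.5, and the best pairwise
    # ratio is ≈ 0.667, so the weighted score is
    # 0.3*0.667 + 0.3*0.5 + 0.4*0.667 ≈ 0.62.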
    @staticmethod
    def texts_match(
        ref_texts: List[str],
        det_texts: List[str],
        threshold: float = 0.5
    ) -> Tuple[bool, float]:
        """
        Determine if texts match above a threshold.

        Args:
            ref_texts: Text from reference logo
            det_texts: Text from detected region
            threshold: Minimum similarity to consider a match

        Returns:
            Tuple of (is_match, similarity_score)
        """
        if not ref_texts:
            # Reference has no text - can't match on text
            return (False, 0.0)
        if not det_texts:
            # Reference has text but detection doesn't - no text match
            return (False, 0.0)
        similarity = DetectLogosDETR.compute_text_similarity(ref_texts, det_texts)
        return (similarity >= threshold, similarity)
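    # Continuing the hypothetical example above: texts_match(["nike"],
    # ["nike air"]) would return (True, ~0.62) at the default 0.5 threshold.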
    def find_best_match_hybrid(
        self,
        detected_embedding: torch.Tensor,
        detected_image: np.ndarray,
        reference_data: Dict[str, Dict[str, Any]],
        clip_threshold: float = 0.70,
        clip_threshold_with_text: float = 0.60,
        clip_threshold_text_mismatch: float = 0.80,
        text_similarity_threshold: float = 0.5,
        margin: float = 0.05,
        use_mean_similarity: bool = False,
    ) -> Optional[Tuple[str, float, Dict[str, Any]]]:
        """
        Find best match using hybrid text + CLIP approach.

        Strategy:
        - If reference has text AND detection has matching text:
            → Use lower CLIP threshold (text provides additional confidence)
        - If reference has text but detection doesn't match:
            → Use higher CLIP threshold (need more visual confidence)
        - If reference has no text:
            → Use standard CLIP threshold

        Args:
            detected_embedding: CLIP embedding from detected logo region
            detected_image: OpenCV image of the detected region (for text extraction)
            reference_data: Dict mapping logo name to:
                {
                    'embeddings': List[torch.Tensor],  # CLIP embeddings
                    'texts': List[str],  # Extracted text from reference
                }
            clip_threshold: Standard CLIP threshold for no-text references
            clip_threshold_with_text: Lower threshold when text matches
            clip_threshold_text_mismatch: Higher threshold when text expected but missing
            text_similarity_threshold: Threshold for text matching
            margin: Required margin between best and second-best
            use_mean_similarity: Use mean vs max for multi-ref aggregation

        Returns:
            Tuple of (label, clip_similarity, match_info) or None
            match_info contains: text_matched, text_similarity, threshold_used
        """
        if not reference_data:
            return None

        # Extract text from detected region
        detected_texts = self.extract_text(detected_image)

        # Calculate scores for all logos
        logo_scores = []
        for label, ref_info in reference_data.items():
            ref_embeddings = ref_info.get('embeddings', [])
            ref_texts = ref_info.get('texts', [])
            if not ref_embeddings:
                continue

            # Calculate CLIP similarity
            similarities = []
            for ref_emb in ref_embeddings:
                sim = self.compare_embeddings(detected_embedding, ref_emb)
                similarities.append(sim)
            if use_mean_similarity:
                clip_score = sum(similarities) / len(similarities)
            else:
                clip_score = max(similarities)

            # Determine text match status and appropriate threshold
            has_ref_text = len(ref_texts) > 0
            text_matched, text_sim = self.texts_match(
                ref_texts, detected_texts, text_similarity_threshold
            )
            if has_ref_text:
                if text_matched:
                    # Text matches - use lower threshold, boost confidence
                    threshold_used = clip_threshold_with_text
                    match_type = "text_match"
                else:
                    # Reference has text but detection doesn't match
                    # Require higher CLIP threshold
                    threshold_used = clip_threshold_text_mismatch
                    match_type = "text_mismatch"
            else:
                # No text in reference - standard matching
                threshold_used = clip_threshold
                match_type = "no_text"
                text_sim = 0.0

            # Check if CLIP score meets the appropriate threshold
            if clip_score >= threshold_used:
                logo_scores.append({
                    'label': label,
                    'clip_score': clip_score,
                    'text_matched': text_matched,
                    'text_similarity': text_sim,
                    'threshold_used': threshold_used,
                    'match_type': match_type,
                    'has_ref_text': has_ref_text,
                })

        if not logo_scores:
            return None

        # Sort by CLIP score descending
        logo_scores.sort(key=lambda x: x['clip_score'], reverse=True)
        best = logo_scores[0]

        # Check margin against second-best
        if margin > 0 and len(logo_scores) > 1:
            second_best_score = logo_scores[1]['clip_score']
            if best['clip_score'] - second_best_score < margin:
                return None

        match_info = {
            'text_matched': best['text_matched'],
            'text_similarity': best['text_similarity'],
            'threshold_used': best['threshold_used'],
            'match_type': best['match_type'],
            'has_ref_text': best['has_ref_text'],
            'detected_texts': detected_texts,
        }
        return (best['label'], best['clip_score'], match_info)
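    # Numeric sketch of the strategy (hypothetical scores, default thresholds):
    # a crop with clip_score 0.65 passes when its OCR text matches the
    # reference (threshold drops to 0.60), fails when the reference has text
    # that the crop does not match (threshold rises to 0.80), and fails
    # against a text-free reference (standard 0.70 threshold applies).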
    def prepare_reference_data_hybrid(
        self,
        reference_images: Dict[str, List[np.ndarray]],
        text_min_confidence: float = 0.3,
    ) -> Dict[str, Dict[str, Any]]:
        """
        Prepare reference data for hybrid matching by computing embeddings and extracting text.

        Args:
            reference_images: Dict mapping logo name to list of reference images (OpenCV BGR)
            text_min_confidence: Minimum confidence for text extraction

        Returns:
            Dict mapping logo name to {'embeddings': [...], 'texts': [...]}
        """
        reference_data = {}
        for logo_name, images in reference_images.items():
            embeddings = []
            all_texts = set()
            for img in images:
                # Compute CLIP embedding
                emb = self.get_embedding(img)
                embeddings.append(emb)
                # Extract text
                texts = self.extract_text(img, text_min_confidence)
                all_texts.update(texts)
            reference_data[logo_name] = {
                'embeddings': embeddings,
                'texts': list(all_texts),
            }
            if all_texts:
                self.logger.debug(f"Reference '{logo_name}' has text: {all_texts}")
        return reference_data
        return matched_detections
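For orientation, a minimal sketch of how the removed methods fit together. `refs` and `crop` are illustrative placeholders, `DetectText` is the class from the deleted text_recognition.py module, and the `DetectLogosDETR` constructor arguments are elided since they are not shown in this diff:

from text_recognition import DetectText  # module removed by this commit

detector = DetectLogosDETR()  # illustrative; real constructor args not shown here
detector.set_text_detector(DetectText(logger=detector.logger, threshold=0.3))

# refs: {"brand": [bgr_image, ...]} - reference crops per logo (placeholder)
reference_data = detector.prepare_reference_data_hybrid(refs)

# crop: BGR image of one detected region (placeholder)
embedding = detector.get_embedding(crop)
match = detector.find_best_match_hybrid(embedding, crop, reference_data)
if match:
    label, clip_sim, info = match
    print(label, clip_sim, info["match_type"])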

View File

@@ -1,168 +0,0 @@
#!/bin/bash
#
# Test the hybrid text+CLIP matching approach for logo detection.
#
# This approach uses text recognition to improve logo matching:
#   - If reference logo has text and detection matches it: use lower CLIP threshold
#   - If reference logo has text but detection doesn't match: use higher CLIP threshold
#   - If reference logo has no text: use standard CLIP threshold
#
# Usage:
#   ./run_hybrid_test.sh
#

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
OUTPUT_FILE="${SCRIPT_DIR}/test_results/hybrid_matching_results.txt"

# Model - baseline CLIP
MODEL="openai/clip-vit-large-patch14"

# Fixed parameters
NUM_LOGOS=20
REFS_PER_LOGO=10
POSITIVE_SAMPLES=20
NEGATIVE_SAMPLES=100
SEED=42

# Create output directory if needed
mkdir -p "${SCRIPT_DIR}/test_results"

# Clear output file and write header
cat > "$OUTPUT_FILE" << EOF
Hybrid Text+CLIP Matching Test Results
======================================
Date: $(date)
Model: ${MODEL}

Fixed Parameters:
  Number of logo brands: ${NUM_LOGOS}
  Refs per logo: ${REFS_PER_LOGO}
  Positive samples/logo: ${POSITIVE_SAMPLES}
  Negative samples/logo: ${NEGATIVE_SAMPLES}
  Seed: ${SEED}
EOF
echo "Hybrid Text+CLIP Matching Test"
echo "==============================="
echo "Model: ${MODEL}"
echo ""
# Test 1: Compare hybrid vs multi-ref baseline
echo "=== Test 1: Multi-ref baseline (for comparison) ==="
echo "" >> "$OUTPUT_FILE"
echo "=== BASELINE: Multi-ref (max) at threshold 0.70 ===" >> "$OUTPUT_FILE"
uv run python "$SCRIPT_DIR/test_logo_detection.py" \
--num-logos $NUM_LOGOS \
--refs-per-logo $REFS_PER_LOGO \
--positive-samples $POSITIVE_SAMPLES \
--negative-samples $NEGATIVE_SAMPLES \
--matching-method multi-ref \
--min-matching-refs 1 \
--use-max-similarity \
--threshold 0.70 \
--margin 0.05 \
--seed $SEED \
--embedding-model "$MODEL" \
--output-file "$OUTPUT_FILE" \
--no-cache
echo ""

# Test 2: Hybrid with default thresholds
echo "=== Test 2: Hybrid with default thresholds ==="
echo "" >> "$OUTPUT_FILE"
echo "=== HYBRID: default thresholds (0.70/0.60/0.80) ===" >> "$OUTPUT_FILE"
uv run python "$SCRIPT_DIR/test_logo_detection.py" \
    --num-logos $NUM_LOGOS \
    --refs-per-logo $REFS_PER_LOGO \
    --positive-samples $POSITIVE_SAMPLES \
    --negative-samples $NEGATIVE_SAMPLES \
    --matching-method hybrid \
    --threshold 0.70 \
    --hybrid-text-threshold 0.60 \
    --hybrid-no-text-threshold 0.80 \
    --text-similarity-threshold 0.5 \
    --margin 0.05 \
    --seed $SEED \
    --embedding-model "$MODEL" \
    --output-file "$OUTPUT_FILE" \
    --no-cache
echo ""

# Test 3: Hybrid with more aggressive text bonus
echo "=== Test 3: Hybrid with lower text-match threshold ==="
echo "" >> "$OUTPUT_FILE"
echo "=== HYBRID: aggressive text bonus (0.70/0.55/0.80) ===" >> "$OUTPUT_FILE"
uv run python "$SCRIPT_DIR/test_logo_detection.py" \
    --num-logos $NUM_LOGOS \
    --refs-per-logo $REFS_PER_LOGO \
    --positive-samples $POSITIVE_SAMPLES \
    --negative-samples $NEGATIVE_SAMPLES \
    --matching-method hybrid \
    --threshold 0.70 \
    --hybrid-text-threshold 0.55 \
    --hybrid-no-text-threshold 0.80 \
    --text-similarity-threshold 0.5 \
    --margin 0.05 \
    --seed $SEED \
    --embedding-model "$MODEL" \
    --output-file "$OUTPUT_FILE" \
    --no-cache
echo ""

# Test 4: Hybrid with stricter text mismatch penalty
echo "=== Test 4: Hybrid with stricter text mismatch penalty ==="
echo "" >> "$OUTPUT_FILE"
echo "=== HYBRID: strict mismatch (0.70/0.60/0.85) ===" >> "$OUTPUT_FILE"
uv run python "$SCRIPT_DIR/test_logo_detection.py" \
    --num-logos $NUM_LOGOS \
    --refs-per-logo $REFS_PER_LOGO \
    --positive-samples $POSITIVE_SAMPLES \
    --negative-samples $NEGATIVE_SAMPLES \
    --matching-method hybrid \
    --threshold 0.70 \
    --hybrid-text-threshold 0.60 \
    --hybrid-no-text-threshold 0.85 \
    --text-similarity-threshold 0.5 \
    --margin 0.05 \
    --seed $SEED \
    --embedding-model "$MODEL" \
    --output-file "$OUTPUT_FILE" \
    --no-cache
echo ""

# Test 5: Hybrid with lower text similarity threshold (more lenient OCR matching)
echo "=== Test 5: Hybrid with lenient text matching ==="
echo "" >> "$OUTPUT_FILE"
echo "=== HYBRID: lenient text matching (text_sim=0.4) ===" >> "$OUTPUT_FILE"
uv run python "$SCRIPT_DIR/test_logo_detection.py" \
    --num-logos $NUM_LOGOS \
    --refs-per-logo $REFS_PER_LOGO \
    --positive-samples $POSITIVE_SAMPLES \
    --negative-samples $NEGATIVE_SAMPLES \
    --matching-method hybrid \
    --threshold 0.70 \
    --hybrid-text-threshold 0.60 \
    --hybrid-no-text-threshold 0.80 \
    --text-similarity-threshold 0.4 \
    --margin 0.05 \
    --seed $SEED \
    --embedding-model "$MODEL" \
    --output-file "$OUTPUT_FILE" \
    --no-cache
echo ""
echo "======================================="
echo "Tests complete!"
echo "Results saved to: $OUTPUT_FILE"
echo "======================================="

View File

@@ -243,12 +243,11 @@ def main():
    parser.add_argument(
        "--matching-method",
        type=str,
        choices=["simple", "margin", "multi-ref", "hybrid"],
        choices=["simple", "margin", "multi-ref"],
        default="margin",
        help="Matching method: 'simple' returns all matches above threshold, "
        "'margin' requires confidence margin over 2nd best, "
        "'multi-ref' aggregates scores across reference images, "
        "'hybrid' combines text recognition with CLIP (default: margin)",
        "'multi-ref' aggregates scores across reference images (default: margin)",
    )
    parser.add_argument(
        "--min-matching-refs",
@@ -261,25 +260,6 @@ def main():
        action="store_true",
        help="For 'multi-ref' method: use max similarity instead of mean across references",
    )
    # Hybrid method arguments
    parser.add_argument(
        "--hybrid-text-threshold",
        type=float,
        default=0.60,
        help="For 'hybrid' method: CLIP threshold when text matches (default: 0.60)",
    )
    parser.add_argument(
        "--hybrid-no-text-threshold",
        type=float,
        default=0.80,
        help="For 'hybrid' method: CLIP threshold when text expected but not found (default: 0.80)",
    )
    parser.add_argument(
        "--text-similarity-threshold",
        type=float,
        default=0.5,
        help="For 'hybrid' method: minimum text similarity to consider a match (default: 0.5)",
    )
    parser.add_argument(
        "-v", "--verbose",
        action="store_true",
@@ -352,14 +332,6 @@ def main():
        preprocess_mode=args.preprocess_mode,
    )

    # Initialize text detector for hybrid method
    text_detector = None
    if args.matching_method == "hybrid":
        logger.info("Initializing text detector for hybrid matching...")
        from text_recognition import DetectText
        text_detector = DetectText(logger=logger, threshold=0.3)
        detector.set_text_detector(text_detector)

    # Load ground truth (both mappings)
    logger.info("Loading ground truth from database...")
    image_to_logos, logo_to_images = get_ground_truth(db_path)
@@ -377,15 +349,10 @@ def main():
    multi_ref_embeddings: Dict[str, List[torch.Tensor]] = {}
    # List for margin-based matching: (logo_name, embedding) tuples
    reference_embeddings: List[Tuple[str, torch.Tensor]] = []
    # Dict for hybrid matching: logo_name -> {'embeddings': [...], 'texts': [...]}
    hybrid_reference_data: Dict[str, Dict[str, Any]] = {}
    total_refs = 0
    logos_with_text = 0

    for logo_name, ref_filenames in tqdm(sampled_logos.items(), desc="Reference logos"):
        multi_ref_embeddings[logo_name] = []
        if args.matching_method == "hybrid":
            hybrid_reference_data[logo_name] = {'embeddings': [], 'texts': set()}

        for ref_filename in ref_filenames:
            ref_path = reference_dir / ref_filename
@@ -398,15 +365,12 @@ def main():
            cache_key = f"ref:{ref_filename}"
            embedding = cache.get(cache_key) if cache else None

            # Load image if needed (for embedding or text extraction)
            img = None
            if embedding is None or args.matching_method == "hybrid":
            # Load image if needed for embedding
            if embedding is None:
                img = load_image(ref_path)
                if img is None:
                    logger.warning(f"Failed to load reference logo: {ref_path}")
                    continue
                if embedding is None:
                embedding = detector.get_embedding(img)
                if cache:
                    cache.put(cache_key, embedding)
@@ -415,21 +379,7 @@ def main():
            reference_embeddings.append((logo_name, embedding))
            total_refs += 1

            # Extract text for hybrid method
            if args.matching_method == "hybrid" and img is not None:
                hybrid_reference_data[logo_name]['embeddings'].append(embedding)
                texts = detector.extract_text(img, min_confidence=0.3)
                hybrid_reference_data[logo_name]['texts'].update(texts)

        # Convert text set to list for hybrid data
        if args.matching_method == "hybrid":
            hybrid_reference_data[logo_name]['texts'] = list(hybrid_reference_data[logo_name]['texts'])
            if hybrid_reference_data[logo_name]['texts']:
                logos_with_text += 1

    logger.info(f"Computed {total_refs} embeddings for {len(sampled_logos)} logos")
    if args.matching_method == "hybrid":
        logger.info(f"Extracted text from {logos_with_text}/{len(sampled_logos)} reference logos")

    # Build test set: for each logo, sample positive and negative images
    logger.info(f"Sampling test images: {args.positive_samples} positive, {args.negative_samples} negative per logo...")
@@ -504,14 +454,7 @@ def main():
        cache_key = f"det:{test_filename}"
        cached_detections = cache.get(cache_key) if cache else None

        # For hybrid matching, we always need the original image for text extraction
        test_img = None
        if args.matching_method == "hybrid":
            test_img = load_image(test_path)
            if test_img is None:
                logger.warning(f"Failed to load test image: {test_path}")
                continue

        if cached_detections is not None:
            # Cached detections contain serialized box data and embeddings
            detections = cached_detections
@@ -651,50 +594,6 @@ def main():
                        "correct": is_correct,
                    })
            else:  # hybrid
                # Hybrid matching: combines text recognition with CLIP
                # Extract crop from original image for text extraction
                box = detection["box"]
                crop = test_img[
                    int(box["ymin"]):int(box["ymax"]),
                    int(box["xmin"]):int(box["xmax"])
                ]
                match_result = detector.find_best_match_hybrid(
                    detected_embedding=detection["embedding"],
                    detected_image=crop,
                    reference_data=hybrid_reference_data,
                    clip_threshold=args.threshold,
                    clip_threshold_with_text=args.hybrid_text_threshold,
                    clip_threshold_text_mismatch=args.hybrid_no_text_threshold,
                    text_similarity_threshold=args.text_similarity_threshold,
                    margin=args.margin,
                    use_mean_similarity=not args.use_max_similarity,
                )

                if match_result:
                    label, similarity, match_info = match_result
                    matched_logos.add(label)
                    is_correct = label in expected_logos
                    if is_correct:
                        true_positives += 1
                        if args.similarity_details:
                            similarity_details["true_positive_sims"].append(similarity)
                    else:
                        false_positives += 1
                        if args.similarity_details:
                            similarity_details["false_positive_sims"].append(similarity)
                    results.append({
                        "test_image": test_filename,
                        "matched_logo": label,
                        "similarity": similarity,
                        "correct": is_correct,
                        "text_matched": match_info.get("text_matched", False),
                        "text_similarity": match_info.get("text_similarity", 0),
                        "match_type": match_info.get("match_type", "unknown"),
                    })

        # Count missed detections (false negatives)
        missed = expected_logos - matched_logos
        false_negatives += len(missed)
@@ -742,16 +641,11 @@ def main():
    print(f"  DETR confidence threshold: {args.detr_threshold}")
    print(f"  Preprocess mode: {args.preprocess_mode}")
    print(f"  Matching method: {args.matching_method}")
    if args.matching_method in ("margin", "multi-ref", "hybrid"):
    if args.matching_method in ("margin", "multi-ref"):
        print(f"  Matching margin: {args.margin}")
    if args.matching_method == "multi-ref":
        print(f"  Min matching refs: {args.min_matching_refs}")
        print(f"  Similarity aggregation: {'max' if args.use_max_similarity else 'mean'}")
    if args.matching_method == "hybrid":
        print(f"  CLIP threshold (text match): {args.hybrid_text_threshold}")
        print(f"  CLIP threshold (no text): {args.hybrid_no_text_threshold}")
        print(f"  Text similarity threshold: {args.text_similarity_threshold}")
        print(f"  Refs with text: {logos_with_text}/{len(sampled_logos)}")
    if args.seed is not None:
        print(f"  Random seed: {args.seed}")
@@ -939,14 +833,9 @@ def write_results_to_file(
        method_desc = "Simple (all matches above threshold)"
    elif args.matching_method == "margin":
        method_desc = f"Margin-based (margin={args.margin})"
    elif args.matching_method == "multi-ref":
    else:  # multi-ref
        agg = "max" if args.use_max_similarity else "mean"
        method_desc = f"Multi-ref ({agg}, min_refs={args.min_matching_refs}, margin={args.margin})"
    else:  # hybrid
        method_desc = (
            f"Hybrid (text+CLIP, text_thresh={args.hybrid_text_threshold}, "
            f"no_text_thresh={args.hybrid_no_text_threshold}, margin={args.margin})"
        )

    lines = [
        "=" * 70,