Remove hybrid text+CLIP matching approach

The hybrid approach combined OCR text recognition with CLIP embeddings
to improve logo matching accuracy. After extensive testing, the approach
was abandoned because:

1. OCR quality on small logo crops is unreliable
2. Text filtering rejected correct matches as often as wrong ones
3. The best hybrid result (57.1% precision) was comparable to the baseline (55.1%)
4. Recall dropped significantly (52.6% vs. the baseline's 59.6%)
5. The added complexity (EasyOCR dependency, extra parameters) was not justified

Removed:
- Hybrid matching methods from DetectLogosDETR class
- Text extraction and similarity methods
- Hybrid test scripts and text_recognition.py module
- Hybrid-related CLI arguments from test_logo_detection.py

The baseline multi-ref matching with 0.70 threshold remains the
recommended approach for logo detection.
This commit is contained in:
Rick McEwen
2026-01-08 12:48:39 -05:00
parent f777b049a3
commit ea6fcec9ce
4 changed files with 7 additions and 646 deletions

View File

@ -243,12 +243,11 @@ def main():
parser.add_argument(
"--matching-method",
type=str,
choices=["simple", "margin", "multi-ref", "hybrid"],
choices=["simple", "margin", "multi-ref"],
default="margin",
help="Matching method: 'simple' returns all matches above threshold, "
"'margin' requires confidence margin over 2nd best, "
"'multi-ref' aggregates scores across reference images, "
"'hybrid' combines text recognition with CLIP (default: margin)",
"'multi-ref' aggregates scores across reference images (default: margin)",
)
parser.add_argument(
"--min-matching-refs",
@ -261,25 +260,6 @@ def main():
action="store_true",
help="For 'multi-ref' method: use max similarity instead of mean across references",
)
# Hybrid method arguments
parser.add_argument(
"--hybrid-text-threshold",
type=float,
default=0.60,
help="For 'hybrid' method: CLIP threshold when text matches (default: 0.60)",
)
parser.add_argument(
"--hybrid-no-text-threshold",
type=float,
default=0.80,
help="For 'hybrid' method: CLIP threshold when text expected but not found (default: 0.80)",
)
parser.add_argument(
"--text-similarity-threshold",
type=float,
default=0.5,
help="For 'hybrid' method: minimum text similarity to consider a match (default: 0.5)",
)
parser.add_argument(
"-v", "--verbose",
action="store_true",
@ -352,14 +332,6 @@ def main():
preprocess_mode=args.preprocess_mode,
)
# Initialize text detector for hybrid method
text_detector = None
if args.matching_method == "hybrid":
logger.info("Initializing text detector for hybrid matching...")
from text_recognition import DetectText
text_detector = DetectText(logger=logger, threshold=0.3)
detector.set_text_detector(text_detector)
# Load ground truth (both mappings)
logger.info("Loading ground truth from database...")
image_to_logos, logo_to_images = get_ground_truth(db_path)
@ -377,15 +349,10 @@ def main():
multi_ref_embeddings: Dict[str, List[torch.Tensor]] = {}
# List for margin-based matching: (logo_name, embedding) tuples
reference_embeddings: List[Tuple[str, torch.Tensor]] = []
# Dict for hybrid matching: logo_name -> {'embeddings': [...], 'texts': [...]}
hybrid_reference_data: Dict[str, Dict[str, Any]] = {}
total_refs = 0
logos_with_text = 0
for logo_name, ref_filenames in tqdm(sampled_logos.items(), desc="Reference logos"):
multi_ref_embeddings[logo_name] = []
if args.matching_method == "hybrid":
hybrid_reference_data[logo_name] = {'embeddings': [], 'texts': set()}
for ref_filename in ref_filenames:
ref_path = reference_dir / ref_filename
@ -398,15 +365,12 @@ def main():
cache_key = f"ref:{ref_filename}"
embedding = cache.get(cache_key) if cache else None
# Load image if needed (for embedding or text extraction)
img = None
if embedding is None or args.matching_method == "hybrid":
# Load image if needed for embedding
if embedding is None:
img = load_image(ref_path)
if img is None:
logger.warning(f"Failed to load reference logo: {ref_path}")
continue
if embedding is None:
embedding = detector.get_embedding(img)
if cache:
cache.put(cache_key, embedding)
@ -415,21 +379,7 @@ def main():
reference_embeddings.append((logo_name, embedding))
total_refs += 1
# Extract text for hybrid method
if args.matching_method == "hybrid" and img is not None:
hybrid_reference_data[logo_name]['embeddings'].append(embedding)
texts = detector.extract_text(img, min_confidence=0.3)
hybrid_reference_data[logo_name]['texts'].update(texts)
# Convert text set to list for hybrid data
if args.matching_method == "hybrid":
hybrid_reference_data[logo_name]['texts'] = list(hybrid_reference_data[logo_name]['texts'])
if hybrid_reference_data[logo_name]['texts']:
logos_with_text += 1
logger.info(f"Computed {total_refs} embeddings for {len(sampled_logos)} logos")
if args.matching_method == "hybrid":
logger.info(f"Extracted text from {logos_with_text}/{len(sampled_logos)} reference logos")
# Build test set: for each logo, sample positive and negative images
logger.info(f"Sampling test images: {args.positive_samples} positive, {args.negative_samples} negative per logo...")
@ -504,14 +454,7 @@ def main():
cache_key = f"det:{test_filename}"
cached_detections = cache.get(cache_key) if cache else None
# For hybrid matching, we always need the original image for text extraction
test_img = None
if args.matching_method == "hybrid":
test_img = load_image(test_path)
if test_img is None:
logger.warning(f"Failed to load test image: {test_path}")
continue
if cached_detections is not None:
# Cached detections contain serialized box data and embeddings
detections = cached_detections
@ -651,50 +594,6 @@ def main():
"correct": is_correct,
})
else: # hybrid
# Hybrid matching: combines text recognition with CLIP
# Extract crop from original image for text extraction
box = detection["box"]
crop = test_img[
int(box["ymin"]):int(box["ymax"]),
int(box["xmin"]):int(box["xmax"])
]
match_result = detector.find_best_match_hybrid(
detected_embedding=detection["embedding"],
detected_image=crop,
reference_data=hybrid_reference_data,
clip_threshold=args.threshold,
clip_threshold_with_text=args.hybrid_text_threshold,
clip_threshold_text_mismatch=args.hybrid_no_text_threshold,
text_similarity_threshold=args.text_similarity_threshold,
margin=args.margin,
use_mean_similarity=not args.use_max_similarity,
)
if match_result:
label, similarity, match_info = match_result
matched_logos.add(label)
is_correct = label in expected_logos
if is_correct:
true_positives += 1
if args.similarity_details:
similarity_details["true_positive_sims"].append(similarity)
else:
false_positives += 1
if args.similarity_details:
similarity_details["false_positive_sims"].append(similarity)
results.append({
"test_image": test_filename,
"matched_logo": label,
"similarity": similarity,
"correct": is_correct,
"text_matched": match_info.get("text_matched", False),
"text_similarity": match_info.get("text_similarity", 0),
"match_type": match_info.get("match_type", "unknown"),
})
# Count missed detections (false negatives)
missed = expected_logos - matched_logos
false_negatives += len(missed)
@ -742,16 +641,11 @@ def main():
print(f" DETR confidence threshold: {args.detr_threshold}")
print(f" Preprocess mode: {args.preprocess_mode}")
print(f" Matching method: {args.matching_method}")
if args.matching_method in ("margin", "multi-ref", "hybrid"):
if args.matching_method in ("margin", "multi-ref"):
print(f" Matching margin: {args.margin}")
if args.matching_method == "multi-ref":
print(f" Min matching refs: {args.min_matching_refs}")
print(f" Similarity aggregation: {'max' if args.use_max_similarity else 'mean'}")
if args.matching_method == "hybrid":
print(f" CLIP threshold (text match): {args.hybrid_text_threshold}")
print(f" CLIP threshold (no text): {args.hybrid_no_text_threshold}")
print(f" Text similarity threshold: {args.text_similarity_threshold}")
print(f" Refs with text: {logos_with_text}/{len(sampled_logos)}")
if args.seed is not None:
print(f" Random seed: {args.seed}")
@ -939,14 +833,9 @@ def write_results_to_file(
method_desc = "Simple (all matches above threshold)"
elif args.matching_method == "margin":
method_desc = f"Margin-based (margin={args.margin})"
elif args.matching_method == "multi-ref":
else: # multi-ref
agg = "max" if args.use_max_similarity else "mean"
method_desc = f"Multi-ref ({agg}, min_refs={args.min_matching_refs}, margin={args.margin})"
else: # hybrid
method_desc = (
f"Hybrid (text+CLIP, text_thresh={args.hybrid_text_threshold}, "
f"no_text_thresh={args.hybrid_no_text_threshold}, margin={args.margin})"
)
lines = [
"=" * 70,