Compare commits
4 Commits
569285f664
...
78f46f04bf
| Author | SHA1 | Date | |
|---|---|---|---|
| 78f46f04bf | |||
| b5432c9ef7 | |||
| 440e8fcdb4 | |||
| 2f28aa6052 |
132
run_refs_per_logo_test.sh
Executable file
132
run_refs_per_logo_test.sh
Executable file
@ -0,0 +1,132 @@
|
|||||||
|
#!/bin/bash
#
# Test different numbers of reference logos per brand to find optimal setting.
# Uses baseline CLIP with multi-ref (max) matching method.
#
# Usage:
#   ./run_refs_per_logo_test.sh
#
# Writes a summary table plus per-run detail to
# test_results/refs_per_logo_analysis.txt and reports the best F1 setting.

# -u: fail on unset variables. Deliberately NOT using -e/-o pipefail: the
# metric-extraction greps legitimately return non-zero when a metric line is
# absent from a failed run, and that case is handled explicitly below.
set -u

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
OUTPUT_FILE="${SCRIPT_DIR}/test_results/refs_per_logo_analysis.txt"

# Model - baseline CLIP (best for unknown logos)
MODEL="openai/clip-vit-large-patch14"

# Fixed parameters
NUM_LOGOS=20
POSITIVE_SAMPLES=20
NEGATIVE_SAMPLES=100
MIN_MATCHING_REFS=1
THRESHOLD=0.70
MARGIN=0.05
SEED=42

# Refs per logo values to test
REFS_TO_TEST="1 2 3 5 7 10 15 20"

# Create output directory if needed; abort early if that fails since every
# later step writes into it.
mkdir -p "${SCRIPT_DIR}/test_results" || {
  echo "Error: cannot create ${SCRIPT_DIR}/test_results" >&2
  exit 1
}

# Clear output file and write header
cat > "$OUTPUT_FILE" << EOF
Reference Logos Per Brand Optimization
======================================
Date: $(date)

Model: ${MODEL}
Method: multi-ref (max)

Fixed Parameters:
Number of logo brands: ${NUM_LOGOS}
Similarity threshold: ${THRESHOLD}
Margin: ${MARGIN}
Min matching refs: ${MIN_MATCHING_REFS}
Positive samples/logo: ${POSITIVE_SAMPLES}
Negative samples/logo: ${NEGATIVE_SAMPLES}
Seed: ${SEED}

Testing refs per logo: ${REFS_TO_TEST}

EOF

echo "Reference Logos Per Brand Optimization"
echo "======================================="
echo "Model: ${MODEL}"
echo "Testing refs per logo: ${REFS_TO_TEST}"
echo ""

# Results table header
echo "Results Summary:" >> "$OUTPUT_FILE"
echo "----------------" >> "$OUTPUT_FILE"
printf "%-12s %8s %8s %8s %8s %8s %8s\n" "Refs/Logo" "TP" "FP" "FN" "Prec" "Recall" "F1" >> "$OUTPUT_FILE"
echo "------------------------------------------------------------------------" >> "$OUTPUT_FILE"

# Track best result
BEST_F1=0
BEST_REFS=0

# REFS_TO_TEST is intentionally unquoted: word-splitting yields one value
# per iteration.
for REFS in ${REFS_TO_TEST}; do
  echo "=== Testing refs_per_logo=${REFS} ==="

  # Run test and capture output (stderr included so failures are searchable)
  OUTPUT=$(uv run python "$SCRIPT_DIR/test_logo_detection.py" \
    --num-logos "$NUM_LOGOS" \
    --refs-per-logo "$REFS" \
    --positive-samples "$POSITIVE_SAMPLES" \
    --negative-samples "$NEGATIVE_SAMPLES" \
    --matching-method multi-ref \
    --min-matching-refs "$MIN_MATCHING_REFS" \
    --use-max-similarity \
    --threshold "$THRESHOLD" \
    --margin "$MARGIN" \
    --seed "$SEED" \
    --embedding-model "$MODEL" \
    2>&1)

  # Extract metrics from the report. Each variable ends up empty if the
  # corresponding line is missing (e.g. the run crashed).
  TP=$(echo "${OUTPUT}" | grep "True Positives" | grep -oE "[0-9]+" | head -1)
  FP=$(echo "${OUTPUT}" | grep "False Positives" | grep -oE "[0-9]+" | head -1)
  FN=$(echo "${OUTPUT}" | grep "False Negatives" | grep -oE "[0-9]+" | head -1)
  PREC=$(echo "${OUTPUT}" | grep "Precision:" | grep -oE "[0-9]+\.[0-9]+%" | head -1)
  RECALL=$(echo "${OUTPUT}" | grep "Recall:" | grep -oE "[0-9]+\.[0-9]+%" | head -1)
  F1=$(echo "${OUTPUT}" | grep "F1 Score:" | grep -oE "[0-9]+\.[0-9]+%" | head -1)

  # Print to console
  echo "  TP: ${TP}, FP: ${FP}, FN: ${FN}"
  echo "  Precision: ${PREC}, Recall: ${RECALL}, F1: ${F1}"
  echo ""

  # Add to results table
  printf "%-12s %8s %8s %8s %8s %8s %8s\n" "${REFS}" "${TP}" "${FP}" "${FN}" "${PREC}" "${RECALL}" "${F1}" >> "$OUTPUT_FILE"

  # Track best F1. Float comparison is done with awk rather than bc: awk is
  # guaranteed present on POSIX systems, whereas bc may be missing (the old
  # '|| echo 0' fallback silently disabled best-result tracking in that case).
  F1_NUM=$(echo "${F1}" | tr -d '%')
  if [ -n "$F1_NUM" ]; then
    BETTER=$(awk -v cur="$F1_NUM" -v best="$BEST_F1" 'BEGIN { print (cur > best) ? 1 : 0 }')
    if [ "$BETTER" = "1" ]; then
      BEST_F1="${F1_NUM}"
      BEST_REFS="${REFS}"
    fi
  fi

  # Also append full output for this test
  echo "" >> "$OUTPUT_FILE"
  echo "======================================================================" >> "$OUTPUT_FILE"
  echo "DETAILED RESULTS: refs_per_logo=${REFS}" >> "$OUTPUT_FILE"
  echo "======================================================================" >> "$OUTPUT_FILE"
  echo "${OUTPUT}" | grep -A 50 "Configuration:" | head -30 >> "$OUTPUT_FILE"
  echo "" >> "$OUTPUT_FILE"
done

# Summary
echo "------------------------------------------------------------------------" >> "$OUTPUT_FILE"
echo "" >> "$OUTPUT_FILE"
echo "OPTIMAL SETTING: refs_per_logo=${BEST_REFS} (F1 = ${BEST_F1}%)" >> "$OUTPUT_FILE"
echo "" >> "$OUTPUT_FILE"

echo "======================================="
echo "OPTIMAL: refs_per_logo=${BEST_REFS} (F1 = ${BEST_F1}%)"
echo "======================================="
echo ""
echo "Results saved to: $OUTPUT_FILE"
||||||
181
run_threshold_tests_image_split.sh
Executable file
181
run_threshold_tests_image_split.sh
Executable file
@ -0,0 +1,181 @@
|
|||||||
|
#!/bin/bash
#
# Run logo detection tests with the image-split fine-tuned model.
# Tests various threshold and margin settings to find optimal parameters.
#
# Usage:
#   ./run_threshold_tests_image_split.sh
#
# Appends results for each (threshold, margin) setting to
# threshold_test_results_image_split.txt.

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
OUTPUT_FILE="${SCRIPT_DIR}/threshold_test_results_image_split.txt"

# Model path
MODEL_PATH="models/logo_detection/clip_finetuned_image_split"

# Common parameters
NUM_LOGOS=20
REFS_PER_LOGO=10
POSITIVE_SAMPLES=20
NEGATIVE_SAMPLES=100
MIN_MATCHING_REFS=3
SEED=42

# Check if model exists
if [ ! -d "${SCRIPT_DIR}/${MODEL_PATH}" ]; then
  echo "Error: Image-split model not found at ${SCRIPT_DIR}/${MODEL_PATH}"
  echo "Train the model first with: python train_clip_logo.py --config configs/cloud_rtx4090_image_split.yaml"
  exit 1
fi

# Clear output file and write header
echo "Threshold Optimization Tests - Image-Split Model" > "$OUTPUT_FILE"
echo "=================================================" >> "$OUTPUT_FILE"
echo "Date: $(date)" >> "$OUTPUT_FILE"
echo "" >> "$OUTPUT_FILE"
echo "Model: ${MODEL_PATH}" >> "$OUTPUT_FILE"
echo "" >> "$OUTPUT_FILE"
echo "Common Parameters:" >> "$OUTPUT_FILE"
echo " Matching method: multi-ref (max)" >> "$OUTPUT_FILE"
echo " Reference logos: $NUM_LOGOS" >> "$OUTPUT_FILE"
echo " Refs per logo: $REFS_PER_LOGO" >> "$OUTPUT_FILE"
echo " Positive samples: $POSITIVE_SAMPLES" >> "$OUTPUT_FILE"
echo " Negative samples: $NEGATIVE_SAMPLES" >> "$OUTPUT_FILE"
echo " Min matching refs: $MIN_MATCHING_REFS" >> "$OUTPUT_FILE"
echo " Seed: $SEED" >> "$OUTPUT_FILE"
echo "" >> "$OUTPUT_FILE"

echo "Running threshold optimization tests for image-split model..."
echo " Model: ${MODEL_PATH}"
echo " Matching method: multi-ref (max)"
echo " Reference logos: $NUM_LOGOS"
echo " Refs per logo: $REFS_PER_LOGO"
echo " Seed: $SEED"
echo ""

# run_test TEST_NUM THRESHOLD MARGIN
#   Runs one detection test with the common parameters above plus the given
#   similarity threshold and margin, appending results to $OUTPUT_FILE.
#   (Extracted because the original script repeated this invocation verbatim
#   seven times, differing only in these three values.)
run_test() {
  local test_num=$1
  local threshold=$2
  local margin=$3

  echo "=== Test ${test_num}: threshold=${threshold}, margin=${margin} ==="
  uv run python "$SCRIPT_DIR/test_logo_detection.py" \
    --num-logos "$NUM_LOGOS" \
    --refs-per-logo "$REFS_PER_LOGO" \
    --positive-samples "$POSITIVE_SAMPLES" \
    --negative-samples "$NEGATIVE_SAMPLES" \
    --matching-method multi-ref \
    --min-matching-refs "$MIN_MATCHING_REFS" \
    --use-max-similarity \
    --threshold "$threshold" \
    --margin "$margin" \
    --seed "$SEED" \
    --embedding-model "$MODEL_PATH" \
    --output-file "$OUTPUT_FILE"

  echo ""
}

# Test 1: Lower threshold (image-split model may have different distribution)
run_test 1 0.65 0.05

# Test 2: Default threshold
run_test 2 0.70 0.05

# Test 3: threshold=0.75
run_test 3 0.75 0.05

# Test 4: threshold=0.80
run_test 4 0.80 0.05

# Test 5: threshold=0.80 with larger margin
run_test 5 0.80 0.10

# Test 6: threshold=0.85
run_test 6 0.85 0.10

# Test 7: threshold=0.90
run_test 7 0.90 0.10

echo "Results saved to: $OUTPUT_FILE"
||||||
216
test_results/FINAL_MODEL_ANALYSIS.md
Normal file
216
test_results/FINAL_MODEL_ANALYSIS.md
Normal file
@ -0,0 +1,216 @@
|
|||||||
|
# Logo Recognition Model Analysis
|
||||||
|
|
||||||
|
**Date:** January 7, 2026
|
||||||
|
**Purpose:** Determine the best model and threshold for recognizing logos that are not currently in the test set.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Executive Summary
|
||||||
|
|
||||||
|
| Model | Best Threshold | F1 Score | Precision | Recall | Recommended Use |
|
||||||
|
|-------|---------------|----------|-----------|--------|-----------------|
|
||||||
|
| **Image-Split Fine-tuned** | 0.70-0.75 | **67-68%** | 66-80% | 59-68% | Known logos (in reference set) |
|
||||||
|
| Baseline CLIP | 0.70 | 57-60% | 48-49% | 72-77% | Unknown logos (never seen before) |
|
||||||
|
| Logo-Split Fine-tuned | 0.76 | 56% | 49% | 64% | Not recommended |
|
||||||
|
| DINOv2 (small/large) | - | 29-30% | 22-32% | 28-43% | Not suitable |
|
||||||
|
|
||||||
|
**Winner: Image-Split Fine-tuned Model** at threshold **0.70-0.75**
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Detailed Model Comparison
|
||||||
|
|
||||||
|
### 1. Baseline CLIP (openai/clip-vit-large-patch14)
|
||||||
|
|
||||||
|
The pre-trained CLIP model without any fine-tuning.
|
||||||
|
|
||||||
|
**Threshold Performance:**
|
||||||
|
|
||||||
|
| Threshold | Precision | Recall | F1 |
|
||||||
|
|-----------|-----------|--------|-----|
|
||||||
|
| 0.70 | 47.9% | 71.8% | 57.5% |
|
||||||
|
| 0.80 | 33.0% | 63.1% | 43.4% |
|
||||||
|
| 0.85 | 26.9% | 43.4% | 33.2% |
|
||||||
|
| 0.90 | 54.9% | 22.8% | 32.2% |
|
||||||
|
|
||||||
|
**Similarity Distribution:**
|
||||||
|
- True Positive mean: 0.854 (range: 0.75-0.95)
|
||||||
|
- False Positive mean: 0.846 (range: 0.75-0.95)
|
||||||
|
- **Problem:** TP and FP distributions almost completely overlap
|
||||||
|
|
||||||
|
**Suggested optimal threshold:** 0.756 (predicted F1 = 67.1%)
|
||||||
|
|
||||||
|
**Strengths:**
|
||||||
|
- Good recall at low thresholds
|
||||||
|
- Works on completely unseen logos
|
||||||
|
- No training required
|
||||||
|
|
||||||
|
**Weaknesses:**
|
||||||
|
- Poor separation between correct and incorrect matches
|
||||||
|
- High false positive rate
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 2. Fine-tuned CLIP (Logo-Level Splits)
|
||||||
|
|
||||||
|
Trained with contrastive learning, tested on completely unseen logo brands.
|
||||||
|
|
||||||
|
**Threshold Performance:**
|
||||||
|
|
||||||
|
| Threshold | Precision | Recall | F1 |
|
||||||
|
|-----------|-----------|--------|-----|
|
||||||
|
| 0.70 | 25.9% | 67.1% | 37.4% |
|
||||||
|
| 0.76 | **49.1%** | 64.3% | **55.7%** |
|
||||||
|
| 0.82 | 75.7% | 41.4% | 53.5% |
|
||||||
|
| 0.86 | 88.6% | 28.1% | 42.7% |
|
||||||
|
|
||||||
|
**Similarity Distribution:**
|
||||||
|
- True Positive mean: 0.853
|
||||||
|
- False Positive mean: 0.787 (better separation than baseline)
|
||||||
|
- Missed logos mean: 0.711 (only 43.7% above 0.75)
|
||||||
|
|
||||||
|
**Suggested optimal threshold:** 0.82 (predicted F1 = 71.9%)
|
||||||
|
|
||||||
|
**Strengths:**
|
||||||
|
- Better TP/FP separation than baseline
|
||||||
|
- Very high precision at high thresholds (88.6% at t=0.86)
|
||||||
|
|
||||||
|
**Weaknesses:**
|
||||||
|
- Does not generalize well to unseen logo brands
|
||||||
|
- Many correct logos score below threshold (56% of missed logos below 0.75)
|
||||||
|
- Worse than baseline at threshold 0.70
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 3. Fine-tuned CLIP (Image-Level Splits) ⭐ BEST
|
||||||
|
|
||||||
|
Trained with contrastive learning, all logo brands seen during training (different images held out for testing).
|
||||||
|
|
||||||
|
**Threshold Performance:**
|
||||||
|
|
||||||
|
| Threshold | Precision | Recall | F1 |
|
||||||
|
|-----------|-----------|--------|-----|
|
||||||
|
| 0.65 | 56.9% | **75.9%** | 65.0% |
|
||||||
|
| 0.70 | 66.3% | 68.3% | **67.3%** |
|
||||||
|
| 0.75 | **79.9%** | 59.3% | **68.1%** |
|
||||||
|
| 0.80 | 83.7% | 52.8% | 64.8% |
|
||||||
|
| 0.85 | 92.4% | 42.8% | 58.5% |
|
||||||
|
| 0.90 | 98.9% | 24.7% | 39.5% |
|
||||||
|
|
||||||
|
**Similarity Distribution:**
|
||||||
|
- True Positive mean: 0.866 (higher than other models)
|
||||||
|
- False Positive mean: 0.807
|
||||||
|
- TP-FP gap: 0.059 (best separation)
|
||||||
|
- At t=0.75: 92 TP vs only 38 FP (excellent ratio)
|
||||||
|
|
||||||
|
**Suggested optimal threshold:** 0.755 (predicted F1 = 85.6%)
|
||||||
|
|
||||||
|
**Strengths:**
|
||||||
|
- Best overall F1 score (68.1% at t=0.75)
|
||||||
|
- Best precision at any threshold (79.9-98.9%)
|
||||||
|
- Excellent TP/FP ratio
|
||||||
|
- Highest true positive similarity scores
|
||||||
|
|
||||||
|
**Weaknesses:**
|
||||||
|
- Requires logos to be in the reference set during training
|
||||||
|
- May not generalize to completely novel logos
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 4. DINOv2 Models
|
||||||
|
|
||||||
|
Tested for comparison but significantly underperformed.
|
||||||
|
|
||||||
|
| Model | Precision | Recall | F1 |
|
||||||
|
|-------|-----------|--------|-----|
|
||||||
|
| DINOv2-small | 22.4% | 42.8% | 29.5% |
|
||||||
|
| DINOv2-large | 32.2% | 28.5% | 30.2% |
|
||||||
|
|
||||||
|
**Not recommended** for logo recognition tasks.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Recommendations
|
||||||
|
|
||||||
|
### For Logo Recognition of Known Logos (logos in your reference set)
|
||||||
|
|
||||||
|
**Use: Image-Split Fine-tuned Model**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Recommended configuration
|
||||||
|
python test_logo_detection.py \
|
||||||
|
-e models/logo_detection/clip_finetuned_image_split \
|
||||||
|
-t 0.70 \
|
||||||
|
--matching-method multi-ref \
|
||||||
|
--use-max-similarity
|
||||||
|
```
|
||||||
|
|
||||||
|
| Use Case | Threshold | Expected Performance |
|
||||||
|
|----------|-----------|---------------------|
|
||||||
|
| Balanced (recommended) | 0.70 | 66% precision, 68% recall, 67% F1 |
|
||||||
|
| High precision | 0.75 | 80% precision, 59% recall, 68% F1 |
|
||||||
|
| Very high precision | 0.80 | 84% precision, 53% recall, 65% F1 |
|
||||||
|
| Maximum precision | 0.85+ | 92%+ precision, <43% recall |
|
||||||
|
|
||||||
|
### For Logo Recognition of Unknown Logos (completely novel brands)
|
||||||
|
|
||||||
|
**Use: Baseline CLIP** (the fine-tuned models don't generalize well)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Recommended configuration
|
||||||
|
python test_logo_detection.py \
|
||||||
|
-e openai/clip-vit-large-patch14 \
|
||||||
|
-t 0.70 \
|
||||||
|
--matching-method multi-ref \
|
||||||
|
--use-max-similarity
|
||||||
|
```
|
||||||
|
|
||||||
|
Expected: ~48% precision, ~72% recall, ~58% F1
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Key Findings
|
||||||
|
|
||||||
|
### 1. Image-Level Splits Dramatically Improve Performance
|
||||||
|
|
||||||
|
The image-split fine-tuned model outperforms all others because:
|
||||||
|
- It learns brand-specific features during training
|
||||||
|
- Test images are different but from same brands
|
||||||
|
- Better represents real-world use where you have reference images for logos you want to detect
|
||||||
|
|
||||||
|
### 2. Logo-Level Splits Test True Generalization (but results are poor)
|
||||||
|
|
||||||
|
The logo-split model tests whether fine-tuning helps with completely unseen logos:
|
||||||
|
- Result: It doesn't help much (56% F1 vs 58% baseline)
|
||||||
|
- Contrastive learning doesn't transfer well to novel brands
|
||||||
|
- Use baseline CLIP for novel logo detection
|
||||||
|
|
||||||
|
### 3. Threshold Sweet Spot is 0.70-0.75
|
||||||
|
|
||||||
|
For all models, the optimal F1 occurs around threshold 0.70-0.75:
|
||||||
|
- Lower thresholds: Too many false positives
|
||||||
|
- Higher thresholds: Misses too many correct logos
|
||||||
|
- At 0.90+: Precision is high but recall drops below 25%
|
||||||
|
|
||||||
|
### 4. Precision-Recall Tradeoff
|
||||||
|
|
||||||
|
| Priority | Threshold | Tradeoff |
|
||||||
|
|----------|-----------|----------|
|
||||||
|
| Recall | 0.65-0.70 | More matches, more false positives |
|
||||||
|
| Balanced | 0.70-0.75 | Best F1 score |
|
||||||
|
| Precision | 0.75-0.80 | Fewer false positives, misses some matches |
|
||||||
|
| High Precision | 0.85+ | Very few false positives, misses many matches |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Conclusion
|
||||||
|
|
||||||
|
**For production use with known logos:**
|
||||||
|
- Use **Image-Split Fine-tuned Model** at **threshold 0.70-0.75**
|
||||||
|
- Expected F1: 67-68%, Precision: 66-80%
|
||||||
|
|
||||||
|
**For discovering unknown logos:**
|
||||||
|
- Use **Baseline CLIP** at **threshold 0.70**
|
||||||
|
- Expected F1: ~58%, Precision: ~48%
|
||||||
|
|
||||||
|
The image-split fine-tuning provides significant improvements (+8-10% F1) over baseline for known logos, but does not help with completely novel brands. For a production system, ensure all target logos are included in the training/reference set.
|
||||||
87
test_results/comparison_results/baseline_20260105_100740.txt
Normal file
87
test_results/comparison_results/baseline_20260105_100740.txt
Normal file
File diff suppressed because one or more lines are too long
@ -0,0 +1,29 @@
|
|||||||
|
============================================================
|
||||||
|
|
||||||
|
Test Parameters:
|
||||||
|
Logos: 50, Seed: 42, Threshold: 0.7
|
||||||
|
Method: multi-ref, Refs/logo: 3, Margin: 0.05
|
||||||
|
|
||||||
|
BASELINE (openai/clip-vit-large-patch14):
|
||||||
|
True Positives (correct matches): 101
|
||||||
|
False Positives (wrong matches): 104
|
||||||
|
False Negatives (missed logos): 156
|
||||||
|
Precision: 0.4927 (49.3%)
|
||||||
|
Recall: 0.4056 (40.6%)
|
||||||
|
F1 Score: 0.4449 (44.5%)
|
||||||
|
|
||||||
|
FINE-TUNED (models/logo_detection/clip_finetuned):
|
||||||
|
True Positives (correct matches): 164
|
||||||
|
False Positives (wrong matches): 414
|
||||||
|
False Negatives (missed logos): 115
|
||||||
|
Precision: 0.2837 (28.4%)
|
||||||
|
Recall: 0.6586 (65.9%)
|
||||||
|
F1 Score: 0.3966 (39.7%)
|
||||||
|
|
||||||
|
------------------------------------------------------------
|
||||||
|
F1 SCORE COMPARISON:
|
||||||
|
Baseline: 44.5%
|
||||||
|
Fine-tuned: 39.7%
|
||||||
|
------------------------------------------------------------
|
||||||
|
|
||||||
|
Full results saved to: comparison_results/
|
||||||
File diff suppressed because one or more lines are too long
124
test_results/comparison_results_clip_defaults_all_methods.txt
Normal file
124
test_results/comparison_results_clip_defaults_all_methods.txt
Normal file
@ -0,0 +1,124 @@
|
|||||||
|
Logo Detection Comparison Tests
|
||||||
|
================================
|
||||||
|
Date: Wed Dec 31 03:43:45 PM MST 2025
|
||||||
|
|
||||||
|
Common Parameters:
|
||||||
|
Reference logos: 20
|
||||||
|
Refs per logo: 10
|
||||||
|
Positive samples: 20
|
||||||
|
Negative samples: 100
|
||||||
|
Min matching refs: 3
|
||||||
|
Seed: 42
|
||||||
|
|
||||||
|
======================================================================
|
||||||
|
TEST: SIMPLE MATCHING
|
||||||
|
Method: Simple (all matches above threshold)
|
||||||
|
======================================================================
|
||||||
|
Date: 2025-12-31 16:02:25
|
||||||
|
|
||||||
|
Configuration:
|
||||||
|
Reference logos: 20
|
||||||
|
Refs per logo: 10
|
||||||
|
Total reference embeddings:189
|
||||||
|
Positive samples/logo: 20
|
||||||
|
Negative samples/logo: 100
|
||||||
|
Test images processed: 2355
|
||||||
|
CLIP threshold: 0.7
|
||||||
|
DETR threshold: 0.5
|
||||||
|
Random seed: 42
|
||||||
|
|
||||||
|
Results:
|
||||||
|
True Positives: 751
|
||||||
|
False Positives: 58221
|
||||||
|
False Negatives: 9
|
||||||
|
Total Expected: 369
|
||||||
|
|
||||||
|
Scores:
|
||||||
|
Precision: 0.0127 (1.3%)
|
||||||
|
Recall: 2.0352 (203.5%)
|
||||||
|
F1 Score: 0.0253 (2.5%)
|
||||||
|
|
||||||
|
======================================================================
|
||||||
|
TEST: MARGIN MATCHING
|
||||||
|
Method: Margin-based (margin=0.05)
|
||||||
|
======================================================================
|
||||||
|
Date: 2025-12-31 16:20:42
|
||||||
|
|
||||||
|
Configuration:
|
||||||
|
Reference logos: 20
|
||||||
|
Refs per logo: 10
|
||||||
|
Total reference embeddings:189
|
||||||
|
Positive samples/logo: 20
|
||||||
|
Negative samples/logo: 100
|
||||||
|
Test images processed: 2361
|
||||||
|
CLIP threshold: 0.7
|
||||||
|
DETR threshold: 0.5
|
||||||
|
Random seed: 42
|
||||||
|
|
||||||
|
Results:
|
||||||
|
True Positives: 60
|
||||||
|
False Positives: 26
|
||||||
|
False Negatives: 310
|
||||||
|
Total Expected: 369
|
||||||
|
|
||||||
|
Scores:
|
||||||
|
Precision: 0.6977 (69.8%)
|
||||||
|
Recall: 0.1626 (16.3%)
|
||||||
|
F1 Score: 0.2637 (26.4%)
|
||||||
|
|
||||||
|
======================================================================
|
||||||
|
TEST: MULTI-REF MATCHING
|
||||||
|
Method: Multi-ref (mean, min_refs=3, margin=0.05)
|
||||||
|
======================================================================
|
||||||
|
Date: 2025-12-31 16:38:59
|
||||||
|
|
||||||
|
Configuration:
|
||||||
|
Reference logos: 20
|
||||||
|
Refs per logo: 10
|
||||||
|
Total reference embeddings:189
|
||||||
|
Positive samples/logo: 20
|
||||||
|
Negative samples/logo: 100
|
||||||
|
Test images processed: 2352
|
||||||
|
CLIP threshold: 0.7
|
||||||
|
DETR threshold: 0.5
|
||||||
|
Random seed: 42
|
||||||
|
|
||||||
|
Results:
|
||||||
|
True Positives: 233
|
||||||
|
False Positives: 217
|
||||||
|
False Negatives: 170
|
||||||
|
Total Expected: 369
|
||||||
|
|
||||||
|
Scores:
|
||||||
|
Precision: 0.5178 (51.8%)
|
||||||
|
Recall: 0.6314 (63.1%)
|
||||||
|
F1 Score: 0.5690 (56.9%)
|
||||||
|
|
||||||
|
======================================================================
|
||||||
|
TEST: MULTI-REF MATCHING
|
||||||
|
Method: Multi-ref (max, min_refs=3, margin=0.05)
|
||||||
|
======================================================================
|
||||||
|
Date: 2025-12-31 16:56:49
|
||||||
|
|
||||||
|
Configuration:
|
||||||
|
Reference logos: 20
|
||||||
|
Refs per logo: 10
|
||||||
|
Total reference embeddings:189
|
||||||
|
Positive samples/logo: 20
|
||||||
|
Negative samples/logo: 100
|
||||||
|
Test images processed: 2350
|
||||||
|
CLIP threshold: 0.7
|
||||||
|
DETR threshold: 0.5
|
||||||
|
Random seed: 42
|
||||||
|
|
||||||
|
Results:
|
||||||
|
True Positives: 278
|
||||||
|
False Positives: 259
|
||||||
|
False Negatives: 136
|
||||||
|
Total Expected: 369
|
||||||
|
|
||||||
|
Scores:
|
||||||
|
Precision: 0.5177 (51.8%)
|
||||||
|
Recall: 0.7534 (75.3%)
|
||||||
|
F1 Score: 0.6137 (61.4%)
|
||||||
|
|
||||||
105
test_results/model_comparison_results.txt
Normal file
105
test_results/model_comparison_results.txt
Normal file
@ -0,0 +1,105 @@
|
|||||||
|
Embedding Model Comparison Tests
|
||||||
|
=================================
|
||||||
|
Date: Fri Jan 2 12:47:03 PM MST 2026
|
||||||
|
|
||||||
|
Common Parameters:
|
||||||
|
Matching method: multi-ref (max)
|
||||||
|
Reference logos: 20
|
||||||
|
Refs per logo: 10
|
||||||
|
Positive samples: 20
|
||||||
|
Negative samples: 100
|
||||||
|
Min matching refs: 3
|
||||||
|
Threshold: 0.70
|
||||||
|
Margin: 0.05
|
||||||
|
Seed: 42
|
||||||
|
|
||||||
|
======================================================================
|
||||||
|
TEST: MULTI-REF MATCHING
|
||||||
|
Model: openai/clip-vit-large-patch14
|
||||||
|
Method: Multi-ref (max, min_refs=3, margin=0.05)
|
||||||
|
======================================================================
|
||||||
|
Date: 2026-01-02 13:05:17
|
||||||
|
|
||||||
|
Configuration:
|
||||||
|
Embedding model: openai/clip-vit-large-patch14
|
||||||
|
Reference logos: 20
|
||||||
|
Refs per logo: 10
|
||||||
|
Total reference embeddings:189
|
||||||
|
Positive samples/logo: 20
|
||||||
|
Negative samples/logo: 100
|
||||||
|
Test images processed: 2355
|
||||||
|
Similarity threshold: 0.7
|
||||||
|
DETR threshold: 0.5
|
||||||
|
Random seed: 42
|
||||||
|
|
||||||
|
Results:
|
||||||
|
True Positives: 284
|
||||||
|
False Positives: 295
|
||||||
|
False Negatives: 124
|
||||||
|
Total Expected: 369
|
||||||
|
|
||||||
|
Scores:
|
||||||
|
Precision: 0.4905 (49.1%)
|
||||||
|
Recall: 0.7696 (77.0%)
|
||||||
|
F1 Score: 0.5992 (59.9%)
|
||||||
|
|
||||||
|
======================================================================
|
||||||
|
TEST: MULTI-REF MATCHING
|
||||||
|
Model: facebook/dinov2-small
|
||||||
|
Method: Multi-ref (max, min_refs=3, margin=0.05)
|
||||||
|
======================================================================
|
||||||
|
Date: 2026-01-02 13:19:01
|
||||||
|
|
||||||
|
Configuration:
|
||||||
|
Embedding model: facebook/dinov2-small
|
||||||
|
Reference logos: 20
|
||||||
|
Refs per logo: 10
|
||||||
|
Total reference embeddings:189
|
||||||
|
Positive samples/logo: 20
|
||||||
|
Negative samples/logo: 100
|
||||||
|
Test images processed: 2358
|
||||||
|
Similarity threshold: 0.7
|
||||||
|
DETR threshold: 0.5
|
||||||
|
Random seed: 42
|
||||||
|
|
||||||
|
Results:
|
||||||
|
True Positives: 158
|
||||||
|
False Positives: 546
|
||||||
|
False Negatives: 234
|
||||||
|
Total Expected: 369
|
||||||
|
|
||||||
|
Scores:
|
||||||
|
Precision: 0.2244 (22.4%)
|
||||||
|
Recall: 0.4282 (42.8%)
|
||||||
|
F1 Score: 0.2945 (29.5%)
|
||||||
|
|
||||||
|
======================================================================
|
||||||
|
TEST: MULTI-REF MATCHING
|
||||||
|
Model: facebook/dinov2-large
|
||||||
|
Method: Multi-ref (max, min_refs=3, margin=0.05)
|
||||||
|
======================================================================
|
||||||
|
Date: 2026-01-02 13:39:33
|
||||||
|
|
||||||
|
Configuration:
|
||||||
|
Embedding model: facebook/dinov2-large
|
||||||
|
Reference logos: 20
|
||||||
|
Refs per logo: 10
|
||||||
|
Total reference embeddings:189
|
||||||
|
Positive samples/logo: 20
|
||||||
|
Negative samples/logo: 100
|
||||||
|
Test images processed: 2355
|
||||||
|
Similarity threshold: 0.7
|
||||||
|
DETR threshold: 0.5
|
||||||
|
Random seed: 42
|
||||||
|
|
||||||
|
Results:
|
||||||
|
True Positives: 105
|
||||||
|
False Positives: 221
|
||||||
|
False Negatives: 277
|
||||||
|
Total Expected: 369
|
||||||
|
|
||||||
|
Scores:
|
||||||
|
Precision: 0.3221 (32.2%)
|
||||||
|
Recall: 0.2846 (28.5%)
|
||||||
|
F1 Score: 0.3022 (30.2%)
|
||||||
|
|
||||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@ -346,6 +346,131 @@ DINOv2 Small produces over 3x as many false positives as true positives, making
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
## Summary and Recommendations
|
||||||
|
|
||||||
|
This section synthesizes findings from all test runs to provide actionable recommendations for logo detection configuration and future improvements.
|
||||||
|
|
||||||
|
### Best Configuration
|
||||||
|
|
||||||
|
Based on all tests conducted, the optimal configuration is:
|
||||||
|
|
||||||
|
| Parameter | Recommended Value | Rationale |
|
||||||
|
|-----------|-------------------|-----------|
|
||||||
|
| **Embedding Model** | `openai/clip-vit-large-patch14` | 2x better F1 than DINOv2 alternatives |
|
||||||
|
| **Matching Method** | `multi-ref` with max similarity | Best F1 (59.9%) and recall (77.0%) |
|
||||||
|
| **Similarity Threshold** | 0.70 | Lower thresholds outperform higher ones |
|
||||||
|
| **Margin** | 0.05 | Minimal impact; keep low to avoid rejecting valid matches |
|
||||||
|
| **Min Matching Refs** | 3 | Provides better discrimination than threshold alone |
|
||||||
|
| **Refs Per Logo** | 10 | More references improve robustness |
|
||||||
|
| **DETR Threshold** | 0.50 | Standard detection confidence |
|
||||||
|
|
||||||
|
### Performance Expectations
|
||||||
|
|
||||||
|
With the recommended configuration:
|
||||||
|
|
||||||
|
| Metric | Expected Value | Interpretation |
|
||||||
|
|--------|----------------|----------------|
|
||||||
|
| Precision | ~49% | About half of detections are correct |
|
||||||
|
| Recall | ~77% | Finds most logos present in images |
|
||||||
|
| F1 Score | ~60% | Moderate overall accuracy |
|
||||||
|
| FP:TP Ratio | ~1:1 | Approximately equal true and false positives |
|
||||||
|
|
||||||
|
**Important**: These results indicate the system is suitable for applications that can tolerate a high false positive rate, such as:
|
||||||
|
- Initial screening with human review
|
||||||
|
- Flagging content for further analysis
|
||||||
|
- Low-stakes logo presence detection
|
||||||
|
|
||||||
|
The system is **not suitable** for high-precision applications without additional filtering or verification steps.
|
||||||
|
|
||||||
|
### Key Insights from Testing
|
||||||
|
|
||||||
|
#### What Works
|
||||||
|
|
||||||
|
1. **Multi-ref matching with max aggregation** consistently outperforms other methods
|
||||||
|
2. **Multiple references per logo** (10) provides robustness against logo variations
|
||||||
|
3. **min_matching_refs=3** is more effective at discrimination than threshold tuning
|
||||||
|
4. **CLIP embeddings** significantly outperform self-supervised alternatives (DINOv2)
|
||||||
|
|
||||||
|
#### What Doesn't Work
|
||||||
|
|
||||||
|
1. **Raising similarity threshold** paradoxically increases false positives in the 0.70-0.85 range
|
||||||
|
2. **Margin-only matching** fails with multiple references (same-logo refs compete)
|
||||||
|
3. **DINOv2 models** produce 2-3x worse results than CLIP
|
||||||
|
4. **Simple threshold-based matching** produces unacceptable 78:1 FP:TP ratio
|
||||||
|
|
||||||
|
#### Limitations
|
||||||
|
|
||||||
|
1. **~50% precision ceiling**: Even the best configuration produces nearly as many false positives as true positives
|
||||||
|
2. **No clean threshold separation**: CLIP's embedding space doesn't provide clear decision boundaries for logos
|
||||||
|
3. **General-purpose models**: Neither CLIP nor DINOv2 are optimized for fine-grained logo discrimination
|
||||||
|
4. **Pipeline dependencies**: Results depend heavily on DETR detection quality
|
||||||
|
|
||||||
|
### Recommendations for Future Improvements
|
||||||
|
|
||||||
|
#### Short-Term Improvements
|
||||||
|
|
||||||
|
| Improvement | Expected Impact | Effort |
|
||||||
|
|-------------|-----------------|--------|
|
||||||
|
| **Post-processing filters** | Reduce FP by 20-30% | Low |
|
||||||
|
| Add color histogram matching | Filter matches with wrong colors | |
|
||||||
|
| Add aspect ratio validation | Reject shape mismatches | |
|
||||||
|
| Add text detection | Filter if expected text is missing | |
|
||||||
|
| **Reference curation** | Improve TP by 10-20% | Low |
|
||||||
|
| Remove low-quality references | Reduce noise in ref embeddings | |
|
||||||
|
| Ensure diverse logo variants | Improve coverage | |
|
||||||
|
| **Ensemble scoring** | Improve F1 by 10-15% | Medium |
|
||||||
|
| Combine CLIP + color features | Multi-signal confidence | |
|
||||||
|
| Weighted voting across refs | More robust aggregation | |
|
||||||
|
|
||||||
|
#### Medium-Term Improvements
|
||||||
|
|
||||||
|
| Improvement | Expected Impact | Effort |
|
||||||
|
|-------------|-----------------|--------|
|
||||||
|
| **Fine-tune CLIP on logos** | Improve F1 by 20-40% | Medium |
|
||||||
|
| Contrastive training on logo pairs | Better embedding separation | |
|
||||||
|
| Use LogoDet-3K for training data | Domain-specific features | |
|
||||||
|
| **Alternative detection models** | Improve detection quality | Medium |
|
||||||
|
| Test YOLOv8 for logo detection | Faster, potentially more accurate | |
|
||||||
|
| Train custom detector on logo data | Better region proposals | |
|
||||||
|
| **Learned similarity metric** | Improve precision by 30-50% | Medium |
|
||||||
|
| Train siamese network for logo matching | Replace cosine similarity | |
|
||||||
|
| Learn logo-specific distance function | Better discrimination | |
|
||||||
|
|
||||||
|
#### Long-Term Improvements
|
||||||
|
|
||||||
|
| Improvement | Expected Impact | Effort |
|
||||||
|
|-------------|-----------------|--------|
|
||||||
|
| **End-to-end logo recognition model** | F1 > 85% | High |
|
||||||
|
| Single model for detection + recognition | Eliminate pipeline errors | |
|
||||||
|
| Train on large-scale logo dataset | Comprehensive coverage | |
|
||||||
|
| **Logo-specific foundation model** | F1 > 90% | High |
|
||||||
|
| Pre-train on millions of logo images | Domain expertise | |
|
||||||
|
| Fine-tune for specific brand sets | Production-ready accuracy | |
|
||||||
|
|
||||||
|
### Decision Framework
|
||||||
|
|
||||||
|
Use this framework to choose between precision and recall:
|
||||||
|
|
||||||
|
| Use Case | Priority | Recommended Adjustments |
|
||||||
|
|----------|----------|------------------------|
|
||||||
|
| **Content moderation** | High recall | Use defaults; accept FPs for human review |
|
||||||
|
| **Brand monitoring** | Balanced | Use defaults; filter obvious FPs |
|
||||||
|
| **Automated licensing** | High precision | Use threshold=0.90; accept low recall |
|
||||||
|
| **Search/discovery** | High recall | Lower threshold to 0.65; more refs |
|
||||||
|
|
||||||
|
### Conclusion
|
||||||
|
|
||||||
|
The current DETR + CLIP pipeline with multi-ref matching achieves moderate accuracy (~60% F1) that is suitable for screening applications but falls short of production requirements for automated decision-making. The fundamental limitation is that general-purpose vision models lack the fine-grained discrimination needed for logo recognition.
|
||||||
|
|
||||||
|
**To achieve production-quality accuracy (>85% F1), the system requires:**
|
||||||
|
1. A logo-specific embedding model (fine-tuned or trained from scratch)
|
||||||
|
2. Additional visual features beyond CLIP embeddings
|
||||||
|
3. Potentially an end-to-end architecture designed for logo recognition
|
||||||
|
|
||||||
|
The test framework established here provides the foundation for evaluating these future improvements systematically.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
## Test Run: [Next Test Name]
|
## Test Run: [Next Test Name]
|
||||||
|
|
||||||
*Results pending...*
|
*Results pending...*
|
||||||
@ -0,0 +1,20 @@
|
|||||||
|
============================================================
|
||||||
|
THRESHOLD OPTIMIZATION RESULTS
|
||||||
|
Model: finetuned (models/logo_detection/clip_finetuned)
|
||||||
|
============================================================
|
||||||
|
|
||||||
|
Threshold TP FP FN Prec Recall F1
|
||||||
|
--------------------------------------------------------------------
|
||||||
|
0.70 167 477 120 25.9% 67.1% 37.4%
|
||||||
|
0.72 158 339 116 31.8% 63.5% 42.4%
|
||||||
|
0.74 150 252 123 37.3% 60.2% 46.1%
|
||||||
|
0.76 160 166 119 49.1% 64.3% 55.7%
|
||||||
|
0.78 120 102 147 54.1% 48.2% 51.0%
|
||||||
|
0.80 110 73 151 60.1% 44.2% 50.9%
|
||||||
|
0.82 103 33 159 75.7% 41.4% 53.5%
|
||||||
|
0.84 74 18 180 80.4% 29.7% 43.4%
|
||||||
|
0.86 70 9 187 88.6% 28.1% 42.7%
|
||||||
|
--------------------------------------------------------------------
|
||||||
|
|
||||||
|
BEST THRESHOLD: 0.76 (F1 = 55.7%)
|
||||||
|
|
||||||
193
test_results/threshold_analysis/threshold_test_results.txt
Normal file
193
test_results/threshold_analysis/threshold_test_results.txt
Normal file
@ -0,0 +1,193 @@
|
|||||||
|
Threshold Optimization Tests
|
||||||
|
=============================
|
||||||
|
Date: Fri Jan 2 10:11:34 AM MST 2026
|
||||||
|
|
||||||
|
Common Parameters:
|
||||||
|
Matching method: multi-ref (max)
|
||||||
|
Reference logos: 20
|
||||||
|
Refs per logo: 10
|
||||||
|
Positive samples: 20
|
||||||
|
Negative samples: 100
|
||||||
|
Min matching refs: 3
|
||||||
|
Seed: 42
|
||||||
|
|
||||||
|
======================================================================
|
||||||
|
TEST: MULTI-REF MATCHING
|
||||||
|
Model: openai/clip-vit-large-patch14
|
||||||
|
Method: Multi-ref (max, min_refs=3, margin=0.05)
|
||||||
|
======================================================================
|
||||||
|
Date: 2026-01-02 10:29:26
|
||||||
|
|
||||||
|
Configuration:
|
||||||
|
Embedding model: openai/clip-vit-large-patch14
|
||||||
|
Reference logos: 20
|
||||||
|
Refs per logo: 10
|
||||||
|
  Total reference embeddings: 189
|
||||||
|
Positive samples/logo: 20
|
||||||
|
Negative samples/logo: 100
|
||||||
|
Test images processed: 2358
|
||||||
|
Similarity threshold: 0.7
|
||||||
|
DETR threshold: 0.5
|
||||||
|
Random seed: 42
|
||||||
|
|
||||||
|
Results:
|
||||||
|
True Positives: 265
|
||||||
|
False Positives: 288
|
||||||
|
False Negatives: 141
|
||||||
|
Total Expected: 369
|
||||||
|
|
||||||
|
Scores:
|
||||||
|
Precision: 0.4792 (47.9%)
|
||||||
|
Recall: 0.7182 (71.8%)
|
||||||
|
F1 Score: 0.5748 (57.5%)
|
||||||
|
|
||||||
|
======================================================================
|
||||||
|
TEST: MULTI-REF MATCHING
|
||||||
|
Model: openai/clip-vit-large-patch14
|
||||||
|
Method: Multi-ref (max, min_refs=3, margin=0.05)
|
||||||
|
======================================================================
|
||||||
|
Date: 2026-01-02 10:47:35
|
||||||
|
|
||||||
|
Configuration:
|
||||||
|
Embedding model: openai/clip-vit-large-patch14
|
||||||
|
Reference logos: 20
|
||||||
|
Refs per logo: 10
|
||||||
|
  Total reference embeddings: 189
|
||||||
|
Positive samples/logo: 20
|
||||||
|
Negative samples/logo: 100
|
||||||
|
Test images processed: 2348
|
||||||
|
Similarity threshold: 0.8
|
||||||
|
DETR threshold: 0.5
|
||||||
|
Random seed: 42
|
||||||
|
|
||||||
|
Results:
|
||||||
|
True Positives: 233
|
||||||
|
False Positives: 472
|
||||||
|
False Negatives: 165
|
||||||
|
Total Expected: 369
|
||||||
|
|
||||||
|
Scores:
|
||||||
|
Precision: 0.3305 (33.0%)
|
||||||
|
Recall: 0.6314 (63.1%)
|
||||||
|
F1 Score: 0.4339 (43.4%)
|
||||||
|
|
||||||
|
======================================================================
|
||||||
|
TEST: MULTI-REF MATCHING
|
||||||
|
Model: openai/clip-vit-large-patch14
|
||||||
|
Method: Multi-ref (max, min_refs=3, margin=0.1)
|
||||||
|
======================================================================
|
||||||
|
Date: 2026-01-02 11:05:34
|
||||||
|
|
||||||
|
Configuration:
|
||||||
|
Embedding model: openai/clip-vit-large-patch14
|
||||||
|
Reference logos: 20
|
||||||
|
Refs per logo: 10
|
||||||
|
  Total reference embeddings: 189
|
||||||
|
Positive samples/logo: 20
|
||||||
|
Negative samples/logo: 100
|
||||||
|
Test images processed: 2357
|
||||||
|
Similarity threshold: 0.8
|
||||||
|
DETR threshold: 0.5
|
||||||
|
Random seed: 42
|
||||||
|
|
||||||
|
Results:
|
||||||
|
True Positives: 187
|
||||||
|
False Positives: 375
|
||||||
|
False Negatives: 208
|
||||||
|
Total Expected: 369
|
||||||
|
|
||||||
|
Scores:
|
||||||
|
Precision: 0.3327 (33.3%)
|
||||||
|
Recall: 0.5068 (50.7%)
|
||||||
|
F1 Score: 0.4017 (40.2%)
|
||||||
|
|
||||||
|
======================================================================
|
||||||
|
TEST: MULTI-REF MATCHING
|
||||||
|
Model: openai/clip-vit-large-patch14
|
||||||
|
Method: Multi-ref (max, min_refs=3, margin=0.1)
|
||||||
|
======================================================================
|
||||||
|
Date: 2026-01-02 11:23:33
|
||||||
|
|
||||||
|
Configuration:
|
||||||
|
Embedding model: openai/clip-vit-large-patch14
|
||||||
|
Reference logos: 20
|
||||||
|
Refs per logo: 10
|
||||||
|
  Total reference embeddings: 189
|
||||||
|
Positive samples/logo: 20
|
||||||
|
Negative samples/logo: 100
|
||||||
|
Test images processed: 2356
|
||||||
|
Similarity threshold: 0.85
|
||||||
|
DETR threshold: 0.5
|
||||||
|
Random seed: 42
|
||||||
|
|
||||||
|
Results:
|
||||||
|
True Positives: 160
|
||||||
|
False Positives: 434
|
||||||
|
False Negatives: 223
|
||||||
|
Total Expected: 369
|
||||||
|
|
||||||
|
Scores:
|
||||||
|
Precision: 0.2694 (26.9%)
|
||||||
|
Recall: 0.4336 (43.4%)
|
||||||
|
F1 Score: 0.3323 (33.2%)
|
||||||
|
|
||||||
|
======================================================================
|
||||||
|
TEST: MULTI-REF MATCHING
|
||||||
|
Model: openai/clip-vit-large-patch14
|
||||||
|
Method: Multi-ref (max, min_refs=3, margin=0.15)
|
||||||
|
======================================================================
|
||||||
|
Date: 2026-01-02 11:41:47
|
||||||
|
|
||||||
|
Configuration:
|
||||||
|
Embedding model: openai/clip-vit-large-patch14
|
||||||
|
Reference logos: 20
|
||||||
|
Refs per logo: 10
|
||||||
|
  Total reference embeddings: 189
|
||||||
|
Positive samples/logo: 20
|
||||||
|
Negative samples/logo: 100
|
||||||
|
Test images processed: 2359
|
||||||
|
Similarity threshold: 0.85
|
||||||
|
DETR threshold: 0.5
|
||||||
|
Random seed: 42
|
||||||
|
|
||||||
|
Results:
|
||||||
|
True Positives: 163
|
||||||
|
False Positives: 410
|
||||||
|
False Negatives: 220
|
||||||
|
Total Expected: 369
|
||||||
|
|
||||||
|
Scores:
|
||||||
|
Precision: 0.2845 (28.4%)
|
||||||
|
Recall: 0.4417 (44.2%)
|
||||||
|
F1 Score: 0.3461 (34.6%)
|
||||||
|
|
||||||
|
======================================================================
|
||||||
|
TEST: MULTI-REF MATCHING
|
||||||
|
Model: openai/clip-vit-large-patch14
|
||||||
|
Method: Multi-ref (max, min_refs=3, margin=0.15)
|
||||||
|
======================================================================
|
||||||
|
Date: 2026-01-02 12:00:00
|
||||||
|
|
||||||
|
Configuration:
|
||||||
|
Embedding model: openai/clip-vit-large-patch14
|
||||||
|
Reference logos: 20
|
||||||
|
Refs per logo: 10
|
||||||
|
  Total reference embeddings: 189
|
||||||
|
Positive samples/logo: 20
|
||||||
|
Negative samples/logo: 100
|
||||||
|
Test images processed: 2363
|
||||||
|
Similarity threshold: 0.9
|
||||||
|
DETR threshold: 0.5
|
||||||
|
Random seed: 42
|
||||||
|
|
||||||
|
Results:
|
||||||
|
True Positives: 84
|
||||||
|
False Positives: 69
|
||||||
|
False Negatives: 288
|
||||||
|
Total Expected: 369
|
||||||
|
|
||||||
|
Scores:
|
||||||
|
Precision: 0.5490 (54.9%)
|
||||||
|
Recall: 0.2276 (22.8%)
|
||||||
|
F1 Score: 0.3218 (32.2%)
|
||||||
|
|
||||||
Reference in New Issue
Block a user