Compare commits


4 Commits

13 changed files with 2047 additions and 0 deletions

run_refs_per_logo_test.sh (executable file)

@@ -0,0 +1,132 @@
#!/bin/bash
#
# Test different numbers of reference logos per brand to find optimal setting.
# Uses baseline CLIP with multi-ref (max) matching method.
#
# Usage:
# ./run_refs_per_logo_test.sh
#
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
OUTPUT_FILE="${SCRIPT_DIR}/test_results/refs_per_logo_analysis.txt"
# Model - baseline CLIP (best for unknown logos)
MODEL="openai/clip-vit-large-patch14"
# Fixed parameters
NUM_LOGOS=20
POSITIVE_SAMPLES=20
NEGATIVE_SAMPLES=100
MIN_MATCHING_REFS=1
THRESHOLD=0.70
MARGIN=0.05
SEED=42
# Refs per logo values to test
REFS_TO_TEST="1 2 3 5 7 10 15 20"
# Create output directory if needed
mkdir -p "${SCRIPT_DIR}/test_results"
# Clear output file and write header
cat > "$OUTPUT_FILE" << EOF
Reference Logos Per Brand Optimization
======================================
Date: $(date)
Model: ${MODEL}
Method: multi-ref (max)
Fixed Parameters:
Number of logo brands: ${NUM_LOGOS}
Similarity threshold: ${THRESHOLD}
Margin: ${MARGIN}
Min matching refs: ${MIN_MATCHING_REFS}
Positive samples/logo: ${POSITIVE_SAMPLES}
Negative samples/logo: ${NEGATIVE_SAMPLES}
Seed: ${SEED}
Testing refs per logo: ${REFS_TO_TEST}
EOF
echo "Reference Logos Per Brand Optimization"
echo "======================================="
echo "Model: ${MODEL}"
echo "Testing refs per logo: ${REFS_TO_TEST}"
echo ""
# Results table header
echo "Results Summary:" >> "$OUTPUT_FILE"
echo "----------------" >> "$OUTPUT_FILE"
printf "%-12s %8s %8s %8s %8s %8s %8s\n" "Refs/Logo" "TP" "FP" "FN" "Prec" "Recall" "F1" >> "$OUTPUT_FILE"
echo "------------------------------------------------------------------------" >> "$OUTPUT_FILE"
# Track best result
BEST_F1=0
BEST_REFS=0
for REFS in ${REFS_TO_TEST}; do
echo "=== Testing refs_per_logo=${REFS} ==="
# Run test and capture output
OUTPUT=$(uv run python "$SCRIPT_DIR/test_logo_detection.py" \
--num-logos $NUM_LOGOS \
--refs-per-logo $REFS \
--positive-samples $POSITIVE_SAMPLES \
--negative-samples $NEGATIVE_SAMPLES \
--matching-method multi-ref \
--min-matching-refs $MIN_MATCHING_REFS \
--use-max-similarity \
--threshold $THRESHOLD \
--margin $MARGIN \
--seed $SEED \
--embedding-model "$MODEL" \
2>&1)
# Extract metrics
TP=$(echo "${OUTPUT}" | grep "True Positives" | grep -oE "[0-9]+" | head -1)
FP=$(echo "${OUTPUT}" | grep "False Positives" | grep -oE "[0-9]+" | head -1)
FN=$(echo "${OUTPUT}" | grep "False Negatives" | grep -oE "[0-9]+" | head -1)
PREC=$(echo "${OUTPUT}" | grep "Precision:" | grep -oE "[0-9]+\.[0-9]+%" | head -1)
RECALL=$(echo "${OUTPUT}" | grep "Recall:" | grep -oE "[0-9]+\.[0-9]+%" | head -1)
F1=$(echo "${OUTPUT}" | grep "F1 Score:" | grep -oE "[0-9]+\.[0-9]+%" | head -1)
# Print to console
echo " TP: ${TP}, FP: ${FP}, FN: ${FN}"
echo " Precision: ${PREC}, Recall: ${RECALL}, F1: ${F1}"
echo ""
# Add to results table
printf "%-12s %8s %8s %8s %8s %8s %8s\n" "${REFS}" "${TP}" "${FP}" "${FN}" "${PREC}" "${RECALL}" "${F1}" >> "$OUTPUT_FILE"
# Track best F1
F1_NUM=$(echo "${F1}" | tr -d '%')
if [ -n "$F1_NUM" ]; then
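# bc does the float comparison; if bc is unavailable, the fallback of 0
# keeps the current best rather than aborting the loop.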
BETTER=$(echo "${F1_NUM} > ${BEST_F1}" | bc -l 2>/dev/null || echo "0")
if [ "$BETTER" = "1" ]; then
BEST_F1="${F1_NUM}"
BEST_REFS="${REFS}"
fi
fi
# Also append full output for this test
echo "" >> "$OUTPUT_FILE"
echo "======================================================================" >> "$OUTPUT_FILE"
echo "DETAILED RESULTS: refs_per_logo=${REFS}" >> "$OUTPUT_FILE"
echo "======================================================================" >> "$OUTPUT_FILE"
echo "${OUTPUT}" | grep -A 50 "Configuration:" | head -30 >> "$OUTPUT_FILE"
echo "" >> "$OUTPUT_FILE"
done
# Summary
echo "------------------------------------------------------------------------" >> "$OUTPUT_FILE"
echo "" >> "$OUTPUT_FILE"
echo "OPTIMAL SETTING: refs_per_logo=${BEST_REFS} (F1 = ${BEST_F1}%)" >> "$OUTPUT_FILE"
echo "" >> "$OUTPUT_FILE"
echo "======================================="
echo "OPTIMAL: refs_per_logo=${BEST_REFS} (F1 = ${BEST_F1}%)"
echo "======================================="
echo ""
echo "Results saved to: $OUTPUT_FILE"

run_threshold_tests_image_split.sh (executable file)

@@ -0,0 +1,181 @@
#!/bin/bash
#
# Run logo detection tests with the image-split fine-tuned model.
# Tests various threshold and margin settings to find optimal parameters.
#
# Usage:
# ./run_threshold_tests_image_split.sh
#
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
OUTPUT_FILE="${SCRIPT_DIR}/threshold_test_results_image_split.txt"
# Model path
MODEL_PATH="models/logo_detection/clip_finetuned_image_split"
# Common parameters
NUM_LOGOS=20
REFS_PER_LOGO=10
POSITIVE_SAMPLES=20
NEGATIVE_SAMPLES=100
MIN_MATCHING_REFS=3
SEED=42
# Check if model exists
if [ ! -d "${SCRIPT_DIR}/${MODEL_PATH}" ]; then
echo "Error: Image-split model not found at ${SCRIPT_DIR}/${MODEL_PATH}"
echo "Train the model first with: python train_clip_logo.py --config configs/cloud_rtx4090_image_split.yaml"
exit 1
fi
# Clear output file and write header
echo "Threshold Optimization Tests - Image-Split Model" > "$OUTPUT_FILE"
echo "=================================================" >> "$OUTPUT_FILE"
echo "Date: $(date)" >> "$OUTPUT_FILE"
echo "" >> "$OUTPUT_FILE"
echo "Model: ${MODEL_PATH}" >> "$OUTPUT_FILE"
echo "" >> "$OUTPUT_FILE"
echo "Common Parameters:" >> "$OUTPUT_FILE"
echo " Matching method: multi-ref (max)" >> "$OUTPUT_FILE"
echo " Reference logos: $NUM_LOGOS" >> "$OUTPUT_FILE"
echo " Refs per logo: $REFS_PER_LOGO" >> "$OUTPUT_FILE"
echo " Positive samples: $POSITIVE_SAMPLES" >> "$OUTPUT_FILE"
echo " Negative samples: $NEGATIVE_SAMPLES" >> "$OUTPUT_FILE"
echo " Min matching refs: $MIN_MATCHING_REFS" >> "$OUTPUT_FILE"
echo " Seed: $SEED" >> "$OUTPUT_FILE"
echo "" >> "$OUTPUT_FILE"
echo "Running threshold optimization tests for image-split model..."
echo " Model: ${MODEL_PATH}"
echo " Matching method: multi-ref (max)"
echo " Reference logos: $NUM_LOGOS"
echo " Refs per logo: $REFS_PER_LOGO"
echo " Seed: $SEED"
echo ""
# Threshold/margin pairs to test. Lower thresholds are included because
# the image-split model may have a different similarity distribution.
TESTS="0.65:0.05 0.70:0.05 0.75:0.05 0.80:0.05 0.80:0.10 0.85:0.10 0.90:0.10"
TEST_NUM=1
for PAIR in $TESTS; do
THRESHOLD="${PAIR%%:*}"
MARGIN="${PAIR##*:}"
echo "=== Test ${TEST_NUM}: threshold=${THRESHOLD}, margin=${MARGIN} ==="
uv run python "$SCRIPT_DIR/test_logo_detection.py" \
--num-logos $NUM_LOGOS \
--refs-per-logo $REFS_PER_LOGO \
--positive-samples $POSITIVE_SAMPLES \
--negative-samples $NEGATIVE_SAMPLES \
--matching-method multi-ref \
--min-matching-refs $MIN_MATCHING_REFS \
--use-max-similarity \
--threshold "$THRESHOLD" \
--margin "$MARGIN" \
--seed $SEED \
--embedding-model "$MODEL_PATH" \
--output-file "$OUTPUT_FILE"
echo ""
TEST_NUM=$((TEST_NUM + 1))
done
echo "Results saved to: $OUTPUT_FILE"


@@ -0,0 +1,216 @@
# Logo Recognition Model Analysis
**Date:** January 7, 2026
**Purpose:** Determine the best model and threshold for recognizing logos that are not currently in the test set.
---
## Executive Summary
| Model | Best Threshold | F1 Score | Precision | Recall | Recommended Use |
|-------|---------------|----------|-----------|--------|-----------------|
| **Image-Split Fine-tuned** | 0.70-0.75 | **67-68%** | 66-80% | 59-68% | Known logos (in reference set) |
| Baseline CLIP | 0.70 | 57-60% | 48-49% | 72-77% | Unknown logos (never seen before) |
| Logo-Split Fine-tuned | 0.76 | 56% | 49% | 64% | Not recommended |
| DINOv2 (small/large) | - | 29-30% | 22-32% | 28-43% | Not suitable |
**Winner: Image-Split Fine-tuned Model** at threshold **0.70-0.75**
---
## Detailed Model Comparison
### 1. Baseline CLIP (openai/clip-vit-large-patch14)
The pre-trained CLIP model without any fine-tuning.
**Threshold Performance:**
| Threshold | Precision | Recall | F1 |
|-----------|-----------|--------|-----|
| 0.70 | 47.9% | 71.8% | 57.5% |
| 0.80 | 33.0% | 63.1% | 43.4% |
| 0.85 | 26.9% | 43.4% | 33.2% |
| 0.90 | 54.9% | 22.8% | 32.2% |
**Similarity Distribution:**
- True Positive mean: 0.854 (range: 0.75-0.95)
- False Positive mean: 0.846 (range: 0.75-0.95)
- **Problem:** TP and FP distributions almost completely overlap
**Suggested optimal threshold:** 0.756 (predicted F1 = 67.1%)
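Here "similarity" is the cosine between unit-normalized CLIP image embeddings of a DETR-detected region and a reference logo. A minimal sketch using the Hugging Face `transformers` API (the file paths are hypothetical):

```python
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

MODEL_ID = "openai/clip-vit-large-patch14"
model = CLIPModel.from_pretrained(MODEL_ID).eval()
processor = CLIPProcessor.from_pretrained(MODEL_ID)

def embed(path: str) -> torch.Tensor:
    """Unit-normalized CLIP image embedding."""
    image = Image.open(path).convert("RGB")
    inputs = processor(images=image, return_tensors="pt")
    with torch.no_grad():
        feats = model.get_image_features(**inputs)
    return feats / feats.norm(dim=-1, keepdim=True)

# Hypothetical paths: one reference logo, one detected crop.
ref = embed("refs/acme/logo_01.png")
crop = embed("crops/region_0042.png")
print(f"cosine similarity: {(ref @ crop.T).item():.3f}")  # compared against the 0.70 threshold
```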
**Strengths:**
- Good recall at low thresholds
- Works on completely unseen logos
- No training required
**Weaknesses:**
- Poor separation between correct and incorrect matches
- High false positive rate
---
### 2. Fine-tuned CLIP (Logo-Level Splits)
Trained with contrastive learning, tested on completely unseen logo brands.
**Threshold Performance:**
| Threshold | Precision | Recall | F1 |
|-----------|-----------|--------|-----|
| 0.70 | 25.9% | 67.1% | 37.4% |
| 0.76 | **49.1%** | 64.3% | **55.7%** |
| 0.82 | 75.7% | 41.4% | 53.5% |
| 0.86 | 88.6% | 28.1% | 42.7% |
**Similarity Distribution:**
- True Positive mean: 0.853
- False Positive mean: 0.787 (better separation than baseline)
- Missed logos mean: 0.711 (only 43.7% above 0.75)
**Suggested optimal threshold:** 0.82 (predicted F1 = 71.9%)
**Strengths:**
- Better TP/FP separation than baseline
- Very high precision at high thresholds (88.6% at t=0.86)
**Weaknesses:**
- Does not generalize well to unseen logo brands
- Many correct logos score below threshold (56% of missed logos below 0.75)
- Worse than baseline at threshold 0.70
---
### 3. Fine-tuned CLIP (Image-Level Splits) ⭐ BEST
Trained with contrastive learning; all logo brands were seen during training, with different images held out for testing.
**Threshold Performance:**
| Threshold | Precision | Recall | F1 |
|-----------|-----------|--------|-----|
| 0.65 | 56.9% | **75.9%** | 65.0% |
| 0.70 | 66.3% | 68.3% | **67.3%** |
| 0.75 | **79.9%** | 59.3% | **68.1%** |
| 0.80 | 83.7% | 52.8% | 64.8% |
| 0.85 | 92.4% | 42.8% | 58.5% |
| 0.90 | 98.9% | 24.7% | 39.5% |
**Similarity Distribution:**
- True Positive mean: 0.866 (higher than other models)
- False Positive mean: 0.807
- TP-FP gap: 0.059 (best separation)
- At t=0.75: 92 TP vs only 38 FP (excellent ratio)
**Suggested optimal threshold:** 0.755 (predicted F1 = 85.6%)
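The "suggested optimal threshold" figures in this document come from sweeping candidate thresholds over the observed similarity scores. A minimal sketch of such a sweep (the score arrays are assumed inputs, not repo code):

```python
import numpy as np

def best_threshold(pos_scores, neg_scores, grid=None):
    """Return (f1, threshold) over a threshold grid.
    pos_scores: similarities of correct (TP) matches;
    neg_scores: similarities of spurious (FP) matches."""
    if grid is None:
        grid = np.linspace(0.50, 0.95, 91)  # 0.005 steps
    best = (0.0, 0.0)
    for t in grid:
        tp = int((pos_scores >= t).sum())
        fp = int((neg_scores >= t).sum())
        fn = int((pos_scores < t).sum())
        prec = tp / (tp + fp) if tp + fp else 0.0
        rec = tp / (tp + fn) if tp + fn else 0.0
        f1 = 2 * prec * rec / (prec + rec) if prec + rec else 0.0
        best = max(best, (f1, float(t)))
    return best  # e.g. roughly (0.856, 0.755) for the image-split model
```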
**Strengths:**
- Best overall F1 score (68.1% at t=0.75)
- Best precision at any threshold (79.9-98.9%)
- Excellent TP/FP ratio
- Highest true positive similarity scores
**Weaknesses:**
- Requires logos to be in the reference set during training
- May not generalize to completely novel logos
---
### 4. DINOv2 Models
Tested for comparison but significantly underperformed.
| Model | Precision | Recall | F1 |
|-------|-----------|--------|-----|
| DINOv2-small | 22.4% | 42.8% | 29.5% |
| DINOv2-large | 32.2% | 28.5% | 30.2% |
**Not recommended** for logo recognition tasks.
---
## Recommendations
### For Logo Recognition of Known Logos (logos in your reference set)
**Use: Image-Split Fine-tuned Model**
```bash
# Recommended configuration
python test_logo_detection.py \
-e models/logo_detection/clip_finetuned_image_split \
-t 0.70 \
--matching-method multi-ref \
--use-max-similarity
```
| Use Case | Threshold | Expected Performance |
|----------|-----------|---------------------|
| Balanced (recommended) | 0.70 | 66% precision, 68% recall, 67% F1 |
| High precision | 0.75 | 80% precision, 59% recall, 68% F1 |
| Very high precision | 0.80 | 84% precision, 53% recall, 65% F1 |
| Maximum precision | 0.85+ | 92%+ precision, <43% recall |
### For Logo Recognition of Unknown Logos (completely novel brands)
**Use: Baseline CLIP** (the fine-tuned models don't generalize well)
```bash
# Recommended configuration
python test_logo_detection.py \
-e openai/clip-vit-large-patch14 \
-t 0.70 \
--matching-method multi-ref \
--use-max-similarity
```
Expected: ~48% precision, ~72% recall, ~58% F1
---
## Key Findings
### 1. Image-Level Splits Dramatically Improve Performance
The image-split fine-tuned model outperforms all others because:
- It learns brand-specific features during training
- Test images are held out, but they come from brands seen during training
- This better matches real-world use, where you have reference images for every logo you want to detect
### 2. Logo-Level Splits Test True Generalization (but results are poor)
The logo-split model tests whether fine-tuning helps with completely unseen logos:
- Result: It doesn't help much (56% F1 vs 58% baseline)
- Contrastive learning doesn't transfer well to novel brands
- Use baseline CLIP for novel logo detection
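A minimal sketch of the two split strategies, assuming the dataset is a `dict` mapping brand name to a list of image paths (the structure is an assumption, not taken from the repo):

```python
import random

def image_level_split(dataset, holdout=0.2, seed=42):
    """Every brand appears in training; a fraction of each brand's
    images is held out for testing (the 'image-split' setting)."""
    rng = random.Random(seed)
    train, test = {}, {}
    for brand, images in dataset.items():
        images = images[:]
        rng.shuffle(images)
        k = max(1, int(len(images) * holdout))
        test[brand], train[brand] = images[:k], images[k:]
    return train, test

def logo_level_split(dataset, holdout=0.2, seed=42):
    """Whole brands are held out; test brands are never seen in
    training (the 'logo-split' setting)."""
    rng = random.Random(seed)
    brands = sorted(dataset)
    rng.shuffle(brands)
    held_out = set(brands[: max(1, int(len(brands) * holdout))])
    train = {b: v for b, v in dataset.items() if b not in held_out}
    test = {b: v for b, v in dataset.items() if b in held_out}
    return train, test
```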
### 3. Threshold Sweet Spot is 0.70-0.75
For all models, the optimal F1 occurs around threshold 0.70-0.75:
- Lower thresholds: Too many false positives
- Higher thresholds: Misses too many correct logos
- At 0.90+: Precision is high but recall drops below 25%
### 4. Precision-Recall Tradeoff
| Priority | Threshold | Tradeoff |
|----------|-----------|----------|
| Recall | 0.65-0.70 | More matches, more false positives |
| Balanced | 0.70-0.75 | Best F1 score |
| Precision | 0.75-0.80 | Fewer false positives, misses some matches |
| High Precision | 0.85+ | Very few false positives, misses many matches |
---
## Conclusion
**For production use with known logos:**
- Use **Image-Split Fine-tuned Model** at **threshold 0.70-0.75**
- Expected F1: 67-68%, Precision: 66-80%
**For discovering unknown logos:**
- Use **Baseline CLIP** at **threshold 0.70**
- Expected F1: ~58%, Precision: ~48%
The image-split fine-tuning provides significant improvements (+8-10% F1) over baseline for known logos, but does not help with completely novel brands. For a production system, ensure all target logos are included in the training/reference set.

File diff suppressed because one or more lines are too long


@@ -0,0 +1,29 @@
============================================================
Test Parameters:
Logos: 50, Seed: 42, Threshold: 0.7
Method: multi-ref, Refs/logo: 3, Margin: 0.05
BASELINE (openai/clip-vit-large-patch14):
True Positives (correct matches): 101
False Positives (wrong matches): 104
False Negatives (missed logos): 156
Precision: 0.4927 (49.3%)
Recall: 0.4056 (40.6%)
F1 Score: 0.4449 (44.5%)
FINE-TUNED (models/logo_detection/clip_finetuned):
True Positives (correct matches): 164
False Positives (wrong matches): 414
False Negatives (missed logos): 115
Precision: 0.2837 (28.4%)
Recall: 0.6586 (65.9%)
F1 Score: 0.3966 (39.7%)
------------------------------------------------------------
F1 SCORE COMPARISON:
Baseline: 44.5%
Fine-tuned: 39.7%
------------------------------------------------------------
Full results saved to: comparison_results/

File diff suppressed because one or more lines are too long


@@ -0,0 +1,124 @@
Logo Detection Comparison Tests
================================
Date: Wed Dec 31 03:43:45 PM MST 2025
Common Parameters:
Reference logos: 20
Refs per logo: 10
Positive samples: 20
Negative samples: 100
Min matching refs: 3
Seed: 42
======================================================================
TEST: SIMPLE MATCHING
Method: Simple (all matches above threshold)
======================================================================
Date: 2025-12-31 16:02:25
Configuration:
Reference logos: 20
Refs per logo: 10
Total reference embeddings:189
Positive samples/logo: 20
Negative samples/logo: 100
Test images processed: 2355
CLIP threshold: 0.7
DETR threshold: 0.5
Random seed: 42
Results:
True Positives: 751
False Positives: 58221
False Negatives: 9
Total Expected: 369
Scores:
Precision: 0.0127 (1.3%)
Recall: 2.0352 (203.5%)
F1 Score: 0.0253 (2.5%)
======================================================================
TEST: MARGIN MATCHING
Method: Margin-based (margin=0.05)
======================================================================
Date: 2025-12-31 16:20:42
Configuration:
Reference logos: 20
Refs per logo: 10
Total reference embeddings:189
Positive samples/logo: 20
Negative samples/logo: 100
Test images processed: 2361
CLIP threshold: 0.7
DETR threshold: 0.5
Random seed: 42
Results:
True Positives: 60
False Positives: 26
False Negatives: 310
Total Expected: 369
Scores:
Precision: 0.6977 (69.8%)
Recall: 0.1626 (16.3%)
F1 Score: 0.2637 (26.4%)
======================================================================
TEST: MULTI-REF MATCHING
Method: Multi-ref (mean, min_refs=3, margin=0.05)
======================================================================
Date: 2025-12-31 16:38:59
Configuration:
Reference logos: 20
Refs per logo: 10
Total reference embeddings:189
Positive samples/logo: 20
Negative samples/logo: 100
Test images processed: 2352
CLIP threshold: 0.7
DETR threshold: 0.5
Random seed: 42
Results:
True Positives: 233
False Positives: 217
False Negatives: 170
Total Expected: 369
Scores:
Precision: 0.5178 (51.8%)
Recall: 0.6314 (63.1%)
F1 Score: 0.5690 (56.9%)
======================================================================
TEST: MULTI-REF MATCHING
Method: Multi-ref (max, min_refs=3, margin=0.05)
======================================================================
Date: 2025-12-31 16:56:49
Configuration:
Reference logos: 20
Refs per logo: 10
Total reference embeddings:189
Positive samples/logo: 20
Negative samples/logo: 100
Test images processed: 2350
CLIP threshold: 0.7
DETR threshold: 0.5
Random seed: 42
Results:
True Positives: 278
False Positives: 259
False Negatives: 136
Total Expected: 369
Scores:
Precision: 0.5177 (51.8%)
Recall: 0.7534 (75.3%)
F1 Score: 0.6137 (61.4%)


@@ -0,0 +1,105 @@
Embedding Model Comparison Tests
=================================
Date: Fri Jan 2 12:47:03 PM MST 2026
Common Parameters:
Matching method: multi-ref (max)
Reference logos: 20
Refs per logo: 10
Positive samples: 20
Negative samples: 100
Min matching refs: 3
Threshold: 0.70
Margin: 0.05
Seed: 42
======================================================================
TEST: MULTI-REF MATCHING
Model: openai/clip-vit-large-patch14
Method: Multi-ref (max, min_refs=3, margin=0.05)
======================================================================
Date: 2026-01-02 13:05:17
Configuration:
Embedding model: openai/clip-vit-large-patch14
Reference logos: 20
Refs per logo: 10
Total reference embeddings:189
Positive samples/logo: 20
Negative samples/logo: 100
Test images processed: 2355
Similarity threshold: 0.7
DETR threshold: 0.5
Random seed: 42
Results:
True Positives: 284
False Positives: 295
False Negatives: 124
Total Expected: 369
Scores:
Precision: 0.4905 (49.1%)
Recall: 0.7696 (77.0%)
F1 Score: 0.5992 (59.9%)
======================================================================
TEST: MULTI-REF MATCHING
Model: facebook/dinov2-small
Method: Multi-ref (max, min_refs=3, margin=0.05)
======================================================================
Date: 2026-01-02 13:19:01
Configuration:
Embedding model: facebook/dinov2-small
Reference logos: 20
Refs per logo: 10
Total reference embeddings:189
Positive samples/logo: 20
Negative samples/logo: 100
Test images processed: 2358
Similarity threshold: 0.7
DETR threshold: 0.5
Random seed: 42
Results:
True Positives: 158
False Positives: 546
False Negatives: 234
Total Expected: 369
Scores:
Precision: 0.2244 (22.4%)
Recall: 0.4282 (42.8%)
F1 Score: 0.2945 (29.5%)
======================================================================
TEST: MULTI-REF MATCHING
Model: facebook/dinov2-large
Method: Multi-ref (max, min_refs=3, margin=0.05)
======================================================================
Date: 2026-01-02 13:39:33
Configuration:
Embedding model: facebook/dinov2-large
Reference logos: 20
Refs per logo: 10
Total reference embeddings:189
Positive samples/logo: 20
Negative samples/logo: 100
Test images processed: 2355
Similarity threshold: 0.7
DETR threshold: 0.5
Random seed: 42
Results:
True Positives: 105
False Positives: 221
False Negatives: 277
Total Expected: 369
Scores:
Precision: 0.3221 (32.2%)
Recall: 0.2846 (28.5%)
F1 Score: 0.3022 (30.2%)

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long


@@ -346,6 +346,131 @@ DINOv2 Small produces over 3x as many false positives as true positives, making
---
## Summary and Recommendations
This section synthesizes findings from all test runs to provide actionable recommendations for logo detection configuration and future improvements.
### Best Configuration
Based on all tests conducted, the optimal configuration is:
| Parameter | Recommended Value | Rationale |
|-----------|-------------------|-----------|
| **Embedding Model** | `openai/clip-vit-large-patch14` | 2x better F1 than DINOv2 alternatives |
| **Matching Method** | `multi-ref` with max similarity | Best F1 (59.9%) and recall (77.0%) |
| **Similarity Threshold** | 0.70 | Lower thresholds outperform higher ones |
| **Margin** | 0.05 | Minimal impact; keep low to avoid rejecting valid matches |
| **Min Matching Refs** | 3 | Provides better discrimination than threshold alone |
| **Refs Per Logo** | 10 | More references improve robustness |
| **DETR Threshold** | 0.50 | Standard detection confidence |
### Performance Expectations
With the recommended configuration:
| Metric | Expected Value | Interpretation |
|--------|----------------|----------------|
| Precision | ~49% | About half of detections are correct |
| Recall | ~77% | Finds most logos present in images |
| F1 Score | ~60% | Moderate overall accuracy |
| FP:TP Ratio | ~1:1 | Approximately equal true and false positives |
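For reference, F1 is the harmonic mean of precision and recall: F1 = 2PR / (P + R). With P ≈ 0.49 and R ≈ 0.77, F1 = 2(0.49)(0.77) / 1.26 ≈ 0.60, which is where the ~60% figure above comes from.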
**Important**: These results indicate the system is suitable for applications that can tolerate a high false positive rate, such as:
- Initial screening with human review
- Flagging content for further analysis
- Low-stakes logo presence detection
The system is **not suitable** for high-precision applications without additional filtering or verification steps.
### Key Insights from Testing
#### What Works
1. **Multi-ref matching with max aggregation** consistently outperforms other methods (see the sketch after this list)
2. **Multiple references per logo** (10) provides robustness against logo variations
3. **min_matching_refs=3** is more effective at discrimination than threshold tuning
4. **CLIP embeddings** significantly outperform self-supervised alternatives (DINOv2)
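Since `test_logo_detection.py` is not included in this change set, the following is a plausible reconstruction of "multi-ref (max, min_refs=3, margin=0.05)" matching, not the repo's actual code; the margin interpretation (top brand must beat the runner-up) and all names are assumptions. Embeddings are unit-normalized NumPy vectors, with each brand's references stacked into a matrix:

```python
import numpy as np

def match_brand(region_emb, ref_embs, threshold=0.70, min_matching_refs=3):
    """A region matches a brand when at least `min_matching_refs` of the
    brand's reference embeddings clear `threshold`; the brand's score is
    the max similarity across its references."""
    sims = ref_embs @ region_emb  # cosine similarities, shape (n_refs,)
    if (sims >= threshold).sum() < min_matching_refs:
        return None
    return float(sims.max())

def best_match(region_emb, brands, threshold=0.70,
               min_matching_refs=3, margin=0.05):
    """Score every brand; accept the top one only if it beats the
    runner-up by `margin` (assumed reading of the margin test)."""
    scores = {b: s for b, refs in brands.items()
              if (s := match_brand(region_emb, refs, threshold,
                                   min_matching_refs)) is not None}
    if not scores:
        return None
    ranked = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
    if len(ranked) > 1 and ranked[0][1] - ranked[1][1] < margin:
        return None  # ambiguous between two brands
    return ranked[0]  # (brand, score)
```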
#### What Doesn't Work
1. **Raising similarity threshold** paradoxically increases false positives in the 0.70-0.85 range
2. **Margin-only matching** fails with multiple references (same-logo refs compete)
3. **DINOv2 models** produce 2-3x worse results than CLIP
4. **Simple threshold-based matching** produces unacceptable 78:1 FP:TP ratio
#### Limitations
1. **~50% precision ceiling**: Even the best configuration produces nearly as many false positives as true positives
2. **No clean threshold separation**: CLIP's embedding space doesn't provide clear decision boundaries for logos
3. **General-purpose models**: Neither CLIP nor DINOv2 are optimized for fine-grained logo discrimination
4. **Pipeline dependencies**: Results depend heavily on DETR detection quality
### Recommendations for Future Improvements
#### Short-Term Improvements
| Improvement | Expected Impact | Effort |
|-------------|-----------------|--------|
| **Post-processing filters** | Reduce FP by 20-30% | Low |
| Add color histogram matching | Filter matches with wrong colors (sketched after this table) | |
| Add aspect ratio validation | Reject shape mismatches | |
| Add text detection | Filter if expected text is missing | |
| **Reference curation** | Improve TP by 10-20% | Low |
| Remove low-quality references | Reduce noise in ref embeddings | |
| Ensure diverse logo variants | Improve coverage | |
| **Ensemble scoring** | Improve F1 by 10-15% | Medium |
| Combine CLIP + color features | Multi-signal confidence | |
| Weighted voting across refs | More robust aggregation | |
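As an illustration of the color-histogram idea in the first group above, a hypothetical post-processing filter that could veto CLIP matches whose colors disagree; this is a sketch of the proposal, not existing code:

```python
import numpy as np
from PIL import Image

def hist_similarity(path_a, path_b, bins=16):
    """Histogram intersection of coarse RGB histograms, in [0, 1].
    A match whose score falls below some cutoff (to be tuned) would
    be rejected even if its CLIP similarity cleared the threshold."""
    def hist(path):
        arr = np.asarray(Image.open(path).convert("RGB").resize((64, 64)))
        h, _ = np.histogramdd(arr.reshape(-1, 3),
                              bins=(bins,) * 3, range=((0, 256),) * 3)
        h = h.ravel()
        return h / h.sum()
    a, b = hist(path_a), hist(path_b)
    return float(np.minimum(a, b).sum())
```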
#### Medium-Term Improvements
| Improvement | Expected Impact | Effort |
|-------------|-----------------|--------|
| **Fine-tune CLIP on logos** | Improve F1 by 20-40% | Medium |
| Contrastive training on logo pairs | Better embedding separation | |
| Use LogoDet-3K for training data | Domain-specific features | |
| **Alternative detection models** | Improve detection quality | Medium |
| Test YOLOv8 for logo detection | Faster, potentially more accurate | |
| Train custom detector on logo data | Better region proposals | |
| **Learned similarity metric** | Improve precision by 30-50% | Medium |
| Train siamese network for logo matching | Replace cosine similarity | |
| Learn logo-specific distance function | Better discrimination | |
#### Long-Term Improvements
| Improvement | Expected Impact | Effort |
|-------------|-----------------|--------|
| **End-to-end logo recognition model** | F1 > 85% | High |
| Single model for detection + recognition | Eliminate pipeline errors | |
| Train on large-scale logo dataset | Comprehensive coverage | |
| **Logo-specific foundation model** | F1 > 90% | High |
| Pre-train on millions of logo images | Domain expertise | |
| Fine-tune for specific brand sets | Production-ready accuracy | |
### Decision Framework
Use this framework to choose between precision and recall:
| Use Case | Priority | Recommended Adjustments |
|----------|----------|------------------------|
| **Content moderation** | High recall | Use defaults; accept FPs for human review |
| **Brand monitoring** | Balanced | Use defaults; filter obvious FPs |
| **Automated licensing** | High precision | Use threshold=0.90; accept low recall |
| **Search/discovery** | High recall | Lower threshold to 0.65; more refs |
### Conclusion
The current DETR + CLIP pipeline with multi-ref matching achieves moderate accuracy (~60% F1) that is suitable for screening applications but falls short of production requirements for automated decision-making. The fundamental limitation is that general-purpose vision models lack the fine-grained discrimination needed for logo recognition.
**To achieve production-quality accuracy (>85% F1), the system requires:**
1. A logo-specific embedding model (fine-tuned or trained from scratch)
2. Additional visual features beyond CLIP embeddings
3. Potentially an end-to-end architecture designed for logo recognition
The test framework established here provides the foundation for evaluating these future improvements systematically.
---
## Test Run: [Next Test Name]
*Results pending...*


@@ -0,0 +1,20 @@
============================================================
THRESHOLD OPTIMIZATION RESULTS
Model: finetuned (models/logo_detection/clip_finetuned)
============================================================
Threshold TP FP FN Prec Recall F1
--------------------------------------------------------------------
0.70 167 477 120 25.9% 67.1% 37.4%
0.72 158 339 116 31.8% 63.5% 42.4%
0.74 150 252 123 37.3% 60.2% 46.1%
0.76 160 166 119 49.1% 64.3% 55.7%
0.78 120 102 147 54.1% 48.2% 51.0%
0.80 110 73 151 60.1% 44.2% 50.9%
0.82 103 33 159 75.7% 41.4% 53.5%
0.84 74 18 180 80.4% 29.7% 43.4%
0.86 70 9 187 88.6% 28.1% 42.7%
--------------------------------------------------------------------
BEST THRESHOLD: 0.76 (F1 = 55.7%)


@@ -0,0 +1,193 @@
Threshold Optimization Tests
=============================
Date: Fri Jan 2 10:11:34 AM MST 2026
Common Parameters:
Matching method: multi-ref (max)
Reference logos: 20
Refs per logo: 10
Positive samples: 20
Negative samples: 100
Min matching refs: 3
Seed: 42
======================================================================
TEST: MULTI-REF MATCHING
Model: openai/clip-vit-large-patch14
Method: Multi-ref (max, min_refs=3, margin=0.05)
======================================================================
Date: 2026-01-02 10:29:26
Configuration:
Embedding model: openai/clip-vit-large-patch14
Reference logos: 20
Refs per logo: 10
Total reference embeddings:189
Positive samples/logo: 20
Negative samples/logo: 100
Test images processed: 2358
Similarity threshold: 0.7
DETR threshold: 0.5
Random seed: 42
Results:
True Positives: 265
False Positives: 288
False Negatives: 141
Total Expected: 369
Scores:
Precision: 0.4792 (47.9%)
Recall: 0.7182 (71.8%)
F1 Score: 0.5748 (57.5%)
======================================================================
TEST: MULTI-REF MATCHING
Model: openai/clip-vit-large-patch14
Method: Multi-ref (max, min_refs=3, margin=0.05)
======================================================================
Date: 2026-01-02 10:47:35
Configuration:
Embedding model: openai/clip-vit-large-patch14
Reference logos: 20
Refs per logo: 10
Total reference embeddings:189
Positive samples/logo: 20
Negative samples/logo: 100
Test images processed: 2348
Similarity threshold: 0.8
DETR threshold: 0.5
Random seed: 42
Results:
True Positives: 233
False Positives: 472
False Negatives: 165
Total Expected: 369
Scores:
Precision: 0.3305 (33.0%)
Recall: 0.6314 (63.1%)
F1 Score: 0.4339 (43.4%)
======================================================================
TEST: MULTI-REF MATCHING
Model: openai/clip-vit-large-patch14
Method: Multi-ref (max, min_refs=3, margin=0.1)
======================================================================
Date: 2026-01-02 11:05:34
Configuration:
Embedding model: openai/clip-vit-large-patch14
Reference logos: 20
Refs per logo: 10
Total reference embeddings:189
Positive samples/logo: 20
Negative samples/logo: 100
Test images processed: 2357
Similarity threshold: 0.8
DETR threshold: 0.5
Random seed: 42
Results:
True Positives: 187
False Positives: 375
False Negatives: 208
Total Expected: 369
Scores:
Precision: 0.3327 (33.3%)
Recall: 0.5068 (50.7%)
F1 Score: 0.4017 (40.2%)
======================================================================
TEST: MULTI-REF MATCHING
Model: openai/clip-vit-large-patch14
Method: Multi-ref (max, min_refs=3, margin=0.1)
======================================================================
Date: 2026-01-02 11:23:33
Configuration:
Embedding model: openai/clip-vit-large-patch14
Reference logos: 20
Refs per logo: 10
Total reference embeddings:189
Positive samples/logo: 20
Negative samples/logo: 100
Test images processed: 2356
Similarity threshold: 0.85
DETR threshold: 0.5
Random seed: 42
Results:
True Positives: 160
False Positives: 434
False Negatives: 223
Total Expected: 369
Scores:
Precision: 0.2694 (26.9%)
Recall: 0.4336 (43.4%)
F1 Score: 0.3323 (33.2%)
======================================================================
TEST: MULTI-REF MATCHING
Model: openai/clip-vit-large-patch14
Method: Multi-ref (max, min_refs=3, margin=0.15)
======================================================================
Date: 2026-01-02 11:41:47
Configuration:
Embedding model: openai/clip-vit-large-patch14
Reference logos: 20
Refs per logo: 10
Total reference embeddings:189
Positive samples/logo: 20
Negative samples/logo: 100
Test images processed: 2359
Similarity threshold: 0.85
DETR threshold: 0.5
Random seed: 42
Results:
True Positives: 163
False Positives: 410
False Negatives: 220
Total Expected: 369
Scores:
Precision: 0.2845 (28.4%)
Recall: 0.4417 (44.2%)
F1 Score: 0.3461 (34.6%)
======================================================================
TEST: MULTI-REF MATCHING
Model: openai/clip-vit-large-patch14
Method: Multi-ref (max, min_refs=3, margin=0.15)
======================================================================
Date: 2026-01-02 12:00:00
Configuration:
Embedding model: openai/clip-vit-large-patch14
Reference logos: 20
Refs per logo: 10
Total reference embeddings:189
Positive samples/logo: 20
Negative samples/logo: 100
Test images processed: 2363
Similarity threshold: 0.9
DETR threshold: 0.5
Random seed: 42
Results:
True Positives: 84
False Positives: 69
False Negatives: 288
Total Expected: 369
Scores:
Precision: 0.5490 (54.9%)
Recall: 0.2276 (22.8%)
F1 Score: 0.3218 (32.2%)