Initial commit: Jersey detection test suite

Test scripts and utilities for evaluating vision-language models
on jersey number detection using a llama.cpp server.
2026-01-20 13:37:01 -07:00
commit 8706edcd13
14 changed files with 3080 additions and 0 deletions

93
README.md Normal file

@@ -0,0 +1,93 @@
# Jersey Detection Testing
This project contains test scripts, results, and utilities for evaluating vision-language models on jersey number detection tasks using llama.cpp.
## Directory Structure
```
jersey_test/
├── scan_utils/
│ ├── jersey_detection.py # Core detection class using VLM
│ └── llama_cpp_client.py # Client for llama.cpp server
├── docs/
│ ├── JERSEY_DETECTION_MODEL_ANALYSIS.md # Model comparison results
│ └── LLAMA_SWAP_SETUP.md # Server setup instructions
├── test_images/ # Place test images here
├── test_images_output/ # Output directory for annotated images
├── test_jersey_detection.py # Main test runner
├── analyze_jersey_results.py # Results analysis script
├── test_all_models.sh # Batch testing shell script
├── jersey_prompt.txt # Basic detection prompt
├── jersey_prompt_with_confidence.txt # Prompt with confidence scoring
└── jersey_detection_results.jsonl # Historical test results
```
## Prerequisites
- Python 3.10+
- llama.cpp server running with a vision-language model
- Test images with ground truth encoded in filenames
## Test Image Naming Convention
Test images should follow this naming pattern to encode ground truth:
```
prefix-number1-number2-number3.jpg
```
Example: `game1-23-45-7.jpg` contains jerseys with numbers 23, 45, and 7.
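The convention is straightforward to parse. The sketch below shows one way to recover ground truth from a filename; it is illustrative only, since the helper name and the assumption that every hyphen-separated token after the prefix is a jersey number are not taken from the actual test runner.
```python
from pathlib import Path

def ground_truth_from_filename(image_path: str) -> list[str]:
    """Extract expected jersey numbers from a filename like 'game1-23-45-7.jpg'."""
    stem = Path(image_path).stem      # "game1-23-45-7"
    parts = stem.split("-")           # ["game1", "23", "45", "7"]
    # Keep numbers as strings to match the "jersey_number" field returned by the models.
    return [p for p in parts[1:] if p.isdigit()]

assert ground_truth_from_filename("game1-23-45-7.jpg") == ["23", "45", "7"]
```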
## Running Tests
### Single Model Test
```bash
python test_jersey_detection.py \
--images-dir ./test_images \
--prompt-file jersey_prompt_with_confidence.txt \
--server-url http://localhost:8080 \
--resize 1024 \
--output jersey_detection_results.jsonl
```
### Batch Testing All Models
```bash
./test_all_models.sh
```
Edit variables at the top of the script to configure:
- `IMAGES_DIR` - test images directory
- `PROMPT_FILE` - prompt file to use
- `SERVER_URL` - llama.cpp/llama-swap server URL
- `LLAMA_SWAP_CONFIG` - path to llama-swap config for model list
### Analyzing Results
```bash
python analyze_jersey_results.py jersey_detection_results.jsonl
```
Options:
- `--csv output.csv` - Export results to CSV
- `--filter-model "model_name"` - Filter by model name
## Historical Results
The `jersey_detection_results.jsonl` file contains results from 6 test runs:
| Model | F1 Score | Avg Time/Image | Avg Confidence |
|-------|----------|----------------|----------------|
| qwen2.5-vl-7b | 72.9% | - | - |
| gemma-3-27b | 72.1% | 18.1s | 87.1 |
| Mistral-Small-3.2-24B (Q4) | - | 14.2s | 92.1 |
| Kimi-VL-A3B-Thinking | - | 29.1s | 88.9 |
See `docs/JERSEY_DETECTION_MODEL_ANALYSIS.md` for detailed analysis.
## Key Findings
1. **Top Recommendation**: qwen2.5-vl-7b (72.9% F1 score)
2. **Best Confidence Calibration**: gemma-3-27b
3. **Speed Champion**: gemma-3-4b (7.9s/img, 63.8% F1)
4. Confidence threshold of 85+ recommended for filtering uncertain detections

663
analyze_jersey_results.py Executable file

@@ -0,0 +1,663 @@
#!/usr/bin/env python3
"""
Analyze jersey detection test results and compare model performance.
Usage:
python analyze_jersey_results.py [results_file]
python analyze_jersey_results.py [results_file] --csv output.csv
python analyze_jersey_results.py [results_file] --csv-only output.csv
Arguments:
results_file: Path to the results file (default: jersey_detection_results.jsonl)
--csv: Also export results to CSV file
--csv-only: Export to CSV only, skip analysis display
"""
import argparse
import csv
import json
import sys
from pathlib import Path
from typing import List, Dict, Any
from datetime import datetime
def load_results(results_file: str) -> List[Dict[str, Any]]:
"""Load test results from a JSON Lines file."""
results = []
try:
with open(results_file, 'r') as f:
for line in f:
line = line.strip()
if line:
results.append(json.loads(line))
return results
except FileNotFoundError:
print(f"Error: Results file not found: {results_file}")
sys.exit(1)
except json.JSONDecodeError as e:
print(f"Error: Invalid JSON in results file: {e}")
sys.exit(1)
def calculate_confidence_stdev(conf_stats: Dict[str, Any]) -> tuple:
"""
Calculate standard deviation of confidence scores from distribution.
Returns:
Tuple of (stdev, quality_rating)
quality_rating: "Excellent", "Good", "Fair", "Poor", or "N/A"
"""
if not conf_stats or 'distribution' not in conf_stats:
return None, "N/A"
dist = conf_stats['distribution']
# Reconstruct approximate confidence values from buckets
# Use midpoint of each bucket
values = []
bucket_midpoints = {
'90-100': 95,
'70-89': 79.5,
'50-69': 59.5,
'30-49': 39.5,
'0-29': 14.5
}
for bucket, count in dist.items():
midpoint = bucket_midpoints.get(bucket, 50)
values.extend([midpoint] * count)
if len(values) < 2:
return None, "N/A"
# Calculate standard deviation
import math
mean = sum(values) / len(values)
variance = sum((x - mean) ** 2 for x in values) / len(values)
stdev = math.sqrt(variance)
# Assign quality rating based on StDev
if stdev < 5:
quality = "Poor"
elif stdev < 10:
quality = "Fair"
elif stdev < 15:
quality = "Good"
else:
quality = "Excel" # Shortened for table
return stdev, quality
def print_ascii_comparison_table(results: List[Dict[str, Any]]):
"""Print a detailed ASCII comparison table of all test runs."""
if not results:
print("No results to display.")
return
print("=" * 280)
print("DETAILED MODEL COMPARISON TABLE")
print("=" * 280)
print()
print("Confidence Quality: Excellent (>15), Good (10-15), Fair (5-10), Poor (<5)")
print("Confidence Calibration: Conf✓ = avg confidence on correct detections, Conf✗ = avg confidence on incorrect detections")
print()
# Table headers with ground truth and confidence calibration columns
print("" + "" * 22 + "" + "" * 10 + "" + "" * 8 + "" + "" * 8 + "" + "" * 8 + "" + "" * 10 + "" + "" * 10 + "" + "" * 10 + "" + "" * 8 + "" + "" * 8 + "" + "" * 12 + "" + "" * 10 + "" + "" * 10 + "" + "" * 10 + "" + "" * 10 + "" + "" * 10 + "" + "" * 21 + "")
print("{:<20}{:^8}{:^6}{:^6}{:^6}{:^8}{:^8}{:^8}{:^6}{:^6}{:^10}{:^8}{:^8}{:^8}{:^8}{:^8}{:^19}".format(
"Model", "Prompt", "Images", "Valid", "Hallu", "Empty%", "Hallu%", "AvgTime", "Resize", "Conf?", "Conf Qual", "Prec%", "Recall%", "F1%", "Conf✓", "Conf✗", "Date"
))
print("" + "" * 22 + "" + "" * 10 + "" + "" * 8 + "" + "" * 8 + "" + "" * 8 + "" + "" * 10 + "" + "" * 10 + "" + "" * 10 + "" + "" * 8 + "" + "" * 8 + "" + "" * 12 + "" + "" * 10 + "" + "" * 10 + "" + "" * 10 + "" + "" * 10 + "" + "" * 10 + "" + "" * 21 + "")
# Data rows
for i, result in enumerate(results):
model = result.get('model_name', 'unknown')[:20]
prompt = Path(result.get('prompt_file', 'unknown')).stem[:8]
total_images = result.get('total_images', 0)
valid_jerseys = result.get('total_valid_jerseys', 0)
hallucinated = result.get('total_hallucinated', 0)
total_detections = valid_jerseys + hallucinated
empty_pct = (result.get('images_without_jerseys', 0) / total_images * 100) if total_images > 0 else 0
hallu_pct = (hallucinated / total_detections * 100) if total_detections > 0 else 0
avg_time = result.get('avg_processing_time', 0)
# Calculate confidence quality
conf_stats = result.get('confidence_stats')
has_conf = 'Yes' if conf_stats else 'No'
stdev, quality = calculate_confidence_stdev(conf_stats)
# Format confidence quality display
if stdev is not None:
conf_qual_str = f"{quality} ({stdev:.1f})"
else:
conf_qual_str = "N/A"
# Ground truth metrics
gt = result.get('ground_truth', {})
precision = gt.get('overall_precision', 0) * 100
recall = gt.get('overall_recall', 0) * 100
f1 = gt.get('overall_f1', 0) * 100
# Confidence calibration
conf_correct = gt.get('avg_confidence_correct')
conf_incorrect = gt.get('avg_confidence_incorrect')
conf_correct_str = f"{conf_correct:.1f}" if conf_correct is not None else "N/A"
conf_incorrect_str = f"{conf_incorrect:.1f}" if conf_incorrect is not None else "N/A"
resize_max = result.get('resize_max')
resize_str = f"{resize_max}px" if resize_max else "No"
timestamp = datetime.fromisoformat(result.get('timestamp', '')).strftime('%Y-%m-%d %H:%M')
print("{:<20}{:>8}{:>6}{:>6}{:>6}{:>7.1f}%{:>7.1f}%{:>7.2f}s │ {:>6}{:>6}{:>10}{:>7.1f}%{:>7.1f}%{:>7.1f}%{:>8}{:>8}{:>19}".format(
model, prompt, total_images, valid_jerseys, hallucinated, empty_pct, hallu_pct, avg_time, resize_str, has_conf, conf_qual_str, precision, recall, f1, conf_correct_str, conf_incorrect_str, timestamp
))
# Bottom border
print("" + "" * 22 + "" + "" * 10 + "" + "" * 8 + "" + "" * 8 + "" + "" * 8 + "" + "" * 10 + "" + "" * 10 + "" + "" * 10 + "" + "" * 8 + "" + "" * 8 + "" + "" * 12 + "" + "" * 10 + "" + "" * 10 + "" + "" * 10 + "" + "" * 10 + "" + "" * 10 + "" + "" * 21 + "")
print()
def print_comparison_table(results: List[Dict[str, Any]]):
"""Print a simple comparison table of all test runs."""
if not results:
print("No results to display.")
return
print("=" * 140)
print("MODEL COMPARISON TABLE")
print("=" * 140)
print()
# Header
header = f"{'Model':<25} {'Prompt':<30} {'Images':<8} {'Valid':<8} {'Hallu':<8} {'Empty%':<9} {'AvgTime':<9} {'Resize':<8} {'Conf?':<7} {'Date':<20}"
print(header)
print("-" * 150)
# Data rows
for result in results:
model = result.get('model_name', 'unknown')[:24]
prompt = Path(result.get('prompt_file', 'unknown')).stem[:29]
total_images = result.get('total_images', 0)
valid_jerseys = result.get('total_valid_jerseys', 0)
hallucinated = result.get('total_hallucinated', 0)
empty_pct = (result.get('images_without_jerseys', 0) / total_images * 100) if total_images > 0 else 0
avg_time = result.get('avg_processing_time', 0)
has_conf = 'Yes' if result.get('confidence_stats') else 'No'
resize_max = result.get('resize_max')
resize_str = f"{resize_max}px" if resize_max else "No"
timestamp = datetime.fromisoformat(result.get('timestamp', '')).strftime('%Y-%m-%d %H:%M:%S')
row = f"{model:<25} {prompt:<30} {total_images:<8} {valid_jerseys:<8} {hallucinated:<8} {empty_pct:<8.1f}% {avg_time:<8.2f}s {resize_str:<8} {has_conf:<7} {timestamp:<20}"
print(row)
print()
def print_model_performance_chart(results: List[Dict[str, Any]]):
"""Print a performance chart showing key metrics for each model."""
if not results:
return
print("=" * 140)
print("MODEL PERFORMANCE CHART")
print("=" * 140)
print()
# Group results by model
models = {}
for result in results:
model_name = result.get('model_name', 'unknown')
if model_name not in models:
models[model_name] = []
models[model_name].append(result)
# Calculate aggregate statistics for each model
for model_name, model_results in models.items():
print(f"\n{model_name}")
print("-" * 100)
total_runs = len(model_results)
total_images = sum(r.get('total_images', 0) for r in model_results)
total_valid = sum(r.get('total_valid_jerseys', 0) for r in model_results)
total_hallu = sum(r.get('total_hallucinated', 0) for r in model_results)
avg_empty_pct = sum((r.get('images_without_jerseys', 0) / r.get('total_images', 1) * 100) for r in model_results) / total_runs if total_runs > 0 else 0
avg_time = sum(r.get('avg_processing_time', 0) for r in model_results) / total_runs if total_runs > 0 else 0
# Check if any runs have confidence stats
has_confidence = any(r.get('confidence_stats') for r in model_results)
# Check resize status
resize_enabled = any(r.get('resize_enabled', False) for r in model_results)
resize_max_values = [r.get('resize_max') for r in model_results if r.get('resize_max')]
resize_info = f"{resize_max_values[0]}px" if resize_max_values else "Disabled"
print(f" Total test runs: {total_runs}")
print(f" Total images processed: {total_images}")
print(f" Total valid detections: {total_valid}")
print(f" Total hallucinations: {total_hallu}")
print(f" Average empty response rate: {avg_empty_pct:.1f}%")
print(f" Average processing time: {avg_time:.2f}s/image")
print(f" Resize: {resize_info}")
print(f" Confidence support: {'Yes' if has_confidence else 'No'}")
# Show hallucination rate
if total_valid + total_hallu > 0:
hallu_rate = (total_hallu / (total_valid + total_hallu) * 100)
print(f" Hallucination rate: {hallu_rate:.1f}%")
# Visual bar
bar_length = int(hallu_rate / 2) # Scale to max 50 chars
bar = '█' * bar_length
print(f" Hallucination chart: {bar} ({hallu_rate:.1f}%)")
# Ground truth performance
gt_runs = [r for r in model_results if r.get('ground_truth')]
if gt_runs:
avg_precision = sum(r['ground_truth'].get('overall_precision', 0) for r in gt_runs) / len(gt_runs)
avg_recall = sum(r['ground_truth'].get('overall_recall', 0) for r in gt_runs) / len(gt_runs)
avg_f1 = sum(r['ground_truth'].get('overall_f1', 0) for r in gt_runs) / len(gt_runs)
total_expected = sum(r['ground_truth'].get('total_expected', 0) for r in gt_runs)
total_tp = sum(r['ground_truth'].get('total_true_positives', 0) for r in gt_runs)
total_fp = sum(r['ground_truth'].get('total_false_positives', 0) for r in gt_runs)
total_fn = sum(r['ground_truth'].get('total_false_negatives', 0) for r in gt_runs)
print(f"\n Ground truth performance:")
print(f" Total expected jerseys: {total_expected}")
print(f" True positives: {total_tp}")
print(f" False positives: {total_fp}")
print(f" False negatives: {total_fn}")
print(f" Average Precision: {avg_precision:.1%}")
print(f" Average Recall: {avg_recall:.1%}")
print(f" Average F1 Score: {avg_f1:.1%}")
# Visual F1 bar
bar_length = int(avg_f1 * 50) # Scale to max 50 chars
bar = '█' * bar_length
print(f" F1 Score chart: {bar} ({avg_f1:.1%})")
# Confidence calibration
conf_correct_vals = [r['ground_truth'].get('avg_confidence_correct') for r in gt_runs if r['ground_truth'].get('avg_confidence_correct') is not None]
conf_incorrect_vals = [r['ground_truth'].get('avg_confidence_incorrect') for r in gt_runs if r['ground_truth'].get('avg_confidence_incorrect') is not None]
if conf_correct_vals or conf_incorrect_vals:
print(f"\n Confidence calibration:")
if conf_correct_vals:
avg_conf_correct = sum(conf_correct_vals) / len(conf_correct_vals)
print(f" Avg confidence (correct detections): {avg_conf_correct:.2f}")
if conf_incorrect_vals:
avg_conf_incorrect = sum(conf_incorrect_vals) / len(conf_incorrect_vals)
print(f" Avg confidence (incorrect detections): {avg_conf_incorrect:.2f}")
if conf_correct_vals and conf_incorrect_vals:
diff = sum(conf_correct_vals) / len(conf_correct_vals) - sum(conf_incorrect_vals) / len(conf_incorrect_vals)
if diff > 0:
print(f" Confidence difference: +{diff:.2f} (good calibration)")
else:
print(f" Confidence difference: {diff:.2f} (⚠ poor calibration)")
# Confidence distribution if available
if has_confidence:
print(f"\n Confidence distribution (across all runs):")
all_dist = {'90-100': 0, '70-89': 0, '50-69': 0, '30-49': 0, '0-29': 0}
total_conf_count = 0
for result in model_results:
conf_stats = result.get('confidence_stats')
if conf_stats and 'distribution' in conf_stats:
for bucket, count in conf_stats['distribution'].items():
all_dist[bucket] += count
total_conf_count += count
if total_conf_count > 0:
for bucket, count in all_dist.items():
pct = (count / total_conf_count * 100) if total_conf_count > 0 else 0
bar_length = int(pct / 2)
bar = '█' * bar_length
print(f" {bucket}: {count:4d} ({pct:5.1f}%) {bar}")
print()
def print_best_performers(results: List[Dict[str, Any]]):
"""Print summary of best performing models."""
if not results:
return
print("=" * 140)
print("BEST PERFORMERS")
print("=" * 140)
print()
# Group by model and calculate averages
models = {}
for result in results:
model_name = result.get('model_name', 'unknown')
if model_name not in models:
models[model_name] = {
'runs': 0,
'total_hallu': 0,
'total_detections': 0,
'avg_time': [],
'empty_capable': []
}
models[model_name]['runs'] += 1
models[model_name]['total_hallu'] += result.get('total_hallucinated', 0)
models[model_name]['total_detections'] += result.get('total_valid_jerseys', 0) + result.get('total_hallucinated', 0)
models[model_name]['avg_time'].append(result.get('avg_processing_time', 0))
models[model_name]['empty_capable'].append(result.get('empty_response_capable', False))
# Calculate scores
model_scores = []
for model_name, stats in models.items():
hallu_rate = (stats['total_hallu'] / stats['total_detections'] * 100) if stats['total_detections'] > 0 else 0
avg_time = sum(stats['avg_time']) / len(stats['avg_time']) if stats['avg_time'] else 0
empty_capable = any(stats['empty_capable'])
model_scores.append({
'model': model_name,
'hallu_rate': hallu_rate,
'avg_time': avg_time,
'empty_capable': empty_capable,
'runs': stats['runs']
})
# Sort by hallucination rate (lower is better)
model_scores.sort(key=lambda x: x['hallu_rate'])
print("Lowest hallucination rate:")
for i, score in enumerate(model_scores[:3], 1):
capable = "" if score['empty_capable'] else ""
print(f" {i}. {score['model']}: {score['hallu_rate']:.1f}% (empty capable: {capable}, avg time: {score['avg_time']:.2f}s)")
print()
# Sort by speed (lower is better)
model_scores.sort(key=lambda x: x['avg_time'])
print("Fastest processing:")
for i, score in enumerate(model_scores[:3], 1):
capable = "" if score['empty_capable'] else ""
print(f" {i}. {score['model']}: {score['avg_time']:.2f}s/image (hallu rate: {score['hallu_rate']:.1f}%, empty capable: {capable})")
print()
# Models with empty response capability
empty_models = [s for s in model_scores if s['empty_capable']]
print(f"Models with empty response capability: {len(empty_models)}/{len(model_scores)}")
for score in empty_models:
print(f" - {score['model']}")
print()
# Best F1 scores (ground truth accuracy)
models_with_gt = {}
for result in results:
if result.get('ground_truth'):
model_name = result.get('model_name', 'unknown')
if model_name not in models_with_gt:
models_with_gt[model_name] = {
'f1_scores': [],
'precision_scores': [],
'recall_scores': []
}
gt = result['ground_truth']
models_with_gt[model_name]['f1_scores'].append(gt.get('overall_f1', 0))
models_with_gt[model_name]['precision_scores'].append(gt.get('overall_precision', 0))
models_with_gt[model_name]['recall_scores'].append(gt.get('overall_recall', 0))
if models_with_gt:
gt_scores = []
for model_name, stats in models_with_gt.items():
avg_f1 = sum(stats['f1_scores']) / len(stats['f1_scores']) if stats['f1_scores'] else 0
avg_precision = sum(stats['precision_scores']) / len(stats['precision_scores']) if stats['precision_scores'] else 0
avg_recall = sum(stats['recall_scores']) / len(stats['recall_scores']) if stats['recall_scores'] else 0
gt_scores.append({
'model': model_name,
'avg_f1': avg_f1,
'avg_precision': avg_precision,
'avg_recall': avg_recall
})
# Sort by F1 score (higher is better)
gt_scores.sort(key=lambda x: x['avg_f1'], reverse=True)
print("Highest ground truth F1 scores:")
for i, score in enumerate(gt_scores[:3], 1):
print(f" {i}. {score['model']}: F1={score['avg_f1']:.1%} (Precision={score['avg_precision']:.1%}, Recall={score['avg_recall']:.1%})")
print()
def export_to_csv(results: List[Dict[str, Any]], csv_file: str):
"""Export results to CSV file for spreadsheet import."""
if not results:
print("No results to export.")
return
try:
with open(csv_file, 'w', newline='') as f:
# Define CSV columns
fieldnames = [
'timestamp',
'model_name',
'model_tag',
'prompt_file',
'prompt_length',
'total_images',
'images_with_jerseys',
'images_without_jerseys',
'images_with_errors',
'total_raw_detections',
'total_valid_jerseys',
'total_hallucinated',
'hallucination_rate_pct',
'empty_response_rate_pct',
'avg_processing_time',
'total_processing_time',
'resize_enabled',
'resize_max',
'images_resized',
'has_confidence',
'confidence_avg',
'confidence_min',
'confidence_max',
'confidence_count',
'confidence_stdev',
'confidence_quality',
'conf_90_100',
'conf_70_89',
'conf_50_69',
'conf_30_49',
'conf_0_29',
# Ground truth columns
'gt_total_expected',
'gt_total_true_positives',
'gt_total_false_positives',
'gt_total_false_negatives',
'gt_overall_precision',
'gt_overall_recall',
'gt_overall_f1',
'gt_avg_precision',
'gt_avg_recall',
'gt_avg_f1',
# Confidence calibration
'gt_avg_confidence_correct',
'gt_avg_confidence_incorrect',
'gt_confidence_correct_count',
'gt_confidence_incorrect_count'
]
writer = csv.DictWriter(f, fieldnames=fieldnames)
writer.writeheader()
# Write data rows
for result in results:
# Calculate derived values
total_images = result.get('total_images', 0)
valid_jerseys = result.get('total_valid_jerseys', 0)
hallucinated = result.get('total_hallucinated', 0)
total_detections = valid_jerseys + hallucinated
hallu_rate = (hallucinated / total_detections * 100) if total_detections > 0 else 0
empty_rate = (result.get('images_without_jerseys', 0) / total_images * 100) if total_images > 0 else 0
# Extract confidence stats
conf_stats = result.get('confidence_stats')
has_confidence = conf_stats is not None
conf_avg = conf_stats.get('avg', '') if conf_stats else ''
conf_min = conf_stats.get('min', '') if conf_stats else ''
conf_max = conf_stats.get('max', '') if conf_stats else ''
conf_count = conf_stats.get('count', '') if conf_stats else ''
# Calculate confidence standard deviation and quality
conf_stdev, conf_quality = calculate_confidence_stdev(conf_stats)
# Extract confidence distribution
conf_dist = conf_stats.get('distribution', {}) if conf_stats else {}
conf_90_100 = conf_dist.get('90-100', '')
conf_70_89 = conf_dist.get('70-89', '')
conf_50_69 = conf_dist.get('50-69', '')
conf_30_49 = conf_dist.get('30-49', '')
conf_0_29 = conf_dist.get('0-29', '')
# Extract ground truth stats
gt = result.get('ground_truth', {})
gt_total_expected = gt.get('total_expected', '')
gt_total_tp = gt.get('total_true_positives', '')
gt_total_fp = gt.get('total_false_positives', '')
gt_total_fn = gt.get('total_false_negatives', '')
gt_overall_precision = gt.get('overall_precision', '')
gt_overall_recall = gt.get('overall_recall', '')
gt_overall_f1 = gt.get('overall_f1', '')
gt_avg_precision = gt.get('avg_precision', '')
gt_avg_recall = gt.get('avg_recall', '')
gt_avg_f1 = gt.get('avg_f1', '')
gt_avg_conf_correct = gt.get('avg_confidence_correct', '')
gt_avg_conf_incorrect = gt.get('avg_confidence_incorrect', '')
gt_conf_correct_count = gt.get('confidence_correct_count', '')
gt_conf_incorrect_count = gt.get('confidence_incorrect_count', '')
row = {
'timestamp': result.get('timestamp', ''),
'model_name': result.get('model_name', ''),
'model_tag': result.get('model_tag', ''),
'prompt_file': result.get('prompt_file', ''),
'prompt_length': result.get('prompt_length', ''),
'total_images': total_images,
'images_with_jerseys': result.get('images_with_jerseys', ''),
'images_without_jerseys': result.get('images_without_jerseys', ''),
'images_with_errors': result.get('images_with_errors', ''),
'total_raw_detections': result.get('total_raw_detections', ''),
'total_valid_jerseys': valid_jerseys,
'total_hallucinated': hallucinated,
'hallucination_rate_pct': f"{hallu_rate:.2f}",
'empty_response_rate_pct': f"{empty_rate:.2f}",
'avg_processing_time': f"{result.get('avg_processing_time', 0):.4f}",
'total_processing_time': f"{result.get('total_processing_time', 0):.2f}",
'resize_enabled': result.get('resize_enabled', False),
'resize_max': result.get('resize_max', ''),
'images_resized': result.get('images_resized', ''),
'has_confidence': has_confidence,
'confidence_avg': f"{conf_avg:.2f}" if conf_avg != '' else '',
'confidence_min': conf_min,
'confidence_max': conf_max,
'confidence_count': conf_count,
'confidence_stdev': f"{conf_stdev:.2f}" if conf_stdev is not None else '',
'confidence_quality': conf_quality if conf_quality != 'N/A' else '',
'conf_90_100': conf_90_100,
'conf_70_89': conf_70_89,
'conf_50_69': conf_50_69,
'conf_30_49': conf_30_49,
'conf_0_29': conf_0_29,
# Ground truth data
'gt_total_expected': gt_total_expected,
'gt_total_true_positives': gt_total_tp,
'gt_total_false_positives': gt_total_fp,
'gt_total_false_negatives': gt_total_fn,
'gt_overall_precision': f"{gt_overall_precision:.4f}" if gt_overall_precision != '' else '',
'gt_overall_recall': f"{gt_overall_recall:.4f}" if gt_overall_recall != '' else '',
'gt_overall_f1': f"{gt_overall_f1:.4f}" if gt_overall_f1 != '' else '',
'gt_avg_precision': f"{gt_avg_precision:.4f}" if gt_avg_precision != '' else '',
'gt_avg_recall': f"{gt_avg_recall:.4f}" if gt_avg_recall != '' else '',
'gt_avg_f1': f"{gt_avg_f1:.4f}" if gt_avg_f1 != '' else '',
'gt_avg_confidence_correct': f"{gt_avg_conf_correct:.2f}" if gt_avg_conf_correct != '' else '',
'gt_avg_confidence_incorrect': f"{gt_avg_conf_incorrect:.2f}" if gt_avg_conf_incorrect != '' else '',
'gt_confidence_correct_count': gt_conf_correct_count,
'gt_confidence_incorrect_count': gt_conf_incorrect_count
}
writer.writerow(row)
print(f"✓ Results exported to CSV: {csv_file}")
print(f" Rows: {len(results)}")
print(f" Columns: {len(fieldnames)}")
except Exception as e:
print(f"❌ Failed to export to CSV: {e}")
sys.exit(1)
def main():
"""Main entry point for the analysis script."""
parser = argparse.ArgumentParser(
description='Analyze jersey detection test results',
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
# Show analysis
python analyze_jersey_results.py
# Show analysis and export to CSV
python analyze_jersey_results.py --csv results.csv
# Export to CSV only (no analysis display)
python analyze_jersey_results.py --csv-only results.csv
# Analyze custom results file
python analyze_jersey_results.py custom_results.jsonl --csv custom.csv
"""
)
parser.add_argument('results_file', nargs='?', default='jersey_detection_results.jsonl',
help='Path to results file (default: jersey_detection_results.jsonl)')
parser.add_argument('--csv', metavar='FILE', dest='csv_file',
help='Export results to CSV file (in addition to showing analysis)')
parser.add_argument('--csv-only', metavar='FILE', dest='csv_only',
help='Export to CSV file only, skip analysis display')
args = parser.parse_args()
# Check if file exists
if not Path(args.results_file).exists():
print(f"Error: Results file not found: {args.results_file}")
print(f"Run some tests first with test_jersey_detection.py to generate results.")
sys.exit(1)
# Load results
results = load_results(args.results_file)
if not results:
print(f"No results found in {args.results_file}")
sys.exit(0)
print(f"Loaded {len(results)} test run(s) from {args.results_file}\n")
# Handle CSV-only mode
if args.csv_only:
export_to_csv(results, args.csv_only)
return
# Print analyses (unless CSV-only mode)
print_ascii_comparison_table(results)
print_model_performance_chart(results)
print_best_performers(results)
# Export to CSV if requested
if args.csv_file:
print()
export_to_csv(results, args.csv_file)
if __name__ == '__main__':
main()

296
docs/JERSEY_DETECTION_MODEL_ANALYSIS.md Normal file

@@ -0,0 +1,296 @@
# Jersey Detection Model Analysis Report
**Date:** October 22, 2025
**Models Tested:** 8 vision-language models
**Test Images:** 194 images with known jersey numbers
**Purpose:** Determine the best model for automated jersey number detection in sports photography
---
## Executive Summary
After comprehensive testing of 8 different AI models on 194 sports images with known jersey numbers, we recommend **qwen2.5-vl-7b** as the best overall model for jersey detection, with **gemma-3-27b** as a close second choice depending on specific needs.
### Key Findings:
1. **Best Overall Performance**: qwen2.5-vl-7b achieves the highest accuracy (72.9% F1 score)
2. **Confidence Scores Are Useful**: 7 out of 8 models show reliable confidence calibration, meaning higher confidence scores correlate with correct detections
3. **Speed vs Accuracy Trade-off**: The most accurate models take 13-21 seconds per image; faster models sacrifice significant accuracy
---
## Model Performance Comparison
### Top 3 Recommended Models
| Rank | Model | Accuracy (F1) | Speed | Correct Detections | False Alarms | Confidence Reliability |
|------|-------|---------------|-------|--------------------|--------------|-----------------------|
| 🥇 1 | qwen2.5-vl-7b | 72.9% | 13.4s | 328 / 436 (75%) | 136 | Good |
| 🥈 2 | gemma-3-27b | 72.1% | 20.9s | 343 / 462 (74%) | 147 | Very Good (+6.0) |
| 🥉 3 | gemma-3-12b | 69.8% | 18.9s | 322 / 462 (70%) | 139 | Good (+3.1) |
### Complete Results Table
| Model | Accuracy (F1 Score) | Correct Detections | False Alarms | Missed Jerseys | Speed (sec/image) | Confidence Calibration |
|-------|--------------------|--------------------|--------------|----------------|-------------------|------------------------|
| **qwen2.5-vl-7b** | **72.9%** ⭐ | 328 / 436 | 136 | 108 | 13.4 | +0.5 (Good) |
| **gemma-3-27b** | **72.1%** | 343 / 462 | 147 | 119 | 20.9 | +6.0 (Very Good) |
| **gemma-3-12b** | 69.8% | 322 / 462 | 139 | 140 | 18.9 | +3.1 (Good) |
| mistral-small-24b-q4 | 67.6% | 328 / 462 | 180 | 134 | 15.1 | +2.4 (Good) |
| mistral-small-24b-q8 | 67.2% | 330 / 462 | 190 | 132 | 22.6 | +3.1 (Good) |
| gemma-3-4b | 63.8% | 277 / 462 | 130 | 185 | 7.9 ⚡ | +6.2 (Very Good) |
| lfm2-vl-1.6b | 50.5% | 171 / 448 | 58 | 277 | 4.6 ⚡⚡ | +11.9 (Excellent) |
| kimi-vl-3b | 2.0% ❌ | 5 / 416 | 67 | 411 | 40.0 🐌 | -1.3 (Poor) |
---
## Understanding the Metrics
### What the Numbers Mean:
- **Accuracy (F1 Score)**: Overall effectiveness balancing correct detections and false alarms
- 70%+ = Excellent for production use
- 60-70% = Good for assisted workflows
- Below 60% = Not recommended
- **Correct Detections**: Out of all jerseys that should have been found, how many were actually detected
- Example: "328 / 436" means the model found 328 jerseys out of 436 that were actually in the images
- **False Alarms**: Jersey numbers detected that weren't actually in the image
- Lower is better - these are incorrect detections
- Can be filtered using confidence scores
- **Missed Jerseys**: Jersey numbers that were in the image but not detected
- Lower is better - each one is a missed detection
- **Speed**: Average seconds to process one image
- ⚡⚡ = Very fast (< 8s)
- ⚡ = Fast (8-15s)
- Standard = 15-25s
- 🐌 = Slow (> 30s)
- **Confidence Calibration**: The difference between average confidence on correct vs incorrect detections
- Positive number (e.g., +6.0) = Good calibration - correct detections have higher confidence
- Negative number = Poor calibration - can't trust confidence scores
- Higher positive values = Better for filtering with confidence thresholds
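As a worked example using the complete results table above: qwen2.5-vl-7b produced 328 correct detections and 136 false alarms against 436 expected jerseys, so precision = 328 / (328 + 136) ≈ 70.7%, recall = 328 / 436 ≈ 75.2%, and F1 = 2 × (0.707 × 0.752) / (0.707 + 0.752) ≈ 72.9%, which is the headline accuracy figure.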
---
## Detailed Analysis
### 1. Best Model: qwen2.5-vl-7b
**Why It's the Best:**
- ✅ Highest overall accuracy (72.9%)
- ✅ Best recall - finds 75% of all jerseys
- ✅ Reasonable speed (13.4 seconds per image)
- ✅ Very low hallucination rate (only 1%)
- ✅ Confidence scores are reliable for filtering
**Strengths:**
- Finds the most jerseys (highest recall at 75.2%)
- Rarely makes up fake jersey numbers (hallucination rate: 1%)
- Almost always returns results (empty response rate: 2.6%)
**Weaknesses:**
- Generates 136 false positives (30% of detections are incorrect)
- Confidence calibration is minimal (+0.5), making threshold filtering less effective
- All confidence scores are 90-95, showing limited variation
**Best For:**
- Applications where finding all jerseys is critical
- Batch processing where moderate false positives are acceptable
- When combined with manual review of results
### 2. Runner-Up: gemma-3-27b
**Why It's Excellent:**
- ✅ Nearly identical accuracy to the winner (72.1% vs 72.9%)
- ✅ Finds the most total jerseys (343 correct detections)
- ✅ Excellent confidence calibration (+6.0 difference)
- ✅ No hallucinations
- ⚠️ Slower processing (20.9s per image)
**Strengths:**
- Best for confidence-based filtering (6-point difference between correct/incorrect)
- Highest absolute number of correct detections (343)
- More varied confidence scores (54% in 90-100 range, 42% in 70-89 range)
**Weaknesses:**
- 56% slower than qwen2.5-vl-7b
- Similar false positive rate
**Best For:**
- Applications requiring confidence-based filtering
- When processing time is not critical
- Maximizing total correct detections
### 3. Alternative: gemma-3-4b (Speed Champion)
**Why Consider It:**
- ⚡ Fast processing (7.9 seconds per image)
- ✅ Very good confidence calibration (+6.2)
- ✅ Zero hallucinations
- ⚠️ Lower accuracy (63.8%)
**Trade-offs:**
- 41% faster than qwen2.5-vl-7b
- But 12% lower accuracy
- Misses 40% of jerseys (185 false negatives)
**Best For:**
- Real-time or high-volume processing
- Applications where speed is more important than completeness
- Initial rough filtering before manual review
---
## Should You Use Confidence Scores for Filtering?
### Answer: **YES** - Confidence scores are useful for most models
### Evidence from Testing:
**7 out of 8 models show good confidence calibration:**
| Model | Avg Confidence (Correct) | Avg Confidence (Incorrect) | Difference | Reliability |
|-------|--------------------------|---------------------------|------------|-------------|
| lfm2-vl-1.6b | 91.8 | 80.0 | **+11.9** | ⭐⭐⭐ Excellent |
| gemma-3-4b | 85.2 | 79.0 | **+6.2** | ⭐⭐ Very Good |
| gemma-3-27b | 88.2 | 82.2 | **+6.0** | ⭐⭐ Very Good |
| gemma-3-12b | 91.8 | 88.7 | **+3.1** | ⭐ Good |
| mistral-small-24b-q8 | 92.3 | 89.1 | **+3.1** | ⭐ Good |
| mistral-small-24b-q4 | 93.0 | 90.7 | **+2.4** | ⭐ Good |
| qwen2.5-vl-7b | 94.6 | 94.1 | +0.5 | Limited utility |
| kimi-vl-3b | 88.4 | 89.7 | **-1.3** | ❌ Not reliable |
### What This Means:
**For most models**, setting a confidence threshold can significantly reduce false positives:
- A threshold of 85 on gemma-3-27b would keep most correct detections (88.2 avg) while filtering many incorrect ones (82.2 avg)
- A threshold of 85 on gemma-3-4b would be even more effective
**Exception: qwen2.5-vl-7b** has minimal difference (94.6 vs 94.1), making threshold filtering less useful despite being the most accurate model.
### Recommended Filtering Strategy:
1. **Use gemma-3-27b with confidence threshold of 85+** for best balance of accuracy and filtering
2. **Use gemma-3-4b with confidence threshold of 85+** for faster processing with good filtering
3. **Use qwen2.5-vl-7b without filtering** when you need maximum recall and will manually review results
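As a concrete illustration of the strategy above, here is a minimal post-filter sketch. The detection fields (`jersey_number`, `confidence`) follow the response format defined in `jersey_prompt_with_confidence.txt`; the function itself and its default cutoff of 85 are illustrative and not part of the test scripts.
```python
def filter_by_confidence(jerseys: list[dict], threshold: int = 85) -> list[dict]:
    """Keep only detections at or above the confidence threshold.

    Detections without a "confidence" key (models that do not report one)
    are kept, since there is nothing to filter on.
    """
    kept = []
    for jersey in jerseys:
        confidence = jersey.get("confidence")
        if confidence is None or confidence >= threshold:
            kept.append(jersey)
    return kept

# Example with gemma-3-27b style output: only the confidence-92 detection survives.
detections = [
    {"jersey_number": "23", "jersey_color": "red", "number_color": "white", "confidence": 92},
    {"jersey_number": "7", "jersey_color": "red", "number_color": "white", "confidence": 78},
]
print(filter_by_confidence(detections))
```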
---
## Model-Specific Recommendations
### For Different Use Cases:
#### 🎯 **Highest Accuracy Required**
- **Model:** qwen2.5-vl-7b
- **Expected Results:** Find 75% of jerseys, 30% false positive rate
- **Processing:** 13.4 seconds per image
- **Setup:** Use raw results, manually review all detections
#### 🎯 **Best Balance of Speed and Accuracy**
- **Model:** gemma-3-12b
- **Expected Results:** Find 70% of jerseys, reasonable false positive rate
- **Processing:** 18.9 seconds per image
- **Setup:** Apply confidence threshold of 90+ to reduce false positives
#### 🎯 **Maximum Quality with Confidence Filtering**
- **Model:** gemma-3-27b
- **Expected Results:** Find 74% of jerseys, filter false positives effectively
- **Processing:** 20.9 seconds per image
- **Setup:** Apply confidence threshold of 85+ to reduce false positives by ~50%
#### ⚡ **Speed is Critical**
- **Model:** gemma-3-4b
- **Expected Results:** Find 60% of jerseys quickly
- **Processing:** 7.9 seconds per image
- **Setup:** Apply confidence threshold of 85+ for quality filtering
#### ❌ **Do Not Use**
- **kimi-vl-3b**: Only 2% accuracy, extremely slow, poor confidence calibration
---
## Implementation Recommendations
### 1. Production Deployment Strategy
**Recommended:** Two-tier approach
- **Tier 1 (Automatic):** gemma-3-27b with confidence threshold 85+
- Automatically tag high-confidence detections
- Expected: ~200 correct detections per 194 images with minimal false positives
- **Tier 2 (Review Queue):** qwen2.5-vl-7b on remaining images
- Human review of all detections below confidence threshold
- Catches jerseys missed by Tier 1
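A minimal sketch of the two-tier flow is shown below. The `detect_gemma` and `detect_qwen` callables are placeholders for whatever detection function you wire up (for example, `DetectJerseys.detect` pointed at each model); only the routing logic described above is shown.
```python
def two_tier_detection(image, detect_gemma, detect_qwen, threshold: int = 85):
    """Tier 1: auto-accept high-confidence detections from the primary model.
    Tier 2: everything else goes to a human review queue, topped up by the
    higher-recall secondary model when the primary model found nothing usable."""
    auto_accepted, review_queue = [], []
    for jersey in detect_gemma(image).get("jerseys", []):
        if jersey.get("confidence", 0) >= threshold:
            auto_accepted.append(jersey)
        else:
            review_queue.append(jersey)
    if not auto_accepted:
        # Nothing confidently detected: re-run with the higher-recall model for review.
        review_queue.extend(detect_qwen(image).get("jerseys", []))
    return auto_accepted, review_queue
```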
### 2. Confidence Threshold Guidelines
Based on testing data:
| Model | Recommended Threshold | Expected Precision | Expected Recall |
|-------|----------------------|-------------------|-----------------|
| gemma-3-27b | 85+ | ~85-90% | ~60-65% |
| gemma-3-4b | 85+ | ~80-85% | ~50-55% |
| gemma-3-12b | 90+ | ~80-85% | ~60-65% |
| qwen2.5-vl-7b | Don't filter | 70.7% | 75.2% |
### 3. Performance Optimization
**Processing 1000 images:**
- qwen2.5-vl-7b: ~3.7 hours
- gemma-3-27b: ~5.8 hours
- gemma-3-4b: ~2.2 hours
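(These figures are just the per-image averages scaled up, assuming sequential processing: 13.4 s × 1000 ≈ 3.7 hours, 20.9 s × 1000 ≈ 5.8 hours, and 7.9 s × 1000 ≈ 2.2 hours.)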
**Recommendation:** Use gemma-3-4b for initial pass, qwen2.5-vl-7b for second pass on low-confidence results.
---
## Conclusions
### Main Findings:
1. **qwen2.5-vl-7b is the most accurate model** but has limited confidence score utility
2. **gemma-3-27b offers the best combination** of accuracy and confidence-based filtering
3. **Confidence scores are highly valuable** for reducing false positives in most models
4. **Speed vs accuracy trade-offs are significant** - the fastest recommended model (gemma-3-4b) is about 9 percentage points less accurate than the best
5. **One model (kimi-vl-3b) is completely unsuitable** for this task
### Strategic Recommendations:
**For most users:** Deploy gemma-3-27b with confidence threshold of 85+
- Balances accuracy, speed, and filtering capability
- Reduces manual review burden significantly
- Good confidence calibration enables automated decision-making
**For maximum accuracy:** Deploy qwen2.5-vl-7b without filtering
- Best for finding all possible jerseys
- Requires manual review of results
- Accept higher false positive rate
**For high-volume processing:** Deploy gemma-3-4b with confidence threshold of 85+
- Fast enough for real-time applications
- Good accuracy for the speed
- Effective filtering capability
### Final Verdict:
**Winner: qwen2.5-vl-7b** for pure accuracy
**Best Overall: gemma-3-27b** for practical deployment with confidence filtering
**Best Value: gemma-3-4b** for speed-sensitive applications
---
## Technical Notes
- **Test Dataset:** 194 images with ground truth jersey numbers encoded in filenames
- **Total Expected Jerseys:** 416-462 depending on which images each model processed successfully
- **Evaluation Metrics:** Precision, Recall, F1 Score, Confidence Calibration
- **Hardware:** Testing performed on comparable hardware configurations
- **Prompt:** All models used identical jersey detection prompt with confidence scores
---
*Report generated from comprehensive testing of 8 vision-language models for jersey number detection in sports photography.*

237
docs/LLAMA_SWAP_SETUP.md Normal file

@@ -0,0 +1,237 @@
# llama-swap Setup Guide for Jersey Detection Testing
This guide explains how to use [llama-swap](https://github.com/mostlygeek/llama-swap) to automatically switch between different vision language models when testing jersey detection.
## What is llama-swap?
llama-swap is a model-swapping proxy that sits between your application and llama.cpp servers. It automatically loads and unloads models based on the `model` parameter in API requests, allowing you to test multiple models without manually restarting servers.
## Installation
### Docker (Recommended)
```bash
# Pull the CUDA image (or cpu, vulkan, intel depending on your hardware)
docker pull ghcr.io/mostlygeek/llama-swap:cuda
```
### Homebrew (macOS/Linux)
```bash
brew tap mostlygeek/llama-swap
brew install llama-swap
```
### Pre-built Binaries
Download from the [releases page](https://github.com/mostlygeek/llama-swap/releases).
## Configuration
A configuration file `llama-swap-config.yaml` is provided with 8 pre-configured vision models:
### Small Models (1-4B parameters)
- `lfm2-vl-1.6b` - LiquidAI LFM2-VL 1.6B (F16)
- `gemma-3-4b` - Gemma 3 4B Instruct (F16)
- `kimi-vl-3b` - Kimi VL A3B Thinking (F16)
### Medium Models (7-12B parameters)
- `qwen2.5-vl-7b` - Qwen2.5-VL 7B Instruct (F16)
- `gemma-3-12b` - Gemma 3 12B Instruct (F16)
### Large Models (24-27B parameters)
- `mistral-small-24b-q8` - Mistral Small 3.2 24B (Q8_K_XL)
- `mistral-small-24b-q4` - Mistral Small 3.2 24B (Q4_K_XL)
- `gemma-3-27b` - Gemma 3 27B Instruct (Q8_0)
## Starting llama-swap
### Using Docker
```bash
docker run -it --rm --runtime nvidia -p 8080:8080 \
-v $(pwd)/llama-swap-config.yaml:/app/config.yaml \
-v /path/to/hf/cache:/root/.cache/huggingface \
ghcr.io/mostlygeek/llama-swap:cuda
```
### Using Binary
```bash
llama-swap --config llama-swap-config.yaml --listen localhost:8080
```
## Testing with Jersey Detection Script
Once llama-swap is running, you can test different models by specifying the `--model-tag` parameter:
### Test a Single Model
```bash
# Test Qwen2.5-VL 7B with resizing
python test_jersey_detection.py ./images jersey_prompt.txt \
--model-tag "qwen2.5-vl-7b" \
--resize 1024
```
### Test Multiple Models Sequentially
```bash
# Test small models
python test_jersey_detection.py ./images jersey_prompt.txt --model-tag "lfm2-vl-1.6b" --resize 1024
python test_jersey_detection.py ./images jersey_prompt.txt --model-tag "gemma-3-4b" --resize 1024
python test_jersey_detection.py ./images jersey_prompt.txt --model-tag "kimi-vl-3b" --resize 1024
# Test medium models
python test_jersey_detection.py ./images jersey_prompt.txt --model-tag "qwen2.5-vl-7b" --resize 1024
python test_jersey_detection.py ./images jersey_prompt.txt --model-tag "gemma-3-12b" --resize 1024
# Test large models
python test_jersey_detection.py ./images jersey_prompt.txt --model-tag "mistral-small-24b-q4" --resize 1024
python test_jersey_detection.py ./images jersey_prompt.txt --model-tag "gemma-3-27b" --resize 1024
```
### Automated Testing Scripts
Two bash scripts are provided for automated testing:
#### 1. Full Test Suite (`test_all_models.sh`)
Tests **all models** defined in `llama-swap-config.yaml`:
```bash
# Basic usage (uses defaults)
./test_all_models.sh ./test_images
# Customize configuration with environment variables
RESIZE=2048 ./test_all_models.sh ./test_images
OUTPUT_FILE=custom_results.jsonl ./test_all_models.sh ./test_images
PROMPT_FILE=custom_prompt.txt ./test_all_models.sh ./test_images
# Disable resize
RESIZE= ./test_all_models.sh ./test_images
```
**Features:**
- Automatically extracts all model tags from YAML config
- Color-coded output with progress tracking
- Confirms before starting tests
- Shows summary with success/failure counts
- Asks to continue if a model fails
**Default Configuration:**
- Images: `./test_images`
- Prompt: `jersey_prompt_with_confidence.txt`
- Resize: `1024px`
- Output: `jersey_detection_results.jsonl`
#### 2. Quick Test (`test_quick.sh`)
Tests a **small subset** of models for rapid iteration:
```bash
# Test default selection (small, medium, large)
./test_quick.sh ./test_images
# Test custom models
MODELS="lfm2-vl-1.6b qwen2.5-vl-7b" ./test_quick.sh ./test_images
# Customize settings
RESIZE=512 MODELS="gemma-3-4b" ./test_quick.sh ./test_images
```
**Default Models:**
- `lfm2-vl-1.6b` (Small - 1.6B)
- `qwen2.5-vl-7b` (Medium - 7B)
- `mistral-small-24b-q4` (Large - 24B Q4)
**Use Cases:**
- Quick validation after prompt changes
- Testing configuration adjustments
- Rapid prototyping before full test run
## Analyzing Results
After testing multiple models, use the analysis script to compare performance:
```bash
python analyze_jersey_results.py
```
This will show:
- Comparison table of all models tested
- Performance charts with hallucination rates
- Best performers by speed and accuracy
- Confidence distribution (if applicable)
## Model Swapping Behavior
llama-swap will:
1. **Automatically load** the requested model when you specify `--model-tag`
2. **Automatically unload** the previous model (if different from current request)
3. **Keep running** if you test the same model multiple times
4. **Monitor** model loading/unloading in the web UI at `http://localhost:8080/ui`
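Under the hood this works because the test client sends standard OpenAI-compatible chat completion requests and places the `--model-tag` value in the request's `model` field, which llama-swap uses to decide which backend to load. The sketch below is a simplified, text-only illustration (the real client also attaches the image); the endpoint path and payload shape are the usual llama.cpp `/v1/chat/completions` ones, not code copied from `scan_utils/llama_cpp_client.py`.
```python
import requests

def chat_with_model(server_url: str, model_tag: str, prompt: str) -> str:
    """Send a chat completion request; llama-swap loads `model_tag` on demand."""
    response = requests.post(
        f"{server_url}/v1/chat/completions",
        json={
            "model": model_tag,  # llama-swap routes/loads based on this field
            "messages": [{"role": "user", "content": prompt}],
            "temperature": 0.1,
        },
        timeout=300,
    )
    response.raise_for_status()
    return response.json()["choices"][0]["message"]["content"]

# Switching models is just a matter of changing the tag:
# chat_with_model("http://localhost:8080", "qwen2.5-vl-7b", "Describe the jerseys you see.")
```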
## Optional: Model Auto-Unloading
To automatically unload models after 5 minutes of inactivity, uncomment this line in `llama-swap-config.yaml`:
```yaml
ttl: 300
```
## Optional: Preload Model on Startup
To preload a specific model when llama-swap starts, uncomment and modify this section:
```yaml
hooks:
onStartup:
- loadModel: qwen2.5-vl-7b
```
## Customizing Models
To add or modify models, edit `llama-swap-config.yaml`:
```yaml
models:
my-custom-model:
name: "My Custom Model Description"
cmd: llama-server --no-mmap -ngl 999 -fa on --host 0.0.0.0 --port ${PORT} -hf user/model-name:quantization
```
Then test with:
```bash
python test_jersey_detection.py ./images jersey_prompt.txt --model-tag "my-custom-model"
```
## Troubleshooting
### Model not loading
- Check llama-swap logs at `http://localhost:8080/log` or via `curl http://localhost:8080/log/stream`
- Verify the model name in the config matches the `--model-tag` parameter
- Ensure sufficient GPU memory for the model
### Connection refused
- Verify llama-swap is running: `curl http://localhost:8080/health`
- Check the server URL matches: default is `http://192.168.1.126:8080` (from scan.ini)
### Slow model switching
- First load downloads models from HuggingFace (can be slow)
- Subsequent loads are faster (cached locally)
- Use quantized models (Q4, Q8) for faster loading and lower memory usage
## Web UI
llama-swap includes a web interface for monitoring:
- **Dashboard**: `http://localhost:8080/ui` - View loaded models and logs
- **Activity**: See recent API requests
- **Logs**: Real-time log monitoring
## References
- [llama-swap GitHub](https://github.com/mostlygeek/llama-swap)
- [llama-swap Documentation](https://github.com/mostlygeek/llama-swap/tree/main/docs)
- [llama.cpp Documentation](https://github.com/ggerganov/llama.cpp)

6
jersey_detection_results.jsonl Normal file

@@ -0,0 +1,6 @@
{"timestamp": "2025-10-19T19:30:44.272849", "model_name": "LFM2-VL", "prompt_file": "jersey_prompt_with_confidence.txt", "prompt_length": 2134, "total_images": 198, "images_with_jerseys": 88, "images_without_jerseys": 110, "images_with_errors": 0, "total_raw_detections": 470, "total_valid_jerseys": 235, "total_hallucinated": 235, "avg_processing_time": 4.607636096501591, "total_processing_time": 912.3119471073151, "confidence_stats": {"avg": 84.14893617021276, "min": 0, "max": 100, "count": 235, "distribution": {"90-100": 138, "70-89": 70, "50-69": 8, "30-49": 8, "0-29": 11}}, "empty_response_capable": true}
{"timestamp": "2025-10-19T22:10:05.135029", "model_name": "ggml-org_Kimi-VL-A3B-Thinking-2506-GGUF_Kimi-VL-A3B-Thinking-2506-bf16", "prompt_file": "jersey_prompt_with_confidence.txt", "prompt_length": 2134, "total_images": 198, "images_with_jerseys": 28, "images_without_jerseys": 163, "images_with_errors": 7, "total_raw_detections": 49, "total_valid_jerseys": 49, "total_hallucinated": 0, "avg_processing_time": 29.11009831259949, "total_processing_time": 5763.799465894699, "confidence_stats": {"avg": 88.85714285714286, "min": 60, "max": 95, "count": 49, "distribution": {"90-100": 37, "70-89": 9, "50-69": 3, "30-49": 0, "0-29": 0}}, "empty_response_capable": true}
{"timestamp": "2025-10-20T01:20:31.076468", "model_name": "unsloth_Mistral-Small-3.2-24B-Instruct-2506-GGUF_Mistral-Small-3.2-24B-Instruct-2506-BF16", "prompt_file": "jersey_prompt_with_confidence.txt", "prompt_length": 2134, "total_images": 198, "images_with_jerseys": 197, "images_without_jerseys": 1, "images_with_errors": 0, "total_raw_detections": 494, "total_valid_jerseys": 494, "total_hallucinated": 0, "avg_processing_time": 37.221905313356956, "total_processing_time": 7369.937252044678, "confidence_stats": {"avg": 90.81983805668017, "min": 70, "max": 95, "count": 494, "distribution": {"90-100": 362, "70-89": 132, "50-69": 0, "30-49": 0, "0-29": 0}}, "empty_response_capable": true}
{"timestamp": "2025-10-20T12:04:37.833650", "model_name": "unsloth_Mistral-Small-3.2-24B-Instruct-2506-GGUF_Mistral-Small-3.2-24B-Instruct-2506-UD-Q8_K_XL", "prompt_file": "jersey_prompt_with_confidence.txt", "prompt_length": 2134, "total_images": 198, "images_with_jerseys": 197, "images_without_jerseys": 1, "images_with_errors": 0, "total_raw_detections": 496, "total_valid_jerseys": 496, "total_hallucinated": 0, "avg_processing_time": 20.684308366342023, "total_processing_time": 4095.493056535721, "confidence_stats": {"avg": 90.76612903225806, "min": 70, "max": 95, "count": 496, "distribution": {"90-100": 363, "70-89": 133, "50-69": 0, "30-49": 0, "0-29": 0}}, "empty_response_capable": true}
{"timestamp": "2025-10-20T13:01:42.747694", "model_name": "unsloth_Mistral-Small-3.2-24B-Instruct-2506-GGUF_Mistral-Small-3.2-24B-Instruct-2506-UD-Q4_K_XL", "prompt_file": "jersey_prompt_with_confidence.txt", "prompt_length": 2134, "total_images": 198, "images_with_jerseys": 197, "images_without_jerseys": 1, "images_with_errors": 0, "total_raw_detections": 494, "total_valid_jerseys": 494, "total_hallucinated": 0, "avg_processing_time": 14.196594772916852, "total_processing_time": 2810.9257650375366, "confidence_stats": {"avg": 92.09514170040485, "min": 80, "max": 95, "count": 494, "distribution": {"90-100": 415, "70-89": 79, "50-69": 0, "30-49": 0, "0-29": 0}}, "empty_response_capable": true}
{"timestamp": "2025-10-20T15:01:25.669340", "model_name": "unsloth_gemma-3-27b-it-GGUF_gemma-3-27b-it-Q8_0", "prompt_file": "jersey_prompt_with_confidence.txt", "prompt_length": 2134, "total_images": 198, "images_with_jerseys": 185, "images_without_jerseys": 13, "images_with_errors": 0, "total_raw_detections": 428, "total_valid_jerseys": 428, "total_hallucinated": 0, "avg_processing_time": 18.127051142731098, "total_processing_time": 3589.1561262607574, "confidence_stats": {"avg": 87.14953271028037, "min": 55, "max": 100, "count": 428, "distribution": {"90-100": 250, "70-89": 166, "50-69": 12, "30-49": 0, "0-29": 0}}, "empty_response_capable": true}

43
jersey_prompt.txt Normal file

@@ -0,0 +1,43 @@
You are an expert at detecting sports jerseys in images. Carefully examine the provided image and identify all visible sports jerseys.
CRITICAL INSTRUCTIONS:
1. ONLY detect jerseys that are CLEARLY VISIBLE in the image
2. ONLY include jersey numbers that you can ACTUALLY READ in the image
3. If you CANNOT see any jerseys, you MUST return {"jerseys": []}
4. DO NOT make up, imagine, or guess jersey numbers that aren't visible
5. DO NOT include jerseys if you cannot clearly see the number
RESPONSE FORMAT:
Respond ONLY with a valid JSON object. No explanations, no markdown, no extra text.
Use DOUBLE QUOTES (") for all JSON keys and string values.
The JSON must have a single key "jerseys" with an array of dictionaries.
Each dictionary must have exactly these three keys:
- "jersey_number": The number on the jersey (as a string, only if clearly visible)
- "jersey_color": The primary color of the jersey
- "number_color": The color of the number on the jersey
Example response for an image WITH visible jerseys:
{
"jerseys": [
{
"jersey_number": "101",
"jersey_color": "red",
"number_color": "white"
},
{
"jersey_number": "142",
"jersey_color": "blue",
"number_color": "yellow"
}
]
}
Example response for an image WITHOUT jerseys or with unclear numbers:
{"jerseys": []}
REMEMBER: Only include jerseys with numbers you can ACTUALLY SEE in the image. When in doubt, return empty array.
Now analyze the image and return the JSON object.

53
jersey_prompt_with_confidence.txt Normal file

@@ -0,0 +1,53 @@
You are an expert at detecting sports jerseys in images. Carefully examine the provided image and identify all visible sports jerseys.
CRITICAL INSTRUCTIONS:
1. ONLY detect jerseys that are CLEARLY VISIBLE in the image
2. ONLY include jersey numbers that you can ACTUALLY READ in the image
3. If you CANNOT see any jerseys, you MUST return {"jerseys": []}
4. DO NOT make up, imagine, or guess jersey numbers that aren't visible
5. DO NOT include jerseys if you cannot clearly see the number
RESPONSE FORMAT:
Respond ONLY with a valid JSON object. No explanations, no markdown, no extra text.
Use DOUBLE QUOTES (") for all JSON keys and string values.
The JSON must have a single key "jerseys" with an array of dictionaries.
Each dictionary must have exactly these four keys:
- "jersey_number": The number on the jersey (as a string, only if clearly visible)
- "jersey_color": The primary color of the jersey
- "number_color": The color of the number on the jersey
- "confidence": A number from 0 to 100 representing your confidence in this detection (0 = no confidence, 100 = absolutely certain)
CONFIDENCE SCORING GUIDELINES:
- 90-100: Jersey number is extremely clear and unambiguous
- 70-89: Jersey number is clear but might have minor occlusion or angle issues
- 50-69: Jersey number is partially visible or somewhat unclear
- 30-49: Jersey number is difficult to read but you can make it out
- 0-29: Very uncertain, number is barely visible
Example response for an image WITH visible jerseys:
{
"jerseys": [
{
"jersey_number": "101",
"jersey_color": "red",
"number_color": "white",
"confidence": 95
},
{
"jersey_number": "142",
"jersey_color": "blue",
"number_color": "yellow",
"confidence": 78
}
]
}
Example response for an image WITHOUT jerseys or with unclear numbers:
{"jerseys": []}
REMEMBER: Only include jerseys with numbers you can ACTUALLY SEE in the image. When in doubt, return empty array. Always provide a confidence score that honestly reflects how certain you are about each detection.
Now analyze the image and return the JSON object.

59
llama-swap-config.yaml Normal file

@@ -0,0 +1,59 @@
# llama-swap configuration for jersey detection testing
# ==================================================
# This configuration allows automatic model switching for testing
# different vision language models with the jersey detection test script.
#
# Usage:
# llama-swap --config llama-swap-config.yaml --listen localhost:8080
#
# Then use the test script with --model-tag:
# python test_jersey_detection.py ./images jersey_prompt.txt --model-tag "lfm2-vl-1.6b"
#
# llama-swap will automatically load the requested model and swap models
# as needed when you run tests with different --model-tag values.
models:
# Small vision models (1-4B parameters)
lfm2-vl-1.6b:
name: "LiquidAI LFM2-VL 1.6B (F16)"
cmd: llama-server --no-mmap -ngl 999 -fa on --host 0.0.0.0 --port ${PORT} -hf LiquidAI/LFM2-VL-1.6B-GGUF:F16
gemma-3-4b:
name: "Gemma 3 4B Instruct (F16)"
cmd: llama-server --no-mmap -ngl 999 -fa on --host 0.0.0.0 --port ${PORT} -hf unsloth/gemma-3-4b-it-GGUF:F16
kimi-vl-3b:
name: "Kimi VL A3B Thinking (F16)"
cmd: llama-server --no-mmap -ngl 999 -fa on --host 0.0.0.0 --port ${PORT} -hf ggml-org/Kimi-VL-A3B-Thinking-2506-GGUF:F16
# Medium vision models (7-12B parameters)
qwen2.5-vl-7b:
name: "Qwen2.5-VL 7B Instruct (F16)"
cmd: llama-server --no-mmap -ngl 999 -fa on --host 0.0.0.0 --port ${PORT} -hf unsloth/Qwen2.5-VL-7B-Instruct-GGUF:F16
gemma-3-12b:
name: "Gemma 3 12B Instruct (F16)"
cmd: llama-server --no-mmap -ngl 999 -fa on --host 0.0.0.0 --port ${PORT} -hf unsloth/gemma-3-12b-it-GGUF:F16
# Large models (24-27B parameters)
mistral-small-24b-q8:
name: "Mistral Small 3.2 24B Instruct (Q8_K_XL)"
cmd: llama-server --no-mmap -ngl 999 -fa on --host 0.0.0.0 --port ${PORT} -hf unsloth/Mistral-Small-3.2-24B-Instruct-2506-GGUF:Q8_K_XL
mistral-small-24b-q4:
name: "Mistral Small 3.2 24B Instruct (Q4_K_XL)"
cmd: llama-server --no-mmap -ngl 999 -fa on --host 0.0.0.0 --port ${PORT} -hf unsloth/Mistral-Small-3.2-24B-Instruct-2506-GGUF:Q4_K_XL
gemma-3-27b:
name: "Gemma 3 27B Instruct (Q8_0)"
cmd: llama-server --no-mmap -ngl 999 -fa on --host 0.0.0.0 --port ${PORT} -hf unsloth/gemma-3-27b-it-GGUF:Q8_0
# Optional: Automatically unload models after 5 minutes of inactivity
# Uncomment to enable
# ttl: 300
# Optional: Preload a specific model on startup
# Uncomment to enable
# hooks:
# onStartup:
# - loadModel: qwen2.5-vl-7b

9
requirements.txt Normal file

@@ -0,0 +1,9 @@
# Jersey Detection Test Dependencies
# Install with: pip install -r requirements.txt
# HTTP client for llama.cpp server communication
requests>=2.28.0
# Image processing
opencv-python>=4.8.0
numpy>=1.24.0

1
scan_utils/__init__.py Normal file

@@ -0,0 +1 @@
# Jersey detection scan utilities

149
scan_utils/jersey_detection.py Normal file

@@ -0,0 +1,149 @@
import json
import cv2
import numpy as np
from typing import Dict, Any, Optional
import logging
# Read the default jersey detection prompt
try:
with open('jersey_prompt.txt', 'r') as f:
DEFAULT_JERSEY_PROMPT = f.read()
except FileNotFoundError:
# Fallback prompt if file is not found
DEFAULT_JERSEY_PROMPT = """You are an expert at detecting sports jerseys in images. Carefully examine the provided image and identify all visible sports jerseys.
CRITICAL INSTRUCTIONS:
1. ONLY detect jerseys that are CLEARLY VISIBLE in the image
2. ONLY include jersey numbers that you can ACTUALLY READ in the image
3. If you CANNOT see any jerseys, you MUST return {"jerseys": []}
4. DO NOT make up, imagine, or guess jersey numbers that aren't visible
5. DO NOT include jerseys if you cannot clearly see the number
RESPONSE FORMAT:
Respond ONLY with a valid JSON object. No explanations, no markdown, no extra text.
Use DOUBLE QUOTES (") for all JSON keys and string values.
The JSON must have a single key "jerseys" with an array of dictionaries.
Each dictionary must have exactly these three keys:
- "jersey_number": The number on the jersey (as a string, only if clearly visible)
- "jersey_color": The primary color of the jersey
- "number_color": The color of the number on the jersey
Example response for an image WITH visible jerseys:
{
"jerseys": [
{
"jersey_number": "101",
"jersey_color": "red",
"number_color": "white"
}
]
}
Example response for an image WITHOUT jerseys or with unclear numbers:
{"jerseys": []}
REMEMBER: Only include jerseys with numbers you can ACTUALLY SEE in the image. When in doubt, return empty array.
Now analyze the image and return the JSON object."""
class DetectJerseys:
"""A class for detecting sports jerseys using a vision language model."""
def __init__(self, llama_cpp_base_url: str = "http://192.168.1.34:8080", logger: Optional[logging.Logger] = None, prompt: Optional[str] = None):
"""
Initialize the jersey detection class.
Args:
llama_cpp_base_url: Base URL for the llama.cpp server
logger: Logger instance for logging messages
prompt: Custom prompt to use for jersey detection (optional)
"""
self.logger = logger or logging.getLogger(__name__)
self.prompt = prompt or DEFAULT_JERSEY_PROMPT
# Import here to avoid circular dependencies
try:
from scan_utils.llama_cpp_client import LlamaCppClient
self.client = LlamaCppClient(base_url=llama_cpp_base_url)
self.logger.info(f"Jersey detection initialized with llama.cpp server at {llama_cpp_base_url}")
except ImportError as e:
self.logger.error(f"Failed to import LlamaCppClient: {e}")
raise
def detect(self, image: np.ndarray, temperature: float = 0.1) -> Dict[str, Any]:
"""
Detect jerseys in an image using the vision language model.
Args:
image: OpenCV image (numpy array) to analyze
temperature: Temperature value for the model (default: 0.1)
Returns:
Dictionary containing detected jerseys or empty dict if invalid
"""
try:
# Create multimodal message with image and prompt
message = self.client.create_multimodal_message(
role="user",
content=self.prompt,
images=[image]
)
# Send chat completion request
response = self.client.chat_completion(
messages=[message],
temperature=temperature,
max_tokens=1000
)
# Extract the response text
if 'choices' in response and len(response['choices']) > 0:
response_text = response['choices'][0]['message']['content']
# Log the raw response for debugging
self.logger.debug(f"Raw VLM response: {response_text}")
# Parse JSON response
try:
result = json.loads(response_text)
# Process jerseys to ensure they have all required fields
jerseys = result.get('jerseys', [])
# Hallucination detection: filter out example numbers from the prompt
# Using numbers > 100 as examples to avoid filtering valid jersey numbers
HALLUCINATION_NUMBERS = {'101', '102', '103', '142', '199'}
processed_jerseys = []
for jersey in jerseys:
jersey_number = jersey.get('jersey_number', '')
# Check for hallucination (model returning example numbers)
if jersey_number in HALLUCINATION_NUMBERS:
self.logger.warning(f"Possible hallucination detected - jersey number {jersey_number} matches example pattern. Filtering out.")
continue
# Ensure all required fields are present
processed_jersey = {
'jersey_number': jersey_number,
'jersey_color': jersey.get('jersey_color', ''),
'number_color': jersey.get('number_color', 'unknown') # Default to 'unknown' if missing
}
processed_jerseys.append(processed_jersey)
return {"jerseys": processed_jerseys}
except json.JSONDecodeError as e:
self.logger.error(f"Failed to parse JSON response: {e}")
self.logger.debug(f"Response text was: {response_text}")
return {"jerseys": []}
else:
self.logger.warning("Empty response from VLM")
return {"jerseys": []}
except Exception as e:
self.logger.error(f"Error during jersey detection: {e}")
return {"jerseys": []}

237
scan_utils/llama_cpp_client.py Normal file
View File

@ -0,0 +1,237 @@
import base64
import json
import cv2
import numpy as np
import requests
from typing import List, Dict, Any, Optional, Union
class LlamaCppClient:
"""A Python client for interacting with a llama.cpp server."""
def __init__(self, base_url: str = "http://192.168.1.34:8080"):
"""
Initialize the client with the base URL of the llama.cpp server.
Args:
base_url: The base URL of the llama.cpp server (default: http://192.168.1.34:8080)
"""
self.base_url = base_url.rstrip('/')
def health_check(self) -> Dict[str, Any]:
"""
Check the health status of the server.
Returns:
Health status response from the server
"""
response = requests.get(f"{self.base_url}/health")
response.raise_for_status()
return response.json()
def get_models(self) -> Dict[str, Any]:
"""
Get information about loaded models.
Returns:
Model information from the server
"""
response = requests.get(f"{self.base_url}/v1/models")
response.raise_for_status()
return response.json()
def chat_completion(
self,
messages: List[Dict[str, Any]],
temperature: float = 0.1,
min_p: float = 0.15,
repetition_penalty: float = 1.05,
min_image_tokens: int = 64,
max_image_tokens: int = 256,
do_image_splitting: bool = True,
max_tokens: int = -1,
stream: bool = False,
**kwargs
) -> Union[Dict[str, Any], requests.Response]:
"""
Generate a chat completion using the OpenAI-compatible API.
Args:
messages: List of message dictionaries with role and content
temperature: Sampling temperature (default: 0.1)
min_p: Minimum probability for sampling (default: 0.15)
repetition_penalty: Repetition penalty factor (default: 1.05)
min_image_tokens: Minimum image tokens (default: 64)
max_image_tokens: Maximum image tokens (default: 256)
do_image_splitting: Whether to split images (default: True)
max_tokens: Maximum tokens to generate (default: -1 for infinity)
stream: Whether to stream the response (default: False)
**kwargs: Additional parameters for the completion
Returns:
Completion response or streaming response
"""
payload = {
"messages": messages,
"temperature": temperature,
"min_p": min_p,
"repetition_penalty": repetition_penalty,
"min_image_tokens": min_image_tokens,
"max_image_tokens": max_image_tokens,
"do_image_splitting": do_image_splitting,
"max_tokens": max_tokens,
"cache_prompt": True,
"stream": stream,
**kwargs
}
# Debug: Show model parameter if present (for llama-swap debugging)
if 'model' in payload and payload['model']:
import os
if os.environ.get('DEBUG_LLAMA_SWAP'):
print(f"[DEBUG] Requesting model: {payload['model']}")
response = requests.post(
f"{self.base_url}/v1/chat/completions",
headers={"Content-Type": "application/json"},
json=payload,
stream=stream
)
response.raise_for_status()
if stream:
return response
return response.json()
def completion(
self,
prompt: Union[str, List[Union[str, int]]],
temperature: float = 0.1,
min_p: float = 0.15,
repetition_penalty: float = 1.05,
min_image_tokens: int = 64,
max_image_tokens: int = 256,
do_image_splitting: bool = True,
max_tokens: int = -1,
stream: bool = False,
**kwargs
) -> Union[Dict[str, Any], requests.Response]:
"""
        Generate a completion using llama.cpp's native /completion endpoint (not OpenAI-compatible).
Args:
prompt: The prompt string or list of tokens
temperature: Sampling temperature (default: 0.1)
min_p: Minimum probability for sampling (default: 0.15)
repetition_penalty: Repetition penalty factor (default: 1.05)
min_image_tokens: Minimum image tokens (default: 64)
max_image_tokens: Maximum image tokens (default: 256)
do_image_splitting: Whether to split images (default: True)
max_tokens: Maximum tokens to generate (default: -1 for infinity)
stream: Whether to stream the response (default: False)
**kwargs: Additional parameters for the completion
Returns:
Completion response or streaming response
"""
payload = {
"prompt": prompt,
"temperature": temperature,
"min_p": min_p,
"repeat_penalty": repetition_penalty,
"min_image_tokens": min_image_tokens,
"max_image_tokens": max_image_tokens,
"do_image_splitting": do_image_splitting,
"cache_prompt": True,
"n_predict": max_tokens,
"stream": stream,
**kwargs
}
response = requests.post(
f"{self.base_url}/completion",
headers={"Content-Type": "application/json"},
json=payload,
stream=stream
)
response.raise_for_status()
if stream:
return response
return response.json()
@staticmethod
def _encode_image_to_base64(image_path: str) -> str:
"""
Encode an image file to base64 string.
Args:
image_path: Path to the image file
Returns:
Base64 encoded image string
"""
with open(image_path, "rb") as image_file:
return base64.b64encode(image_file.read()).decode('utf-8')
@staticmethod
def _encode_cv2_image_to_base64(image: np.ndarray) -> str:
"""
Encode an OpenCV image to base64 string.
Args:
image: OpenCV image (numpy array)
Returns:
Base64 encoded image string
"""
_, buffer = cv2.imencode('.jpg', image)
return base64.b64encode(buffer).decode('utf-8')
def create_multimodal_message(
self,
role: str,
content: str,
images: Optional[List[Union[str, np.ndarray]]] = None
) -> Dict[str, Any]:
"""
Create a multimodal message with text and images.
Args:
role: Role of the message (system, user, assistant)
content: Text content of the message
images: List of image paths or OpenCV images (numpy arrays)
Returns:
Formatted message dictionary
"""
if not images:
return {"role": role, "content": content}
# Process images
image_data = []
for img in images:
if isinstance(img, str):
# Image path
encoded_image = self._encode_image_to_base64(img)
else:
# OpenCV image
encoded_image = self._encode_cv2_image_to_base64(img)
image_data.append(encoded_image)
# Create multimodal content
multimodal_content = [
{"type": "text", "text": content}
]
for img_data in image_data:
multimodal_content.append({
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{img_data}"
}
})
return {"role": role, "content": multimodal_content}

263
test_all_models.sh Executable file
View File

@ -0,0 +1,263 @@
#!/bin/bash
# ==============================================================================
# Test All Models Script for Jersey Detection
# ==============================================================================
# This script automatically tests all models defined in llama-swap-config.yaml
# with the jersey detection test suite.
#
# Usage:
# ./test_all_models.sh
# ./test_all_models.sh /path/to/images
# RESIZE=2048 ./test_all_models.sh
# OUTPUT_FILE=custom_results.jsonl ./test_all_models.sh
# ==============================================================================
# Note: We don't use 'set -e' here because we have explicit error handling
# in the test loop and want to give the user the option to continue on failures
# ==============================================================================
# Configuration Variables
# ==============================================================================
# Image directory containing test images
IMAGES_DIR="${1:-./test_images}"
# Prompt file to use for testing
PROMPT_FILE="${PROMPT_FILE:-jersey_prompt_with_confidence.txt}"
# Resize images to this max dimension (set to empty string to disable)
RESIZE="${RESIZE:-1024}"
# Output file for results
OUTPUT_FILE="${OUTPUT_FILE:-jersey_detection_results.jsonl}"
# llama-swap configuration file
LLAMA_SWAP_CONFIG="${LLAMA_SWAP_CONFIG:-llama-swap-config.yaml}"
# Server URL
SERVER_URL="${SERVER_URL:-http://localhost:8080}"
# ==============================================================================
# Color codes for output
# ==============================================================================
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
NC='\033[0m' # No Color
# ==============================================================================
# Helper Functions
# ==============================================================================
print_header() {
echo -e "${CYAN}============================================================================${NC}"
echo -e "${CYAN}$1${NC}"
echo -e "${CYAN}============================================================================${NC}"
}
print_info() {
echo -e "${BLUE}[INFO]${NC} $1"
}
print_success() {
echo -e "${GREEN}[SUCCESS]${NC} $1"
}
print_error() {
echo -e "${RED}[ERROR]${NC} $1"
}
print_warning() {
echo -e "${YELLOW}[WARNING]${NC} $1"
}
# ==============================================================================
# Validation
# ==============================================================================
print_header "Jersey Detection - Test All Models"
# Check if images directory exists
if [ ! -d "$IMAGES_DIR" ]; then
print_error "Image directory not found: $IMAGES_DIR"
echo "Usage: $0 <image_directory>"
exit 1
fi
# Check if prompt file exists
if [ ! -f "$PROMPT_FILE" ]; then
print_error "Prompt file not found: $PROMPT_FILE"
exit 1
fi
# Check if llama-swap config exists
if [ ! -f "$LLAMA_SWAP_CONFIG" ]; then
print_error "llama-swap config not found: $LLAMA_SWAP_CONFIG"
exit 1
fi
# Check if test script exists
if [ ! -f "test_jersey_detection.py" ]; then
print_error "test_jersey_detection.py not found in current directory"
exit 1
fi
# Check if server is running
print_info "Checking if llama-swap server is running at $SERVER_URL..."
if ! curl -s "$SERVER_URL/health" > /dev/null 2>&1; then
print_error "Cannot connect to llama-swap at $SERVER_URL"
echo ""
echo "Please start llama-swap first:"
echo " llama-swap --config $LLAMA_SWAP_CONFIG --listen localhost:8080"
echo ""
exit 1
fi
print_success "Server is running"
# ==============================================================================
# Extract model tags from YAML
# ==============================================================================
print_info "Extracting model tags from $LLAMA_SWAP_CONFIG..."
# Extract model IDs (keys under 'models:')
# This uses grep and sed to parse the YAML (simple parser, works for our format)
MODEL_TAGS=$(grep "^  [a-z]" "$LLAMA_SWAP_CONFIG" | \
    grep -v "^    " | \
    sed 's/:.*//' | \
    sed 's/^  //')
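# With the llama-swap-config.yaml in this repo, the extracted tags come out one
# per line, e.g.:
#   lfm2-vl-1.6b
#   gemma-3-4b
#   kimi-vl-3b
#   qwen2.5-vl-7b
#   ...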
if [ -z "$MODEL_TAGS" ]; then
print_error "No model tags found in $LLAMA_SWAP_CONFIG"
exit 1
fi
# Convert to array
readarray -t MODELS <<< "$MODEL_TAGS"
MODEL_COUNT=${#MODELS[@]}
print_success "Found $MODEL_COUNT models to test"
# ==============================================================================
# Display Configuration
# ==============================================================================
echo ""
print_info "Test Configuration:"
echo " Images directory: $IMAGES_DIR"
echo " Prompt file: $PROMPT_FILE"
echo " Resize: ${RESIZE:-Disabled}"
echo " Output file: $OUTPUT_FILE"
echo " Server URL: $SERVER_URL"
echo " Models to test: $MODEL_COUNT"
echo ""
# List all models
print_info "Models:"
for i in "${!MODELS[@]}"; do
echo " $((i+1)). ${MODELS[$i]}"
done
echo ""
# ==============================================================================
# Confirmation
# ==============================================================================
read -p "Continue with testing? (y/N) " -n 1 -r
echo
if [[ ! $REPLY =~ ^[Yy]$ ]]; then
print_warning "Testing cancelled"
exit 0
fi
# ==============================================================================
# Run Tests
# ==============================================================================
print_header "Starting Tests"
START_TIME=$(date +%s)
SUCCESSFUL=0
FAILED=0
for i in "${!MODELS[@]}"; do
MODEL="${MODELS[$i]}"
MODEL_NUM=$((i+1))
echo ""
print_header "Testing Model $MODEL_NUM/$MODEL_COUNT: $MODEL"
# Build command
CMD="python test_jersey_detection.py \"$IMAGES_DIR\" \"$PROMPT_FILE\""
CMD="$CMD --model-tag \"$MODEL\""
CMD="$CMD --output-file \"$OUTPUT_FILE\""
CMD="$CMD --server-url \"$SERVER_URL\""
# Add resize if configured
if [ -n "$RESIZE" ]; then
CMD="$CMD --resize $RESIZE"
fi
print_info "Running: $CMD"
echo ""
# Run the test
if eval "$CMD"; then
print_success "Model $MODEL completed successfully"
SUCCESSFUL=$((SUCCESSFUL + 1))
else
print_error "Model $MODEL failed"
FAILED=$((FAILED + 1))
# Ask if user wants to continue
echo ""
read -p "Continue with remaining models? (Y/n) " -n 1 -r
echo
if [[ $REPLY =~ ^[Nn]$ ]]; then
print_warning "Testing stopped by user"
break
fi
fi
# Show progress
if [ $MODEL_NUM -lt $MODEL_COUNT ]; then
print_info "Progress: $MODEL_NUM/$MODEL_COUNT models completed"
fi
done
# ==============================================================================
# Summary
# ==============================================================================
END_TIME=$(date +%s)
DURATION=$((END_TIME - START_TIME))
MINUTES=$((DURATION / 60))
SECONDS=$((DURATION % 60))
echo ""
print_header "Testing Complete"
echo ""
print_info "Summary:"
echo " Total models: $MODEL_COUNT"
echo " Successful: $SUCCESSFUL"
echo " Failed: $FAILED"
echo " Total time: ${MINUTES}m ${SECONDS}s"
echo ""
if [ $SUCCESSFUL -gt 0 ]; then
print_success "Results saved to: $OUTPUT_FILE"
echo ""
print_info "Analyze results with:"
echo " python analyze_jersey_results.py $OUTPUT_FILE"
fi
echo ""
# Exit with error code if any tests failed
if [ $FAILED -gt 0 ]; then
exit 1
fi
exit 0

971
test_jersey_detection.py Executable file
View File

@ -0,0 +1,971 @@
#!/usr/bin/env python3
"""
Test script for evaluating jersey detection performance with different models and prompts.
Usage:
python test_jersey_detection.py <image_directory> <prompt_file> [options]
Arguments:
image_directory: Path to directory containing test images
prompt_file: Path to text file containing the prompt to use
--model-name: Name of the model being tested (optional, auto-detected from server if not provided)
--model-tag: Model tag for llama-swap integration (optional)
--server-url: Optional llama.cpp server URL (default: read from scan.ini)
--output-file: Output file for results (default: jersey_detection_results.jsonl)
--resize: Maximum image dimension for resizing before processing
Ground Truth:
Expected jersey numbers are parsed from filenames using dash-separated format:
Example: 1122-8-10-29.jpg expects jerseys 8, 10, and 29
The script calculates precision, recall, F1 score, and confidence calibration metrics
to evaluate model accuracy against known correct results.
Output Files:
<output_file>: Summary statistics with ground truth metrics (default: jersey_detection_results.jsonl)
Example:
# Auto-detect model name from server
python test_jersey_detection.py ./images jersey_prompt.txt
# Resize images to 1024px max dimension before processing
python test_jersey_detection.py ./images jersey_prompt.txt --resize 1024
# Use llama-swap to automatically load a specific model
python test_jersey_detection.py ./images jersey_prompt.txt --model-tag "qwen2.5-vl-7b" --resize 1024
# Specify custom model name (for tracking in results)
python test_jersey_detection.py ./images jersey_prompt.txt --model-name "llama-3.2-vision"
python test_jersey_detection.py ./images jersey_prompt_with_confidence.txt --model-name "qwen2-vl" --resize 1024
After running tests, analyze results with:
python analyze_jersey_results.py # Performance and accuracy analysis
"""
import argparse
import configparser
import json
import os
import re
import requests
import sys
import time
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Any, Optional
import cv2
# Add parent directory to path for imports
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from scan_utils.llama_cpp_client import LlamaCppClient
# Hallucination detection: filter out example numbers from prompts
# Using numbers > 100 as examples to avoid filtering valid jersey numbers
HALLUCINATION_NUMBERS = {'101', '102', '103', '142', '199'}
def parse_expected_jerseys(filename: str) -> List[str]:
"""
Parse expected jersey numbers from filename.
Format: prefix-number1-number2-number3.ext
Example: 1122-8-10-29.jpg -> ['8', '10', '29']
Args:
filename: Image filename
Returns:
List of expected jersey numbers as strings
"""
# Remove extension
name_without_ext = Path(filename).stem
# Split by dash
parts = name_without_ext.split('-')
# First part is typically a prefix/identifier, rest are jersey numbers
# Skip the first part and collect numeric parts
expected = []
for i, part in enumerate(parts[1:], 1): # Skip first part
# Check if part is numeric (jersey number)
if part.isdigit():
expected.append(part)
return expected
def clean_response(text: str) -> str:
"""
Clean the response by removing think tags and markdown code blocks.
Some models use <think> tags for chain-of-thought reasoning and wrap JSON in markdown.
Args:
text: Raw response text
Returns:
Cleaned text ready for JSON parsing
"""
# Remove <think>...</think> tags and their content (standard angle brackets)
cleaned = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL | re.IGNORECASE)
# Remove ◁think▷...◁/think▷ tags (unicode triangle brackets)
cleaned = re.sub(r'◁think▷.*?◁/think▷', '', cleaned, flags=re.DOTALL | re.IGNORECASE)
# Also remove any standalone think tags (both formats)
cleaned = re.sub(r'</?think>', '', cleaned, flags=re.IGNORECASE)
cleaned = re.sub(r'◁/?think▷', '', cleaned, flags=re.IGNORECASE)
# Remove markdown code blocks (```json ... ``` or ``` ... ```)
# First try to extract content from ```json blocks
json_block_match = re.search(r'```(?:json)?\s*\n?(.*?)\n?```', cleaned, flags=re.DOTALL | re.IGNORECASE)
if json_block_match:
# Extract just the content inside the code block
cleaned = json_block_match.group(1)
else:
# If no code block, just remove any stray ``` markers
cleaned = re.sub(r'```(?:json)?', '', cleaned, flags=re.IGNORECASE)
return cleaned.strip()
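# Example (hypothetical raw model outputs and what clean_response recovers):
#   '```json\n{"jerseys": []}\n```'                       -> '{"jerseys": []}'
#   '<think>no readable numbers</think>{"jerseys": []}'   -> '{"jerseys": []}'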
def get_llama_server_url_from_config() -> Optional[str]:
"""
Read the LLAMA_CPP_SERVER_URL from scan.ini.
Returns:
Server URL from config or None if not found
"""
config_path = os.path.join(os.path.dirname(__file__), 'scan.ini')
if not os.path.exists(config_path):
return None
try:
config = configparser.ConfigParser()
config.read(config_path)
if 'DEFAULT' in config and 'LLAMA_CPP_SERVER_URL' in config['DEFAULT']:
return config['DEFAULT']['LLAMA_CPP_SERVER_URL']
except Exception as e:
print(f"Warning: Failed to read scan.ini: {e}")
return None
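# Expected scan.ini format (illustrative; the value becomes the default for --server-url):
#   [DEFAULT]
#   LLAMA_CPP_SERVER_URL = http://192.168.1.34:8080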
class JerseyDetectionTester:
"""Test runner for jersey detection evaluation."""
def __init__(self, server_url: str, prompt: str, model_name: Optional[str] = None, resize_max: Optional[int] = None, model_tag: Optional[str] = None):
"""
Initialize the tester.
Args:
server_url: Base URL for the llama.cpp server
prompt: Prompt text to use for detection
model_name: Name of the model being tested (optional)
resize_max: Maximum image dimension (resize if larger, None = no resize)
model_tag: Model tag for llama-swap integration (optional)
"""
self.client = LlamaCppClient(base_url=server_url)
self.prompt = prompt
self.model_name = model_name or "unknown"
self.resize_max = resize_max
self.model_tag = model_tag
self.results = []
def test_image(self, image_path: str) -> Dict[str, Any]:
"""
Test jersey detection on a single image.
Args:
image_path: Path to the image file
Returns:
Dictionary containing test results for this image
"""
start_time = time.time()
# Load image
image = cv2.imread(image_path)
if image is None:
filename = Path(image_path).name
expected_jerseys = parse_expected_jerseys(filename)
return {
'image_path': image_path,
'error': 'Failed to load image',
'jerseys': [],
'processing_time': 0,
'resized': False,
'original_size': None,
'final_size': None,
'expected_jerseys': expected_jerseys,
'detected_jerseys': [],
'true_positives': [],
'false_positives': [],
'false_negatives': expected_jerseys,
'precision': 0.0,
'recall': 0.0,
'f1_score': 0.0,
'avg_confidence_correct': None,
'avg_confidence_incorrect': None,
'confidence_correct_count': 0,
'confidence_incorrect_count': 0
}
# Track original size
original_height, original_width = image.shape[:2]
original_size = (original_width, original_height)
resized = False
# Resize if needed
if self.resize_max and (original_width > self.resize_max or original_height > self.resize_max):
# Calculate new dimensions maintaining aspect ratio
if original_width > original_height:
new_width = self.resize_max
new_height = int(original_height * (self.resize_max / original_width))
else:
new_height = self.resize_max
new_width = int(original_width * (self.resize_max / original_height))
# Resize image
image = cv2.resize(image, (new_width, new_height), interpolation=cv2.INTER_AREA)
resized = True
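            # e.g. a 4000x3000 image with resize_max=1024 becomes 1024x768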
final_height, final_width = image.shape[:2]
final_size = (final_width, final_height)
# Create multimodal message
message = self.client.create_multimodal_message(
role="user",
content=self.prompt,
images=[image]
)
# Send to LLM
try:
# Prepare kwargs for chat completion
completion_kwargs = {
'messages': [message],
'temperature': 0.1,
'max_tokens': 1000
}
# Add model parameter if model_tag is specified (for llama-swap)
if self.model_tag:
completion_kwargs['model'] = self.model_tag
# Note: We don't print this for every image to avoid spam, but it's being sent
response = self.client.chat_completion(**completion_kwargs)
processing_time = time.time() - start_time
# Extract response text
if 'choices' in response and len(response['choices']) > 0:
response_text = response['choices'][0]['message']['content']
# Clean response (remove think tags and markdown code blocks)
cleaned_text = clean_response(response_text)
# Parse JSON response
try:
result = json.loads(cleaned_text)
jerseys = result.get('jerseys', [])
# Apply hallucination detection
filtered_jerseys = []
hallucinated_count = 0
for jersey in jerseys:
jersey_number = jersey.get('jersey_number', '')
# Check for hallucination (model returning example numbers)
if jersey_number in HALLUCINATION_NUMBERS:
hallucinated_count += 1
continue
filtered_jerseys.append(jersey)
# Ground truth comparison
filename = Path(image_path).name
expected_jerseys = set(parse_expected_jerseys(filename))
detected_jerseys = set(jersey.get('jersey_number', '') for jersey in filtered_jerseys if jersey.get('jersey_number', ''))
# Calculate ground truth metrics
true_positives = expected_jerseys & detected_jerseys # Correctly detected
false_positives = detected_jerseys - expected_jerseys # Detected but not expected
false_negatives = expected_jerseys - detected_jerseys # Expected but not detected
# Calculate precision, recall, F1
tp_count = len(true_positives)
fp_count = len(false_positives)
fn_count = len(false_negatives)
precision = tp_count / (tp_count + fp_count) if (tp_count + fp_count) > 0 else 0.0
recall = tp_count / (tp_count + fn_count) if (tp_count + fn_count) > 0 else 0.0
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
# Handle edge case: if no expected jerseys, precision is 1.0 if no detections, else 0.0
if len(expected_jerseys) == 0:
precision = 1.0 if len(detected_jerseys) == 0 else 0.0
recall = 1.0 # No jerseys to detect
f1_score = 1.0 if len(detected_jerseys) == 0 else 0.0
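                    # Worked example (hypothetical): expected={'8','10','29'}, detected={'8','29','45'}
                    # gives TP=2, FP=1, FN=1 -> precision=2/3, recall=2/3, F1=2/3 (~66.7%)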
# Calculate confidence scores for correct vs incorrect detections
confidence_correct = [] # Confidence for true positives
confidence_incorrect = [] # Confidence for false positives
for jersey in filtered_jerseys:
jersey_number = jersey.get('jersey_number', '')
confidence = jersey.get('confidence')
if confidence is not None:
if jersey_number in true_positives:
confidence_correct.append(confidence)
elif jersey_number in false_positives:
confidence_incorrect.append(confidence)
avg_confidence_correct = sum(confidence_correct) / len(confidence_correct) if confidence_correct else None
avg_confidence_incorrect = sum(confidence_incorrect) / len(confidence_incorrect) if confidence_incorrect else None
return {
'image_path': image_path,
'jerseys': filtered_jerseys,
'hallucinated_count': hallucinated_count,
'raw_response': cleaned_text,
'processing_time': processing_time,
'error': None,
'resized': resized,
'original_size': original_size,
'final_size': final_size,
# Ground truth metrics
'expected_jerseys': sorted(expected_jerseys),
'detected_jerseys': sorted(detected_jerseys),
'true_positives': sorted(true_positives),
'false_positives': sorted(false_positives),
'false_negatives': sorted(false_negatives),
'precision': precision,
'recall': recall,
'f1_score': f1_score,
# Confidence calibration metrics
'avg_confidence_correct': avg_confidence_correct,
'avg_confidence_incorrect': avg_confidence_incorrect,
'confidence_correct_count': len(confidence_correct),
'confidence_incorrect_count': len(confidence_incorrect)
}
except json.JSONDecodeError as e:
filename = Path(image_path).name
expected_jerseys = parse_expected_jerseys(filename)
return {
'image_path': image_path,
'error': f'JSON parse error: {e}',
'raw_response': cleaned_text,
'original_response': response_text if cleaned_text != response_text else None,
'jerseys': [],
'processing_time': processing_time,
'resized': resized,
'original_size': original_size,
'final_size': final_size,
'expected_jerseys': expected_jerseys,
'detected_jerseys': [],
'true_positives': [],
'false_positives': [],
'false_negatives': expected_jerseys,
'precision': 0.0,
'recall': 0.0,
'f1_score': 0.0
}
else:
filename = Path(image_path).name
expected_jerseys = parse_expected_jerseys(filename)
return {
'image_path': image_path,
'error': 'Empty response from model',
'jerseys': [],
'processing_time': processing_time,
'resized': resized,
'original_size': original_size,
'final_size': final_size,
'expected_jerseys': expected_jerseys,
'detected_jerseys': [],
'true_positives': [],
'false_positives': [],
'false_negatives': expected_jerseys,
'precision': 0.0,
'recall': 0.0,
'f1_score': 0.0
}
except Exception as e:
processing_time = time.time() - start_time
filename = Path(image_path).name
expected_jerseys = parse_expected_jerseys(filename)
return {
'image_path': image_path,
'error': f'Request error: {e}',
'jerseys': [],
'processing_time': processing_time,
'resized': resized,
'original_size': original_size,
'final_size': final_size,
'expected_jerseys': expected_jerseys,
'detected_jerseys': [],
'true_positives': [],
'false_positives': [],
'false_negatives': expected_jerseys,
'precision': 0.0,
'recall': 0.0,
'f1_score': 0.0,
'avg_confidence_correct': None,
'avg_confidence_incorrect': None,
'confidence_correct_count': 0,
'confidence_incorrect_count': 0
}
def test_directory(self, directory_path: str) -> List[Dict[str, Any]]:
"""
Test all images in a directory.
Args:
directory_path: Path to directory containing images
Returns:
List of results for all images
"""
# Get all image files
image_extensions = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp'}
image_files = []
for ext in image_extensions:
image_files.extend(Path(directory_path).glob(f'*{ext}'))
image_files.extend(Path(directory_path).glob(f'*{ext.upper()}'))
image_files = sorted(image_files)
if not image_files:
print(f"No image files found in {directory_path}")
return []
print(f"Found {len(image_files)} images to process\n")
# Process each image
results = []
for i, image_path in enumerate(image_files, 1):
# Show model tag in progress if using llama-swap
model_info = f" [{self.model_tag}]" if self.model_tag else ""
print(f"[{i}/{len(image_files)}]{model_info} Processing {image_path.name}...")
result = self.test_image(str(image_path))
results.append(result)
# Display result
self._display_result(result)
print()
return results
def _display_result(self, result: Dict[str, Any]):
"""Display the result for a single image."""
if result.get('error'):
print(f" ❌ Error: {result['error']}")
if 'raw_response' in result:
print(f" Cleaned response: {result['raw_response']}...")
if result.get('original_response'):
print(f" (Think tags and/or markdown were filtered from response)")
else:
jerseys = result.get('jerseys', [])
hallucinated_count = result.get('hallucinated_count', 0)
if jerseys:
print(f" ✓ Found {len(jerseys)} jersey(s):")
for jersey in jerseys:
number = jersey.get('jersey_number', 'N/A')
jersey_color = jersey.get('jersey_color', 'N/A')
number_color = jersey.get('number_color', 'N/A')
confidence = jersey.get('confidence', None)
conf_str = f" (confidence: {confidence})" if confidence is not None else ""
print(f" - #{number}: {jersey_color} jersey, {number_color} number{conf_str}")
else:
print(f" ○ No jerseys detected")
if hallucinated_count > 0:
print(f" ⚠ Filtered {hallucinated_count} hallucinated detection(s)")
# Display ground truth comparison
expected = result.get('expected_jerseys', [])
detected = result.get('detected_jerseys', [])
true_positives = result.get('true_positives', [])
false_positives = result.get('false_positives', [])
false_negatives = result.get('false_negatives', [])
if expected:
print(f" Ground truth: Expected {expected}, Detected {detected}")
if true_positives:
print(f" ✓ Correct: {true_positives}")
if false_positives:
print(f" ✗ False positives: {false_positives}")
if false_negatives:
print(f" ✗ Missed: {false_negatives}")
precision = result.get('precision', 0.0)
recall = result.get('recall', 0.0)
f1 = result.get('f1_score', 0.0)
print(f" Precision: {precision:.2%}, Recall: {recall:.2%}, F1: {f1:.2%}")
print(f" Processing time: {result['processing_time']:.2f}s")
def save_results_to_file(self, results: List[Dict[str, Any]], prompt_file: str, output_file: str = "jersey_detection_results.jsonl"):
"""
Save test results to a JSON Lines file for later analysis.
Args:
results: List of all test results
prompt_file: Path to the prompt file used
output_file: Path to output file (default: jersey_detection_results.jsonl)
"""
# Calculate summary statistics
total_images = len(results)
images_with_errors = sum(1 for r in results if r.get('error'))
images_with_jerseys = sum(1 for r in results if not r.get('error') and len(r.get('jerseys', [])) > 0)
images_without_jerseys = sum(1 for r in results if not r.get('error') and len(r.get('jerseys', [])) == 0)
total_jerseys = sum(len(r.get('jerseys', [])) for r in results if not r.get('error'))
total_hallucinated = sum(r.get('hallucinated_count', 0) for r in results if not r.get('error'))
total_raw_detections = total_jerseys + total_hallucinated
total_processing_time = sum(r.get('processing_time', 0) for r in results)
avg_processing_time = total_processing_time / total_images if total_images > 0 else 0
# Collect confidence statistics if available
confidences = [
jersey.get('confidence')
for r in results if not r.get('error')
for jersey in r.get('jerseys', [])
if 'confidence' in jersey and jersey.get('confidence') is not None
]
confidence_stats = None
if confidences:
buckets = {
'90-100': sum(1 for c in confidences if 90 <= c <= 100),
'70-89': sum(1 for c in confidences if 70 <= c <= 89),
'50-69': sum(1 for c in confidences if 50 <= c <= 69),
'30-49': sum(1 for c in confidences if 30 <= c <= 49),
'0-29': sum(1 for c in confidences if 0 <= c <= 29)
}
confidence_stats = {
'avg': sum(confidences) / len(confidences),
'min': min(confidences),
'max': max(confidences),
'count': len(confidences),
'distribution': buckets
}
# Calculate resize statistics
images_resized = sum(1 for r in results if r.get('resized', False))
# Calculate ground truth statistics
results_without_errors = [r for r in results if not r.get('error')]
total_expected_jerseys = sum(len(r.get('expected_jerseys', [])) for r in results_without_errors)
total_true_positives = sum(len(r.get('true_positives', [])) for r in results_without_errors)
total_false_positives = sum(len(r.get('false_positives', [])) for r in results_without_errors)
total_false_negatives = sum(len(r.get('false_negatives', [])) for r in results_without_errors)
# Calculate overall precision, recall, F1
overall_precision = total_true_positives / (total_true_positives + total_false_positives) if (total_true_positives + total_false_positives) > 0 else 0.0
overall_recall = total_true_positives / (total_true_positives + total_false_negatives) if (total_true_positives + total_false_negatives) > 0 else 0.0
overall_f1 = 2 * (overall_precision * overall_recall) / (overall_precision + overall_recall) if (overall_precision + overall_recall) > 0 else 0.0
# Average per-image metrics
avg_precision = sum(r.get('precision', 0.0) for r in results_without_errors) / len(results_without_errors) if results_without_errors else 0.0
avg_recall = sum(r.get('recall', 0.0) for r in results_without_errors) / len(results_without_errors) if results_without_errors else 0.0
avg_f1 = sum(r.get('f1_score', 0.0) for r in results_without_errors) / len(results_without_errors) if results_without_errors else 0.0
# Calculate confidence calibration metrics (correct vs incorrect detections)
all_confidence_correct = []
all_confidence_incorrect = []
for r in results_without_errors:
if r.get('avg_confidence_correct') is not None:
# Weight by the count of correct detections in this image
count = r.get('confidence_correct_count', 0)
avg_conf = r.get('avg_confidence_correct')
all_confidence_correct.extend([avg_conf] * count)
if r.get('avg_confidence_incorrect') is not None:
# Weight by the count of incorrect detections in this image
count = r.get('confidence_incorrect_count', 0)
avg_conf = r.get('avg_confidence_incorrect')
all_confidence_incorrect.extend([avg_conf] * count)
overall_avg_confidence_correct = sum(all_confidence_correct) / len(all_confidence_correct) if all_confidence_correct else None
overall_avg_confidence_incorrect = sum(all_confidence_incorrect) / len(all_confidence_incorrect) if all_confidence_incorrect else None
# Create summary record
summary_record = {
'timestamp': datetime.now().isoformat(),
'model_name': self.model_name,
'model_tag': self.model_tag,
'prompt_file': prompt_file,
'prompt_length': len(self.prompt),
'total_images': total_images,
'images_with_jerseys': images_with_jerseys,
'images_without_jerseys': images_without_jerseys,
'images_with_errors': images_with_errors,
'total_raw_detections': total_raw_detections,
'total_valid_jerseys': total_jerseys,
'total_hallucinated': total_hallucinated,
'avg_processing_time': avg_processing_time,
'total_processing_time': total_processing_time,
'confidence_stats': confidence_stats,
'empty_response_capable': images_without_jerseys > 0,
'resize_enabled': self.resize_max is not None,
'resize_max': self.resize_max,
'images_resized': images_resized,
# Ground truth statistics
'ground_truth': {
'total_expected': total_expected_jerseys,
'total_true_positives': total_true_positives,
'total_false_positives': total_false_positives,
'total_false_negatives': total_false_negatives,
'overall_precision': overall_precision,
'overall_recall': overall_recall,
'overall_f1': overall_f1,
'avg_precision': avg_precision,
'avg_recall': avg_recall,
'avg_f1': avg_f1,
# Confidence calibration
'avg_confidence_correct': overall_avg_confidence_correct,
'avg_confidence_incorrect': overall_avg_confidence_incorrect,
'confidence_correct_count': len(all_confidence_correct),
'confidence_incorrect_count': len(all_confidence_incorrect)
}
}
# Append to file
try:
with open(output_file, 'a') as f:
f.write(json.dumps(summary_record) + '\n')
print(f"\n✓ Results saved to {output_file}")
except Exception as e:
print(f"\n❌ Failed to save results: {e}")
def print_summary(self, results: List[Dict[str, Any]]):
"""
Print summary statistics for all results.
Args:
results: List of all test results
"""
print("=" * 70)
print("SUMMARY")
print("=" * 70)
print(f"\nModel: {self.model_name}")
if self.model_tag:
print(f"Model tag: {self.model_tag}")
# Display resize info
if self.resize_max:
images_resized = sum(1 for r in results if r.get('resized', False))
print(f"Resize: Enabled (max: {self.resize_max}px, {images_resized} images resized)")
else:
print(f"Resize: Disabled")
total_images = len(results)
images_with_errors = sum(1 for r in results if r.get('error'))
images_with_jerseys = sum(1 for r in results if not r.get('error') and len(r.get('jerseys', [])) > 0)
images_without_jerseys = sum(1 for r in results if not r.get('error') and len(r.get('jerseys', [])) == 0)
total_jerseys = sum(len(r.get('jerseys', [])) for r in results if not r.get('error'))
total_hallucinated = sum(r.get('hallucinated_count', 0) for r in results if not r.get('error'))
total_raw_detections = total_jerseys + total_hallucinated
total_processing_time = sum(r.get('processing_time', 0) for r in results)
avg_processing_time = total_processing_time / total_images if total_images > 0 else 0
print(f"\nTotal images processed: {total_images}")
print(f" - Images with jerseys: {images_with_jerseys} ({images_with_jerseys/total_images*100:.1f}%)")
print(f" - Images without jerseys: {images_without_jerseys} ({images_without_jerseys/total_images*100:.1f}%)")
print(f" - Images with errors: {images_with_errors} ({images_with_errors/total_images*100:.1f}%)")
print(f"\nJersey detections:")
print(f" - Total raw detections: {total_raw_detections}")
print(f" - Valid jerseys (after filtering): {total_jerseys}")
print(f" - Hallucinations filtered out: {total_hallucinated}")
if images_with_jerseys > 0:
print(f" - Average valid jerseys per image (when detected): {total_jerseys/images_with_jerseys:.2f}")
# Empty response capability (important for evaluating model's ability to return empty results)
print(f"\nEmpty response capability:")
print(f" - Empty responses returned: {images_without_jerseys}")
print(f" - Percentage of images: {images_without_jerseys/total_images*100:.1f}%")
print(f" - Model can return empty results: {'✓ Yes' if images_without_jerseys > 0 else '✗ No (potential issue)'}")
if total_hallucinated > 0:
print(f"\nHallucination detection:")
print(f" - Total hallucinated detections filtered: {total_hallucinated}")
images_with_hallucinations = sum(1 for r in results if not r.get('error') and r.get('hallucinated_count', 0) > 0)
print(f" - Images with hallucinations: {images_with_hallucinations} ({images_with_hallucinations/total_images*100:.1f}%)")
# Ground truth statistics
results_without_errors = [r for r in results if not r.get('error')]
total_expected_jerseys = sum(len(r.get('expected_jerseys', [])) for r in results_without_errors)
if total_expected_jerseys > 0:
total_true_positives = sum(len(r.get('true_positives', [])) for r in results_without_errors)
total_false_positives = sum(len(r.get('false_positives', [])) for r in results_without_errors)
total_false_negatives = sum(len(r.get('false_negatives', [])) for r in results_without_errors)
# Calculate overall metrics
overall_precision = total_true_positives / (total_true_positives + total_false_positives) if (total_true_positives + total_false_positives) > 0 else 0.0
overall_recall = total_true_positives / (total_true_positives + total_false_negatives) if (total_true_positives + total_false_negatives) > 0 else 0.0
overall_f1 = 2 * (overall_precision * overall_recall) / (overall_precision + overall_recall) if (overall_precision + overall_recall) > 0 else 0.0
# Calculate average per-image metrics
avg_precision = sum(r.get('precision', 0.0) for r in results_without_errors) / len(results_without_errors) if results_without_errors else 0.0
avg_recall = sum(r.get('recall', 0.0) for r in results_without_errors) / len(results_without_errors) if results_without_errors else 0.0
avg_f1 = sum(r.get('f1_score', 0.0) for r in results_without_errors) / len(results_without_errors) if results_without_errors else 0.0
print(f"\nGround truth performance:")
print(f" - Total expected jerseys: {total_expected_jerseys}")
print(f" - True positives: {total_true_positives}")
print(f" - False positives: {total_false_positives}")
print(f" - False negatives: {total_false_negatives}")
print(f"\n Overall metrics (across all jerseys):")
print(f" - Precision: {overall_precision:.2%}")
print(f" - Recall: {overall_recall:.2%}")
print(f" - F1 Score: {overall_f1:.2%}")
print(f"\n Average per-image metrics:")
print(f" - Avg Precision: {avg_precision:.2%}")
print(f" - Avg Recall: {avg_recall:.2%}")
print(f" - Avg F1 Score: {avg_f1:.2%}")
# Confidence calibration metrics
all_confidence_correct = []
all_confidence_incorrect = []
for r in results_without_errors:
if r.get('avg_confidence_correct') is not None:
count = r.get('confidence_correct_count', 0)
avg_conf = r.get('avg_confidence_correct')
all_confidence_correct.extend([avg_conf] * count)
if r.get('avg_confidence_incorrect') is not None:
count = r.get('confidence_incorrect_count', 0)
avg_conf = r.get('avg_confidence_incorrect')
all_confidence_incorrect.extend([avg_conf] * count)
if all_confidence_correct or all_confidence_incorrect:
print(f"\n Confidence calibration:")
if all_confidence_correct:
avg_conf_correct = sum(all_confidence_correct) / len(all_confidence_correct)
print(f" - Avg confidence (correct detections): {avg_conf_correct:.2f} ({len(all_confidence_correct)} detections)")
else:
print(f" - Avg confidence (correct detections): N/A (no correct detections with confidence)")
if all_confidence_incorrect:
avg_conf_incorrect = sum(all_confidence_incorrect) / len(all_confidence_incorrect)
print(f" - Avg confidence (incorrect detections): {avg_conf_incorrect:.2f} ({len(all_confidence_incorrect)} detections)")
# Show confidence difference
if all_confidence_correct:
avg_conf_correct = sum(all_confidence_correct) / len(all_confidence_correct)
diff = avg_conf_correct - avg_conf_incorrect
if diff > 0:
print(f" - Confidence difference: +{diff:.2f} (correct > incorrect, good calibration)")
else:
print(f" - Confidence difference: {diff:.2f} (⚠ incorrect ≥ correct, poor calibration)")
else:
print(f" - Avg confidence (incorrect detections): N/A (no incorrect detections with confidence)")
print(f"\nProcessing time:")
print(f" - Total: {total_processing_time:.2f}s")
print(f" - Average per image: {avg_processing_time:.2f}s")
# Check for confidence values
has_confidence = any(
any('confidence' in jersey for jersey in r.get('jerseys', []))
for r in results if not r.get('error')
)
if has_confidence:
print(f"\nConfidence statistics:")
confidences = [
jersey.get('confidence')
for r in results if not r.get('error')
for jersey in r.get('jerseys', [])
if 'confidence' in jersey and jersey.get('confidence') is not None
]
if confidences:
avg_confidence = sum(confidences) / len(confidences)
min_confidence = min(confidences)
max_confidence = max(confidences)
print(f" - Total detections with confidence: {len(confidences)}")
print(f" - Average confidence: {avg_confidence:.2f}")
print(f" - Min confidence: {min_confidence:.2f}")
print(f" - Max confidence: {max_confidence:.2f}")
# Confidence distribution by bucket
print(f"\n Confidence distribution:")
buckets = {
'90-100 (Extremely clear)': (90, 100),
'70-89 (Clear, minor issues)': (70, 89),
'50-69 (Partially visible)': (50, 69),
'30-49 (Difficult to read)': (30, 49),
'0-29 (Very uncertain)': (0, 29)
}
for bucket_name, (min_val, max_val) in buckets.items():
count = sum(1 for c in confidences if min_val <= c <= max_val)
percentage = (count / len(confidences) * 100) if len(confidences) > 0 else 0
bar_length = int(percentage / 2) # Scale to max 50 chars
                    bar = '█' * bar_length
print(f" {bucket_name}: {count:3d} ({percentage:5.1f}%) {bar}")
# List errors if any
if images_with_errors > 0:
print(f"\nErrors encountered:")
for r in results:
if r.get('error'):
print(f" - {Path(r['image_path']).name}: {r['error']}")
print()
def main():
"""Main entry point for the test script."""
# Get default server URL from config
default_server_url = get_llama_server_url_from_config() or 'http://192.168.1.34:8080'
parser = argparse.ArgumentParser(
description='Test jersey detection with different models and prompts',
formatter_class=argparse.RawDescriptionHelpFormatter
)
parser.add_argument('image_directory', help='Path to directory containing test images')
parser.add_argument('prompt_file', help='Path to text file containing the prompt')
parser.add_argument('--model-name', default=None,
help='Name of the model being tested (auto-detected from server if not provided)')
parser.add_argument('--server-url', default=default_server_url,
help=f'llama.cpp server URL (default: {default_server_url})')
parser.add_argument('--output-file', default='jersey_detection_results.jsonl',
help='Output file for results (default: jersey_detection_results.jsonl)')
parser.add_argument('--resize', type=int, default=None, metavar='MAX_SIZE',
help='Resize images to maximum dimension (e.g., 1024) before processing')
parser.add_argument('--model-tag', default=None,
help='Model tag for llama-swap (e.g., "qwen2.5-vl-7b"). If not specified, uses whatever model is loaded.')
args = parser.parse_args()
# Validate inputs
if not os.path.isdir(args.image_directory):
print(f"Error: Directory not found: {args.image_directory}")
sys.exit(1)
if not os.path.isfile(args.prompt_file):
print(f"Error: Prompt file not found: {args.prompt_file}")
sys.exit(1)
# Load prompt
try:
with open(args.prompt_file, 'r') as f:
prompt = f.read()
except Exception as e:
print(f"Error reading prompt file: {e}")
sys.exit(1)
# Print test configuration
print("=" * 70)
print("JERSEY DETECTION TEST")
print("=" * 70)
print(f"Model name: {args.model_name if args.model_name else '(auto-detect)'}")
print(f"Model tag: {args.model_tag if args.model_tag else 'None (use loaded model)'}")
print(f"Server URL: {args.server_url}")
print(f"Image directory: {args.image_directory}")
print(f"Prompt file: {args.prompt_file}")
print(f"Prompt length: {len(prompt)} characters")
print(f"Output file: {args.output_file}")
print(f"Resize images: {f'Yes (max: {args.resize}px)' if args.resize else 'No'}")
print("=" * 70)
print()
# Check server health
print("Checking server health...")
try:
client = LlamaCppClient(base_url=args.server_url)
# Try health check (handle both JSON and plain text responses)
try:
health = client.health_check()
print(f"✓ Server is healthy: {health}")
except json.JSONDecodeError:
# llama-swap returns plain text "OK" instead of JSON
response = requests.get(f"{args.server_url}/health")
response.raise_for_status()
print(f"✓ Server is healthy: {response.text}")
# Determine model name to use
model_name = args.model_name
# If model_tag is provided, use it as the model name (unless user explicitly provided a model_name)
if args.model_tag and not args.model_name:
model_name = args.model_tag
print(f"✓ Using model tag as model name: {model_name}")
elif not model_name:
# Only auto-detect if neither model_tag nor model_name was provided
detected_model_name = None
try:
models = client.get_models()
if 'data' in models and len(models['data']) > 0:
model_id = models['data'][0].get('id', 'unknown')
print(f"✓ Active model: {model_id}")
# Extract just the model filename (without path)
if model_id and model_id != 'unknown':
# Remove path and get base filename
model_filename = os.path.basename(model_id)
# Remove common extensions (.gguf, .bin, etc.)
model_name_no_ext = os.path.splitext(model_filename)[0]
detected_model_name = model_name_no_ext
except:
pass
if detected_model_name:
model_name = detected_model_name
print(f"✓ Using auto-detected model name: {model_name}")
else:
model_name = "unknown"
print(f"⚠ Could not detect model name, using 'unknown'")
else:
# User explicitly provided model_name
print(f"✓ Using provided model name: {model_name}")
except Exception as e:
print(f"❌ Failed to connect to server: {e}")
print(f"Make sure llama.cpp server is running at {args.server_url}")
sys.exit(1)
print()
# Show model tag info if using llama-swap
if args.model_tag:
print(f"Requesting model from llama-swap: {args.model_tag}")
# Check currently running models on llama-swap
try:
running_response = requests.get(f"{args.server_url}/running")
if running_response.status_code == 200:
try:
running_models = running_response.json()
if running_models:
print(f"Currently running models: {running_models}")
except:
pass
except:
pass
print()
# Run tests
tester = JerseyDetectionTester(args.server_url, prompt, model_name, args.resize, args.model_tag)
results = tester.test_directory(args.image_directory)
# Print summary
if results:
tester.print_summary(results)
# Save results to file
tester.save_results_to_file(results, args.prompt_file, args.output_file)
if __name__ == '__main__':
main()