Initial commit: Jersey detection test suite
Test scripts and utilities for evaluating vision-language models on jersey number detection using llama.cpp server.
README.md (new file, 93 lines)
@@ -0,0 +1,93 @@
# Jersey Detection Testing

This project contains test scripts, results, and utilities for evaluating vision-language models on jersey number detection tasks using llama.cpp.

## Directory Structure

```
jersey_test/
├── scan_utils/
│   ├── jersey_detection.py                 # Core detection class using VLM
│   └── llama_cpp_client.py                 # Client for llama.cpp server
├── docs/
│   ├── JERSEY_DETECTION_MODEL_ANALYSIS.md  # Model comparison results
│   └── LLAMA_SWAP_SETUP.md                 # Server setup instructions
├── test_images/                            # Place test images here
├── test_images_output/                     # Output directory for annotated images
├── test_jersey_detection.py                # Main test runner
├── analyze_jersey_results.py               # Results analysis script
├── test_all_models.sh                      # Batch testing shell script
├── jersey_prompt.txt                       # Basic detection prompt
├── jersey_prompt_with_confidence.txt       # Prompt with confidence scoring
└── jersey_detection_results.jsonl          # Historical test results
```

## Prerequisites

- Python 3.10+
- llama.cpp server running with a vision-language model
- Test images with ground truth encoded in filenames

## Test Image Naming Convention

Test images should follow this naming pattern to encode ground truth:

```
prefix-number1-number2-number3.jpg
```

Example: `game1-23-45-7.jpg` contains jerseys with numbers 23, 45, and 7.
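
As a minimal illustration of how the test runner can recover ground truth from such a name (the `parse_ground_truth` helper below is a hypothetical sketch, not part of the suite):

```python
from pathlib import Path

def parse_ground_truth(image_path: str) -> list[int]:
    """Extract expected jersey numbers from a name like game1-23-45-7.jpg."""
    stem = Path(image_path).stem        # "game1-23-45-7"
    parts = stem.split("-")[1:]         # drop the prefix, keep the numbers
    return [int(p) for p in parts if p.isdigit()]

print(parse_ground_truth("game1-23-45-7.jpg"))  # [23, 45, 7]
```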

## Running Tests

### Single Model Test

```bash
python test_jersey_detection.py \
  --images-dir ./test_images \
  --prompt-file jersey_prompt_with_confidence.txt \
  --server-url http://localhost:8080 \
  --resize 1024 \
  --output jersey_detection_results.jsonl
```

### Batch Testing All Models

```bash
./test_all_models.sh
```

Edit the variables at the top of the script to configure:
- `IMAGES_DIR` - test images directory
- `PROMPT_FILE` - prompt file to use
- `SERVER_URL` - llama.cpp/llama-swap server URL
- `LLAMA_SWAP_CONFIG` - path to the llama-swap config for the model list

### Analyzing Results

```bash
python analyze_jersey_results.py jersey_detection_results.jsonl
```

Options:
- `--csv output.csv` - Export results to CSV in addition to the analysis display
- `--csv-only output.csv` - Export to CSV only, skipping the analysis display

## Historical Results

The `jersey_detection_results.jsonl` file contains results from 6 test runs:

| Model | F1 Score | Avg Time/Image | Avg Confidence |
|-------|----------|----------------|----------------|
| qwen2.5-vl-7b | 72.9% | - | - |
| gemma-3-27b | 72.1% | 18.1s | 87.1 |
| Mistral-Small-3.2-24B (Q4) | - | 14.2s | 92.1 |
| Kimi-VL-A3B-Thinking | - | 29.1s | 88.9 |

See `docs/JERSEY_DETECTION_MODEL_ANALYSIS.md` for detailed analysis.

## Key Findings

1. **Top Recommendation**: qwen2.5-vl-7b (72.9% F1 score)
2. **Best Confidence Calibration**: gemma-3-27b
3. **Speed Champion**: gemma-3-4b (7.9s/img, 63.8% F1)
4. A confidence threshold of 85+ is recommended for filtering uncertain detections
analyze_jersey_results.py (new executable file, 663 lines)
@@ -0,0 +1,663 @@
#!/usr/bin/env python3
"""
Analyze jersey detection test results and compare model performance.

Usage:
    python analyze_jersey_results.py [results_file]
    python analyze_jersey_results.py [results_file] --csv output.csv
    python analyze_jersey_results.py [results_file] --csv-only output.csv

Arguments:
    results_file: Path to the results file (default: jersey_detection_results.jsonl)
    --csv: Also export results to CSV file
    --csv-only: Export to CSV only, skip analysis display
"""

import argparse
import csv
import json
import math
import sys
from pathlib import Path
from typing import List, Dict, Any
from datetime import datetime
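

# Each line of the results file is one test-run summary. Judging from the
# sample records in jersey_detection_results.jsonl, a record looks like:
#   {"timestamp": "...", "model_name": "...", "prompt_file": "...",
#    "prompt_length": 2134, "total_images": 198, "images_with_jerseys": 88,
#    "images_without_jerseys": 110, "images_with_errors": 0,
#    "total_raw_detections": 470, "total_valid_jerseys": 235,
#    "total_hallucinated": 235, "avg_processing_time": 4.6,
#    "confidence_stats": {"avg": 84.1, "min": 0, "max": 100, "count": 235,
#                         "distribution": {"90-100": 138, "70-89": 70, ...}},
#    "empty_response_capable": true}
# plus an optional "ground_truth" object with precision/recall/F1 fields.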
def load_results(results_file: str) -> List[Dict[str, Any]]:
    """Load test results from a JSON Lines file."""
    results = []
    try:
        with open(results_file, 'r') as f:
            for line in f:
                line = line.strip()
                if line:
                    results.append(json.loads(line))
        return results
    except FileNotFoundError:
        print(f"Error: Results file not found: {results_file}")
        sys.exit(1)
    except json.JSONDecodeError as e:
        print(f"Error: Invalid JSON in results file: {e}")
        sys.exit(1)


def calculate_confidence_stdev(conf_stats: Dict[str, Any]) -> tuple:
    """
    Calculate standard deviation of confidence scores from distribution.

    Returns:
        Tuple of (stdev, quality_rating)
        quality_rating: "Excellent", "Good", "Fair", "Poor", or "N/A"
    """
    if not conf_stats or 'distribution' not in conf_stats:
        return None, "N/A"

    dist = conf_stats['distribution']

    # Reconstruct approximate confidence values from buckets,
    # using the midpoint of each bucket.
    values = []
    bucket_midpoints = {
        '90-100': 95,
        '70-89': 79.5,
        '50-69': 59.5,
        '30-49': 39.5,
        '0-29': 14.5
    }
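    # For example, {'90-100': 2, '70-89': 1} expands to values [95, 95, 79.5].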

    for bucket, count in dist.items():
        midpoint = bucket_midpoints.get(bucket, 50)
        values.extend([midpoint] * count)

    if len(values) < 2:
        return None, "N/A"

    # Calculate standard deviation
    mean = sum(values) / len(values)
    variance = sum((x - mean) ** 2 for x in values) / len(values)
    stdev = math.sqrt(variance)

    # Assign quality rating based on stdev
    if stdev < 5:
        quality = "Poor"
    elif stdev < 10:
        quality = "Fair"
    elif stdev < 15:
        quality = "Good"
    else:
        quality = "Excel"  # Shortened for table

    return stdev, quality


def print_ascii_comparison_table(results: List[Dict[str, Any]]):
    """Print a detailed ASCII comparison table of all test runs."""
    if not results:
        print("No results to display.")
        return

    print("=" * 280)
    print("DETAILED MODEL COMPARISON TABLE")
    print("=" * 280)
    print()
    print("Confidence Quality: Excellent (>15), Good (10-15), Fair (5-10), Poor (<5)")
    print("Confidence Calibration: Conf✓ = avg confidence on correct detections, Conf✗ = avg confidence on incorrect detections")
    print()

    # Table headers with ground truth and confidence calibration columns
    print("┌" + "─" * 22 + "┬" + "─" * 10 + "┬" + "─" * 8 + "┬" + "─" * 8 + "┬" + "─" * 8 + "┬" + "─" * 10 + "┬" + "─" * 10 + "┬" + "─" * 10 + "┬" + "─" * 8 + "┬" + "─" * 8 + "┬" + "─" * 12 + "┬" + "─" * 10 + "┬" + "─" * 10 + "┬" + "─" * 10 + "┬" + "─" * 10 + "┬" + "─" * 10 + "┬" + "─" * 21 + "┐")
    print("│ {:<20} │ {:^8} │ {:^6} │ {:^6} │ {:^6} │ {:^8} │ {:^8} │ {:^8} │ {:^6} │ {:^6} │ {:^10} │ {:^8} │ {:^8} │ {:^8} │ {:^8} │ {:^8} │ {:^19} │".format(
        "Model", "Prompt", "Images", "Valid", "Hallu", "Empty%", "Hallu%", "AvgTime", "Resize", "Conf?", "Conf Qual", "Prec%", "Recall%", "F1%", "Conf✓", "Conf✗", "Date"
    ))
    print("├" + "─" * 22 + "┼" + "─" * 10 + "┼" + "─" * 8 + "┼" + "─" * 8 + "┼" + "─" * 8 + "┼" + "─" * 10 + "┼" + "─" * 10 + "┼" + "─" * 10 + "┼" + "─" * 8 + "┼" + "─" * 8 + "┼" + "─" * 12 + "┼" + "─" * 10 + "┼" + "─" * 10 + "┼" + "─" * 10 + "┼" + "─" * 10 + "┼" + "─" * 10 + "┼" + "─" * 21 + "┤")

    # Data rows
    for result in results:
        model = result.get('model_name', 'unknown')[:20]
        prompt = Path(result.get('prompt_file', 'unknown')).stem[:8]
        total_images = result.get('total_images', 0)
        valid_jerseys = result.get('total_valid_jerseys', 0)
        hallucinated = result.get('total_hallucinated', 0)
        total_detections = valid_jerseys + hallucinated
        empty_pct = (result.get('images_without_jerseys', 0) / total_images * 100) if total_images > 0 else 0
        hallu_pct = (hallucinated / total_detections * 100) if total_detections > 0 else 0
        avg_time = result.get('avg_processing_time', 0)

        # Calculate confidence quality
        conf_stats = result.get('confidence_stats')
        has_conf = 'Yes' if conf_stats else 'No'
        stdev, quality = calculate_confidence_stdev(conf_stats)

        # Format confidence quality display
        if stdev is not None:
            conf_qual_str = f"{quality} ({stdev:.1f})"
        else:
            conf_qual_str = "N/A"

        # Ground truth metrics
        gt = result.get('ground_truth', {})
        precision = gt.get('overall_precision', 0) * 100
        recall = gt.get('overall_recall', 0) * 100
        f1 = gt.get('overall_f1', 0) * 100

        # Confidence calibration
        conf_correct = gt.get('avg_confidence_correct')
        conf_incorrect = gt.get('avg_confidence_incorrect')
        conf_correct_str = f"{conf_correct:.1f}" if conf_correct is not None else "N/A"
        conf_incorrect_str = f"{conf_incorrect:.1f}" if conf_incorrect is not None else "N/A"

        resize_max = result.get('resize_max')
        resize_str = f"{resize_max}px" if resize_max else "No"
        timestamp = datetime.fromisoformat(result.get('timestamp', '')).strftime('%Y-%m-%d %H:%M')

        print("│ {:<20} │ {:>8} │ {:>6} │ {:>6} │ {:>6} │ {:>7.1f}% │ {:>7.1f}% │ {:>7.2f}s │ {:>6} │ {:>6} │ {:>10} │ {:>7.1f}% │ {:>7.1f}% │ {:>7.1f}% │ {:>8} │ {:>8} │ {:>19} │".format(
            model, prompt, total_images, valid_jerseys, hallucinated, empty_pct, hallu_pct, avg_time, resize_str, has_conf, conf_qual_str, precision, recall, f1, conf_correct_str, conf_incorrect_str, timestamp
        ))

    # Bottom border
    print("└" + "─" * 22 + "┴" + "─" * 10 + "┴" + "─" * 8 + "┴" + "─" * 8 + "┴" + "─" * 8 + "┴" + "─" * 10 + "┴" + "─" * 10 + "┴" + "─" * 10 + "┴" + "─" * 8 + "┴" + "─" * 8 + "┴" + "─" * 12 + "┴" + "─" * 10 + "┴" + "─" * 10 + "┴" + "─" * 10 + "┴" + "─" * 10 + "┴" + "─" * 10 + "┴" + "─" * 21 + "┘")

    print()


def print_comparison_table(results: List[Dict[str, Any]]):
    """Print a simple comparison table of all test runs."""
    if not results:
        print("No results to display.")
        return

    print("=" * 140)
    print("MODEL COMPARISON TABLE")
    print("=" * 140)
    print()

    # Header
    header = f"{'Model':<25} {'Prompt':<30} {'Images':<8} {'Valid':<8} {'Hallu':<8} {'Empty%':<9} {'AvgTime':<9} {'Resize':<8} {'Conf?':<7} {'Date':<20}"
    print(header)
    print("-" * 150)

    # Data rows
    for result in results:
        model = result.get('model_name', 'unknown')[:24]
        prompt = Path(result.get('prompt_file', 'unknown')).stem[:29]
        total_images = result.get('total_images', 0)
        valid_jerseys = result.get('total_valid_jerseys', 0)
        hallucinated = result.get('total_hallucinated', 0)
        empty_pct = (result.get('images_without_jerseys', 0) / total_images * 100) if total_images > 0 else 0
        avg_time = result.get('avg_processing_time', 0)
        has_conf = 'Yes' if result.get('confidence_stats') else 'No'
        resize_max = result.get('resize_max')
        resize_str = f"{resize_max}px" if resize_max else "No"
        timestamp = datetime.fromisoformat(result.get('timestamp', '')).strftime('%Y-%m-%d %H:%M:%S')

        row = f"{model:<25} {prompt:<30} {total_images:<8} {valid_jerseys:<8} {hallucinated:<8} {empty_pct:<8.1f}% {avg_time:<8.2f}s {resize_str:<8} {has_conf:<7} {timestamp:<20}"
        print(row)

    print()


def print_model_performance_chart(results: List[Dict[str, Any]]):
    """Print a performance chart showing key metrics for each model."""
    if not results:
        return

    print("=" * 140)
    print("MODEL PERFORMANCE CHART")
    print("=" * 140)
    print()

    # Group results by model
    models = {}
    for result in results:
        model_name = result.get('model_name', 'unknown')
        if model_name not in models:
            models[model_name] = []
        models[model_name].append(result)

    # Calculate aggregate statistics for each model
    for model_name, model_results in models.items():
        print(f"\n{model_name}")
        print("-" * 100)

        total_runs = len(model_results)
        total_images = sum(r.get('total_images', 0) for r in model_results)
        total_valid = sum(r.get('total_valid_jerseys', 0) for r in model_results)
        total_hallu = sum(r.get('total_hallucinated', 0) for r in model_results)
        avg_empty_pct = sum((r.get('images_without_jerseys', 0) / r.get('total_images', 1) * 100) for r in model_results) / total_runs if total_runs > 0 else 0
        avg_time = sum(r.get('avg_processing_time', 0) for r in model_results) / total_runs if total_runs > 0 else 0

        # Check if any runs have confidence stats
        has_confidence = any(r.get('confidence_stats') for r in model_results)

        # Check resize status
        resize_enabled = any(r.get('resize_enabled', False) for r in model_results)
        resize_max_values = [r.get('resize_max') for r in model_results if r.get('resize_max')]
        resize_info = f"{resize_max_values[0]}px" if resize_max_values else "Disabled"

        print(f"  Total test runs: {total_runs}")
        print(f"  Total images processed: {total_images}")
        print(f"  Total valid detections: {total_valid}")
        print(f"  Total hallucinations: {total_hallu}")
        print(f"  Average empty response rate: {avg_empty_pct:.1f}%")
        print(f"  Average processing time: {avg_time:.2f}s/image")
        print(f"  Resize: {resize_info}")
        print(f"  Confidence support: {'Yes' if has_confidence else 'No'}")

        # Show hallucination rate
        if total_valid + total_hallu > 0:
            hallu_rate = (total_hallu / (total_valid + total_hallu) * 100)
            print(f"  Hallucination rate: {hallu_rate:.1f}%")

            # Visual bar
            bar_length = int(hallu_rate / 2)  # Scale to max 50 chars
            bar = '█' * bar_length
            print(f"  Hallucination chart: {bar} ({hallu_rate:.1f}%)")

        # Ground truth performance
        gt_runs = [r for r in model_results if r.get('ground_truth')]
        if gt_runs:
            avg_precision = sum(r['ground_truth'].get('overall_precision', 0) for r in gt_runs) / len(gt_runs)
            avg_recall = sum(r['ground_truth'].get('overall_recall', 0) for r in gt_runs) / len(gt_runs)
            avg_f1 = sum(r['ground_truth'].get('overall_f1', 0) for r in gt_runs) / len(gt_runs)
            total_expected = sum(r['ground_truth'].get('total_expected', 0) for r in gt_runs)
            total_tp = sum(r['ground_truth'].get('total_true_positives', 0) for r in gt_runs)
            total_fp = sum(r['ground_truth'].get('total_false_positives', 0) for r in gt_runs)
            total_fn = sum(r['ground_truth'].get('total_false_negatives', 0) for r in gt_runs)

            print(f"\n  Ground truth performance:")
            print(f"    Total expected jerseys: {total_expected}")
            print(f"    True positives: {total_tp}")
            print(f"    False positives: {total_fp}")
            print(f"    False negatives: {total_fn}")
            print(f"    Average Precision: {avg_precision:.1%}")
            print(f"    Average Recall: {avg_recall:.1%}")
            print(f"    Average F1 Score: {avg_f1:.1%}")

            # Visual F1 bar
            bar_length = int(avg_f1 * 50)  # Scale to max 50 chars
            bar = '█' * bar_length
            print(f"    F1 Score chart: {bar} ({avg_f1:.1%})")

            # Confidence calibration
            conf_correct_vals = [r['ground_truth'].get('avg_confidence_correct') for r in gt_runs if r['ground_truth'].get('avg_confidence_correct') is not None]
            conf_incorrect_vals = [r['ground_truth'].get('avg_confidence_incorrect') for r in gt_runs if r['ground_truth'].get('avg_confidence_incorrect') is not None]

            if conf_correct_vals or conf_incorrect_vals:
                print(f"\n  Confidence calibration:")
                if conf_correct_vals:
                    avg_conf_correct = sum(conf_correct_vals) / len(conf_correct_vals)
                    print(f"    Avg confidence (correct detections): {avg_conf_correct:.2f}")
                if conf_incorrect_vals:
                    avg_conf_incorrect = sum(conf_incorrect_vals) / len(conf_incorrect_vals)
                    print(f"    Avg confidence (incorrect detections): {avg_conf_incorrect:.2f}")
                if conf_correct_vals and conf_incorrect_vals:
                    diff = sum(conf_correct_vals) / len(conf_correct_vals) - sum(conf_incorrect_vals) / len(conf_incorrect_vals)
                    if diff > 0:
                        print(f"    Confidence difference: +{diff:.2f} (good calibration)")
                    else:
                        print(f"    Confidence difference: {diff:.2f} (⚠ poor calibration)")

        # Confidence distribution if available
        if has_confidence:
            print(f"\n  Confidence distribution (across all runs):")
            all_dist = {'90-100': 0, '70-89': 0, '50-69': 0, '30-49': 0, '0-29': 0}
            total_conf_count = 0

            for result in model_results:
                conf_stats = result.get('confidence_stats')
                if conf_stats and 'distribution' in conf_stats:
                    for bucket, count in conf_stats['distribution'].items():
                        all_dist[bucket] += count
                        total_conf_count += count

            if total_conf_count > 0:
                for bucket, count in all_dist.items():
                    pct = (count / total_conf_count * 100) if total_conf_count > 0 else 0
                    bar_length = int(pct / 2)
                    bar = '█' * bar_length
                    print(f"    {bucket}: {count:4d} ({pct:5.1f}%) {bar}")

    print()


def print_best_performers(results: List[Dict[str, Any]]):
    """Print summary of best performing models."""
    if not results:
        return

    print("=" * 140)
    print("BEST PERFORMERS")
    print("=" * 140)
    print()

    # Group by model and calculate averages
    models = {}
    for result in results:
        model_name = result.get('model_name', 'unknown')
        if model_name not in models:
            models[model_name] = {
                'runs': 0,
                'total_hallu': 0,
                'total_detections': 0,
                'avg_time': [],
                'empty_capable': []
            }

        models[model_name]['runs'] += 1
        models[model_name]['total_hallu'] += result.get('total_hallucinated', 0)
        models[model_name]['total_detections'] += result.get('total_valid_jerseys', 0) + result.get('total_hallucinated', 0)
        models[model_name]['avg_time'].append(result.get('avg_processing_time', 0))
        models[model_name]['empty_capable'].append(result.get('empty_response_capable', False))

    # Calculate scores
    model_scores = []
    for model_name, stats in models.items():
        hallu_rate = (stats['total_hallu'] / stats['total_detections'] * 100) if stats['total_detections'] > 0 else 0
        avg_time = sum(stats['avg_time']) / len(stats['avg_time']) if stats['avg_time'] else 0
        empty_capable = any(stats['empty_capable'])

        model_scores.append({
            'model': model_name,
            'hallu_rate': hallu_rate,
            'avg_time': avg_time,
            'empty_capable': empty_capable,
            'runs': stats['runs']
        })

    # Sort by hallucination rate (lower is better)
    model_scores.sort(key=lambda x: x['hallu_rate'])

    print("Lowest hallucination rate:")
    for i, score in enumerate(model_scores[:3], 1):
        capable = "✓" if score['empty_capable'] else "✗"
        print(f"  {i}. {score['model']}: {score['hallu_rate']:.1f}% (empty capable: {capable}, avg time: {score['avg_time']:.2f}s)")

    print()

    # Sort by speed (lower is better)
    model_scores.sort(key=lambda x: x['avg_time'])

    print("Fastest processing:")
    for i, score in enumerate(model_scores[:3], 1):
        capable = "✓" if score['empty_capable'] else "✗"
        print(f"  {i}. {score['model']}: {score['avg_time']:.2f}s/image (hallu rate: {score['hallu_rate']:.1f}%, empty capable: {capable})")

    print()

    # Models with empty response capability
    empty_models = [s for s in model_scores if s['empty_capable']]
    print(f"Models with empty response capability: {len(empty_models)}/{len(model_scores)}")
    for score in empty_models:
        print(f"  - {score['model']}")

    print()

    # Best F1 scores (ground truth accuracy)
    models_with_gt = {}
    for result in results:
        if result.get('ground_truth'):
            model_name = result.get('model_name', 'unknown')
            if model_name not in models_with_gt:
                models_with_gt[model_name] = {
                    'f1_scores': [],
                    'precision_scores': [],
                    'recall_scores': []
                }
            gt = result['ground_truth']
            models_with_gt[model_name]['f1_scores'].append(gt.get('overall_f1', 0))
            models_with_gt[model_name]['precision_scores'].append(gt.get('overall_precision', 0))
            models_with_gt[model_name]['recall_scores'].append(gt.get('overall_recall', 0))

    if models_with_gt:
        gt_scores = []
        for model_name, stats in models_with_gt.items():
            avg_f1 = sum(stats['f1_scores']) / len(stats['f1_scores']) if stats['f1_scores'] else 0
            avg_precision = sum(stats['precision_scores']) / len(stats['precision_scores']) if stats['precision_scores'] else 0
            avg_recall = sum(stats['recall_scores']) / len(stats['recall_scores']) if stats['recall_scores'] else 0
            gt_scores.append({
                'model': model_name,
                'avg_f1': avg_f1,
                'avg_precision': avg_precision,
                'avg_recall': avg_recall
            })

        # Sort by F1 score (higher is better)
        gt_scores.sort(key=lambda x: x['avg_f1'], reverse=True)

        print("Highest ground truth F1 scores:")
        for i, score in enumerate(gt_scores[:3], 1):
            print(f"  {i}. {score['model']}: F1={score['avg_f1']:.1%} (Precision={score['avg_precision']:.1%}, Recall={score['avg_recall']:.1%})")

        print()


def export_to_csv(results: List[Dict[str, Any]], csv_file: str):
    """Export results to CSV file for spreadsheet import."""
    if not results:
        print("No results to export.")
        return

    try:
        with open(csv_file, 'w', newline='') as f:
            # Define CSV columns
            fieldnames = [
                'timestamp',
                'model_name',
                'model_tag',
                'prompt_file',
                'prompt_length',
                'total_images',
                'images_with_jerseys',
                'images_without_jerseys',
                'images_with_errors',
                'total_raw_detections',
                'total_valid_jerseys',
                'total_hallucinated',
                'hallucination_rate_pct',
                'empty_response_rate_pct',
                'avg_processing_time',
                'total_processing_time',
                'resize_enabled',
                'resize_max',
                'images_resized',
                'has_confidence',
                'confidence_avg',
                'confidence_min',
                'confidence_max',
                'confidence_count',
                'confidence_stdev',
                'confidence_quality',
                'conf_90_100',
                'conf_70_89',
                'conf_50_69',
                'conf_30_49',
                'conf_0_29',
                # Ground truth columns
                'gt_total_expected',
                'gt_total_true_positives',
                'gt_total_false_positives',
                'gt_total_false_negatives',
                'gt_overall_precision',
                'gt_overall_recall',
                'gt_overall_f1',
                'gt_avg_precision',
                'gt_avg_recall',
                'gt_avg_f1',
                # Confidence calibration
                'gt_avg_confidence_correct',
                'gt_avg_confidence_incorrect',
                'gt_confidence_correct_count',
                'gt_confidence_incorrect_count'
            ]

            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()

            # Write data rows
            for result in results:
                # Calculate derived values
                total_images = result.get('total_images', 0)
                valid_jerseys = result.get('total_valid_jerseys', 0)
                hallucinated = result.get('total_hallucinated', 0)
                total_detections = valid_jerseys + hallucinated
                hallu_rate = (hallucinated / total_detections * 100) if total_detections > 0 else 0
                empty_rate = (result.get('images_without_jerseys', 0) / total_images * 100) if total_images > 0 else 0

                # Extract confidence stats
                conf_stats = result.get('confidence_stats')
                has_confidence = conf_stats is not None
                conf_avg = conf_stats.get('avg', '') if conf_stats else ''
                conf_min = conf_stats.get('min', '') if conf_stats else ''
                conf_max = conf_stats.get('max', '') if conf_stats else ''
                conf_count = conf_stats.get('count', '') if conf_stats else ''

                # Calculate confidence standard deviation and quality
                conf_stdev, conf_quality = calculate_confidence_stdev(conf_stats)

                # Extract confidence distribution
                conf_dist = conf_stats.get('distribution', {}) if conf_stats else {}
                conf_90_100 = conf_dist.get('90-100', '')
                conf_70_89 = conf_dist.get('70-89', '')
                conf_50_69 = conf_dist.get('50-69', '')
                conf_30_49 = conf_dist.get('30-49', '')
                conf_0_29 = conf_dist.get('0-29', '')

                # Extract ground truth stats
                gt = result.get('ground_truth', {})
                gt_total_expected = gt.get('total_expected', '')
                gt_total_tp = gt.get('total_true_positives', '')
                gt_total_fp = gt.get('total_false_positives', '')
                gt_total_fn = gt.get('total_false_negatives', '')
                gt_overall_precision = gt.get('overall_precision', '')
                gt_overall_recall = gt.get('overall_recall', '')
                gt_overall_f1 = gt.get('overall_f1', '')
                gt_avg_precision = gt.get('avg_precision', '')
                gt_avg_recall = gt.get('avg_recall', '')
                gt_avg_f1 = gt.get('avg_f1', '')
                gt_avg_conf_correct = gt.get('avg_confidence_correct', '')
                gt_avg_conf_incorrect = gt.get('avg_confidence_incorrect', '')
                gt_conf_correct_count = gt.get('confidence_correct_count', '')
                gt_conf_incorrect_count = gt.get('confidence_incorrect_count', '')

                row = {
                    'timestamp': result.get('timestamp', ''),
                    'model_name': result.get('model_name', ''),
                    'model_tag': result.get('model_tag', ''),
                    'prompt_file': result.get('prompt_file', ''),
                    'prompt_length': result.get('prompt_length', ''),
                    'total_images': total_images,
                    'images_with_jerseys': result.get('images_with_jerseys', ''),
                    'images_without_jerseys': result.get('images_without_jerseys', ''),
                    'images_with_errors': result.get('images_with_errors', ''),
                    'total_raw_detections': result.get('total_raw_detections', ''),
                    'total_valid_jerseys': valid_jerseys,
                    'total_hallucinated': hallucinated,
                    'hallucination_rate_pct': f"{hallu_rate:.2f}",
                    'empty_response_rate_pct': f"{empty_rate:.2f}",
                    'avg_processing_time': f"{result.get('avg_processing_time', 0):.4f}",
                    'total_processing_time': f"{result.get('total_processing_time', 0):.2f}",
                    'resize_enabled': result.get('resize_enabled', False),
                    'resize_max': result.get('resize_max', ''),
                    'images_resized': result.get('images_resized', ''),
                    'has_confidence': has_confidence,
                    'confidence_avg': f"{conf_avg:.2f}" if conf_avg != '' else '',
                    'confidence_min': conf_min,
                    'confidence_max': conf_max,
                    'confidence_count': conf_count,
                    'confidence_stdev': f"{conf_stdev:.2f}" if conf_stdev is not None else '',
                    'confidence_quality': conf_quality if conf_quality != 'N/A' else '',
                    'conf_90_100': conf_90_100,
                    'conf_70_89': conf_70_89,
                    'conf_50_69': conf_50_69,
                    'conf_30_49': conf_30_49,
                    'conf_0_29': conf_0_29,
                    # Ground truth data
                    'gt_total_expected': gt_total_expected,
                    'gt_total_true_positives': gt_total_tp,
                    'gt_total_false_positives': gt_total_fp,
                    'gt_total_false_negatives': gt_total_fn,
                    'gt_overall_precision': f"{gt_overall_precision:.4f}" if gt_overall_precision != '' else '',
                    'gt_overall_recall': f"{gt_overall_recall:.4f}" if gt_overall_recall != '' else '',
                    'gt_overall_f1': f"{gt_overall_f1:.4f}" if gt_overall_f1 != '' else '',
                    'gt_avg_precision': f"{gt_avg_precision:.4f}" if gt_avg_precision != '' else '',
                    'gt_avg_recall': f"{gt_avg_recall:.4f}" if gt_avg_recall != '' else '',
                    'gt_avg_f1': f"{gt_avg_f1:.4f}" if gt_avg_f1 != '' else '',
                    'gt_avg_confidence_correct': f"{gt_avg_conf_correct:.2f}" if gt_avg_conf_correct != '' else '',
                    'gt_avg_confidence_incorrect': f"{gt_avg_conf_incorrect:.2f}" if gt_avg_conf_incorrect != '' else '',
                    'gt_confidence_correct_count': gt_conf_correct_count,
                    'gt_confidence_incorrect_count': gt_conf_incorrect_count
                }

                writer.writerow(row)

        print(f"✓ Results exported to CSV: {csv_file}")
        print(f"  Rows: {len(results)}")
        print(f"  Columns: {len(fieldnames)}")

    except Exception as e:
        print(f"❌ Failed to export to CSV: {e}")
        sys.exit(1)


def main():
    """Main entry point for the analysis script."""
    parser = argparse.ArgumentParser(
        description='Analyze jersey detection test results',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Show analysis
  python analyze_jersey_results.py

  # Show analysis and export to CSV
  python analyze_jersey_results.py --csv results.csv

  # Export to CSV only (no analysis display)
  python analyze_jersey_results.py --csv-only results.csv

  # Analyze a custom results file
  python analyze_jersey_results.py custom_results.jsonl --csv custom.csv
"""
    )
    parser.add_argument('results_file', nargs='?', default='jersey_detection_results.jsonl',
                        help='Path to results file (default: jersey_detection_results.jsonl)')
    parser.add_argument('--csv', metavar='FILE', dest='csv_file',
                        help='Export results to CSV file (in addition to showing analysis)')
    parser.add_argument('--csv-only', metavar='FILE', dest='csv_only',
                        help='Export to CSV file only, skip analysis display')

    args = parser.parse_args()

    # Check if file exists
    if not Path(args.results_file).exists():
        print(f"Error: Results file not found: {args.results_file}")
        print("Run some tests first with test_jersey_detection.py to generate results.")
        sys.exit(1)

    # Load results
    results = load_results(args.results_file)

    if not results:
        print(f"No results found in {args.results_file}")
        sys.exit(0)

    print(f"Loaded {len(results)} test run(s) from {args.results_file}\n")

    # Handle CSV-only mode
    if args.csv_only:
        export_to_csv(results, args.csv_only)
        return

    # Print analyses (unless CSV-only mode)
    print_ascii_comparison_table(results)
    print_model_performance_chart(results)
    print_best_performers(results)

    # Export to CSV if requested
    if args.csv_file:
        print()
        export_to_csv(results, args.csv_file)


if __name__ == '__main__':
    main()
docs/JERSEY_DETECTION_MODEL_ANALYSIS.md (new file, 296 lines)
@@ -0,0 +1,296 @@
# Jersey Detection Model Analysis Report

**Date:** October 22, 2025
**Models Tested:** 8 vision-language models
**Test Images:** 194 images with known jersey numbers
**Purpose:** Determine the best model for automated jersey number detection in sports photography

---

## Executive Summary

After comprehensive testing of 8 different AI models on 194 sports images with known jersey numbers, we recommend **qwen2.5-vl-7b** as the best overall model for jersey detection, with **gemma-3-27b** as a close second choice depending on specific needs.

### Key Findings:

1. **Best Overall Performance**: qwen2.5-vl-7b achieves the highest accuracy (72.9% F1 score)
2. **Confidence Scores Are Useful**: 7 out of 8 models show reliable confidence calibration, meaning higher confidence scores correlate with correct detections
3. **Speed vs Accuracy Trade-off**: The most accurate models take 13-21 seconds per image; faster models sacrifice significant accuracy

---

## Model Performance Comparison

### Top 3 Recommended Models

| Rank | Model | Accuracy (F1) | Speed | Correct Detections | False Alarms | Confidence Reliability |
|------|-------|---------------|-------|--------------------|--------------|------------------------|
| 🥇 1 | qwen2.5-vl-7b | 72.9% | 13.4s | 328 / 436 (75%) | 136 | Good |
| 🥈 2 | gemma-3-27b | 72.1% | 20.9s | 343 / 462 (74%) | 147 | Very Good (+6.0) |
| 🥉 3 | gemma-3-12b | 69.8% | 18.9s | 322 / 462 (70%) | 139 | Good (+3.1) |

### Complete Results Table

| Model | Accuracy (F1 Score) | Correct Detections | False Alarms | Missed Jerseys | Speed (sec/image) | Confidence Calibration |
|-------|---------------------|--------------------|--------------|----------------|-------------------|------------------------|
| **qwen2.5-vl-7b** | **72.9%** ⭐ | 328 / 436 | 136 | 108 | 13.4 | +0.5 (Good) |
| **gemma-3-27b** | **72.1%** | 343 / 462 | 147 | 119 | 20.9 | +6.0 (Very Good) |
| **gemma-3-12b** | 69.8% | 322 / 462 | 139 | 140 | 18.9 | +3.1 (Good) |
| mistral-small-24b-q4 | 67.6% | 328 / 462 | 180 | 134 | 15.1 | +2.4 (Good) |
| mistral-small-24b-q8 | 67.2% | 330 / 462 | 190 | 132 | 22.6 | +3.1 (Good) |
| gemma-3-4b | 63.8% | 277 / 462 | 130 | 185 | 7.9 ⚡ | +6.2 (Very Good) |
| lfm2-vl-1.6b | 50.5% | 171 / 448 | 58 | 277 | 4.6 ⚡⚡ | +11.9 (Excellent) |
| kimi-vl-3b | 2.0% ❌ | 5 / 416 | 67 | 411 | 40.0 🐌 | -1.3 (Poor) |

---

## Understanding the Metrics

### What the Numbers Mean:

- **Accuracy (F1 Score)**: Overall effectiveness balancing correct detections and false alarms
  - 70%+ = Excellent for production use
  - 60-70% = Good for assisted workflows
  - Below 60% = Not recommended

- **Correct Detections**: Out of all jerseys that should have been found, how many were actually detected
  - Example: "328 / 436" means the model found 328 jerseys out of 436 that were actually in the images

- **False Alarms**: Jersey numbers detected that weren't actually in the image
  - Lower is better - these are incorrect detections
  - Can be filtered using confidence scores

- **Missed Jerseys**: Jersey numbers that were in the image but not detected
  - Lower is better - these are missed opportunities

- **Speed**: Average seconds to process one image
  - ⚡⚡ = Very fast (< 8s)
  - ⚡ = Fast (8-15s)
  - Standard = 15-25s
  - 🐌 = Slow (> 30s)

- **Confidence Calibration**: The difference between average confidence on correct vs incorrect detections
  - Positive number (e.g., +6.0) = Good calibration - correct detections have higher confidence
  - Negative number = Poor calibration - the confidence scores can't be trusted
  - Higher positive values = Better for filtering with confidence thresholds
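
As a worked check of these definitions, qwen2.5-vl-7b's raw counts from the table above reproduce its reported scores:

```python
tp, fp, fn = 328, 136, 108  # qwen2.5-vl-7b: correct, false alarms, missed

precision = tp / (tp + fp)  # 328 / 464 ≈ 0.707
recall = tp / (tp + fn)     # 328 / 436 ≈ 0.752
f1 = 2 * precision * recall / (precision + recall)

print(f"{precision:.1%} {recall:.1%} {f1:.1%}")  # 70.7% 75.2% 72.9%
```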

---

## Detailed Analysis

### 1. Best Model: qwen2.5-vl-7b

**Why It's the Best:**
- ✅ Highest overall accuracy (72.9%)
- ✅ Best recall - finds 75% of all jerseys
- ✅ Reasonable speed (13.4 seconds per image)
- ✅ Very low hallucination rate (only 1%)
- ⚠️ Confidence scores show little variation, limiting threshold filtering

**Strengths:**
- Finds the most jerseys (highest recall at 75.2%)
- Rarely makes up fake jersey numbers (hallucination rate: 1%)
- Almost always returns results (empty response rate: 2.6%)

**Weaknesses:**
- Generates 136 false positives (30% of detections are incorrect)
- Confidence calibration is minimal (+0.5), making threshold filtering less effective
- All confidence scores are 90-95, showing limited variation

**Best For:**
- Applications where finding all jerseys is critical
- Batch processing where moderate false positives are acceptable
- When combined with manual review of results

### 2. Runner-Up: gemma-3-27b

**Why It's Excellent:**
- ✅ Nearly identical accuracy to the winner (72.1% vs 72.9%)
- ✅ Finds the most total jerseys (343 correct detections)
- ✅ Excellent confidence calibration (+6.0 difference)
- ✅ No hallucinations
- ⚠️ Slower processing (20.9s per image)

**Strengths:**
- Best for confidence-based filtering (6-point difference between correct/incorrect)
- Highest absolute number of correct detections (343)
- More varied confidence scores (54% in 90-100 range, 42% in 70-89 range)

**Weaknesses:**
- 56% slower than qwen2.5-vl-7b
- Similar false positive rate

**Best For:**
- Applications requiring confidence-based filtering
- When processing time is not critical
- Maximizing total correct detections

### 3. Alternative: gemma-3-4b (Speed Champion)

**Why Consider It:**
- ⚡ Fast processing (7.9 seconds per image)
- ✅ Very good confidence calibration (+6.2)
- ✅ Zero hallucinations
- ⚠️ Lower accuracy (63.8%)

**Trade-offs:**
- 41% faster than qwen2.5-vl-7b
- But 12% lower accuracy
- Misses 40% of jerseys (185 false negatives)

**Best For:**
- Real-time or high-volume processing
- Applications where speed is more important than completeness
- Initial rough filtering before manual review

---

## Should You Use Confidence Scores for Filtering?

### Answer: **YES** - Confidence scores are useful for most models

### Evidence from Testing:

**7 out of 8 models show good confidence calibration:**

| Model | Avg Confidence (Correct) | Avg Confidence (Incorrect) | Difference | Reliability |
|-------|--------------------------|----------------------------|------------|-------------|
| lfm2-vl-1.6b | 91.8 | 80.0 | **+11.9** | ⭐⭐⭐ Excellent |
| gemma-3-4b | 85.2 | 79.0 | **+6.2** | ⭐⭐ Very Good |
| gemma-3-27b | 88.2 | 82.2 | **+6.0** | ⭐⭐ Very Good |
| gemma-3-12b | 91.8 | 88.7 | **+3.1** | ⭐ Good |
| mistral-small-24b-q8 | 92.3 | 89.1 | **+3.1** | ⭐ Good |
| mistral-small-24b-q4 | 93.0 | 90.7 | **+2.4** | ⭐ Good |
| qwen2.5-vl-7b | 94.6 | 94.1 | +0.5 | Limited utility |
| kimi-vl-3b | 88.4 | 89.7 | **-1.3** | ❌ Not reliable |

### What This Means:

**For most models**, setting a confidence threshold can significantly reduce false positives:
- A threshold of 85 on gemma-3-27b would keep most correct detections (88.2 avg) while filtering many incorrect ones (82.2 avg)
- A threshold of 85 on gemma-3-4b would be even more effective

**Exception: qwen2.5-vl-7b** has a minimal difference (94.6 vs 94.1), making threshold filtering less useful despite it being the most accurate model.

### Recommended Filtering Strategy:

1. **Use gemma-3-27b with a confidence threshold of 85+** for the best balance of accuracy and filtering (see the sketch after this list)
2. **Use gemma-3-4b with a confidence threshold of 85+** for faster processing with good filtering
3. **Use qwen2.5-vl-7b without filtering** when you need maximum recall and will manually review results
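
To make strategy 1 concrete, here is a minimal sketch of threshold filtering; the per-detection dicts with `number` and `confidence` keys are an assumed shape, not taken from the test scripts:

```python
CONFIDENCE_THRESHOLD = 85  # recommended cutoff for gemma-3-27b and gemma-3-4b

def filter_detections(detections: list[dict], threshold: int = CONFIDENCE_THRESHOLD) -> list[dict]:
    """Keep only detections whose confidence meets the threshold."""
    return [d for d in detections if d.get("confidence", 0) >= threshold]

detections = [
    {"number": 23, "confidence": 92},  # kept
    {"number": 45, "confidence": 78},  # dropped: below threshold, more likely incorrect
]
print(filter_detections(detections))  # [{'number': 23, 'confidence': 92}]
```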

---

## Model-Specific Recommendations

### For Different Use Cases:

#### 🎯 **Highest Accuracy Required**
- **Model:** qwen2.5-vl-7b
- **Expected Results:** Finds 75% of jerseys, 30% false positive rate
- **Processing:** 13.4 seconds per image
- **Setup:** Use raw results, manually review all detections

#### 🎯 **Best Balance of Speed and Accuracy**
- **Model:** gemma-3-12b
- **Expected Results:** Finds 70% of jerseys, reasonable false positive rate
- **Processing:** 18.9 seconds per image
- **Setup:** Apply a confidence threshold of 90+ to reduce false positives

#### 🎯 **Maximum Quality with Confidence Filtering**
- **Model:** gemma-3-27b
- **Expected Results:** Finds 74% of jerseys, filters false positives effectively
- **Processing:** 20.9 seconds per image
- **Setup:** Apply a confidence threshold of 85+ to reduce false positives by ~50%

#### ⚡ **Speed is Critical**
- **Model:** gemma-3-4b
- **Expected Results:** Finds 60% of jerseys quickly
- **Processing:** 7.9 seconds per image
- **Setup:** Apply a confidence threshold of 85+ for quality filtering

#### ❌ **Do Not Use**
- **kimi-vl-3b**: Only 2% accuracy, extremely slow, poor confidence calibration

---

## Implementation Recommendations

### 1. Production Deployment Strategy

**Recommended:** Two-tier approach (a sketch follows this list)
- **Tier 1 (Automatic):** gemma-3-27b with confidence threshold 85+
  - Automatically tag high-confidence detections
  - Expected: ~200 correct detections per 194 images with minimal false positives

- **Tier 2 (Review Queue):** qwen2.5-vl-7b on remaining images
  - Human review of all detections below the confidence threshold
  - Catches jerseys missed by Tier 1
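
As a sketch of how this two-tier flow could be wired together, the `detect` callable below stands in for a call into the test harness (for example via llama-swap's `--model-tag` routing) and is hypothetical:

```python
from typing import Callable, Dict, List, Tuple

Detections = List[Dict]  # e.g. [{"number": 23, "confidence": 92}, ...]

def two_tier_detect(
    image_path: str,
    detect: Callable[[str, str], Detections],  # (image_path, model_tag) -> detections
    threshold: int = 85,
) -> Tuple[Detections, bool]:
    """Return (detections, needs_review) for a single image."""
    # Tier 1: gemma-3-27b, keep only high-confidence detections for auto-tagging
    tier1 = [d for d in detect(image_path, "gemma-3-27b") if d["confidence"] >= threshold]
    if tier1:
        return tier1, False  # auto-tag; no human review needed
    # Tier 2: qwen2.5-vl-7b for maximum recall; flag the result for human review
    return detect(image_path, "qwen2.5-vl-7b"), True
```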

### 2. Confidence Threshold Guidelines

Based on testing data:

| Model | Recommended Threshold | Expected Precision | Expected Recall |
|-------|-----------------------|--------------------|-----------------|
| gemma-3-27b | 85+ | ~85-90% | ~60-65% |
| gemma-3-4b | 85+ | ~80-85% | ~50-55% |
| gemma-3-12b | 90+ | ~80-85% | ~60-65% |
| qwen2.5-vl-7b | Don't filter | 70.7% | 75.2% |

### 3. Performance Optimization

**Processing 1000 images:**
- qwen2.5-vl-7b: ~3.7 hours
- gemma-3-27b: ~5.8 hours
- gemma-3-4b: ~2.2 hours

**Recommendation:** Use gemma-3-4b for an initial pass, then qwen2.5-vl-7b for a second pass on low-confidence results.

---

## Conclusions

### Main Findings:

1. **qwen2.5-vl-7b is the most accurate model** but has limited confidence score utility
2. **gemma-3-27b offers the best combination** of accuracy and confidence-based filtering
3. **Confidence scores are highly valuable** for reducing false positives in most models
4. **Speed vs accuracy trade-offs are significant** - the recommended fast model (gemma-3-4b) scores about 9 F1 points below the best
5. **One model (kimi-vl-3b) is completely unsuitable** for this task

### Strategic Recommendations:

**For most users:** Deploy gemma-3-27b with a confidence threshold of 85+
- Balances accuracy, speed, and filtering capability
- Reduces the manual review burden significantly
- Good confidence calibration enables automated decision-making

**For maximum accuracy:** Deploy qwen2.5-vl-7b without filtering
- Best for finding all possible jerseys
- Requires manual review of results
- Accepts a higher false positive rate

**For high-volume processing:** Deploy gemma-3-4b with a confidence threshold of 85+
- Fast enough for real-time applications
- Good accuracy for the speed
- Effective filtering capability

### Final Verdict:

**Winner: qwen2.5-vl-7b** for pure accuracy
**Best Overall: gemma-3-27b** for practical deployment with confidence filtering
**Best Value: gemma-3-4b** for speed-sensitive applications

---

## Technical Notes

- **Test Dataset:** 194 images with ground truth jersey numbers encoded in filenames
- **Total Expected Jerseys:** 416-462, depending on which images each model processed successfully
- **Evaluation Metrics:** Precision, Recall, F1 Score, Confidence Calibration
- **Hardware:** Testing performed on comparable hardware configurations
- **Prompt:** All models used an identical jersey detection prompt with confidence scores

---

*Report generated from comprehensive testing of 8 vision-language models for jersey number detection in sports photography.*
docs/LLAMA_SWAP_SETUP.md (new file, 237 lines)
@@ -0,0 +1,237 @@
# llama-swap Setup Guide for Jersey Detection Testing

This guide explains how to use [llama-swap](https://github.com/mostlygeek/llama-swap) to automatically switch between different vision-language models when testing jersey detection.

## What is llama-swap?

llama-swap is a model-swapping proxy that sits between your application and llama.cpp servers. It automatically loads and unloads models based on the `model` parameter in API requests, allowing you to test multiple models without manually restarting servers.
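
For example, two back-to-back requests that differ only in the `model` field will trigger a swap. Below is a minimal Python sketch using only the standard library; the endpoint path follows llama.cpp's OpenAI-compatible API, and the prompt and base URL are placeholders:

```python
import json
import urllib.request

def chat(model_tag: str, prompt: str, base_url: str = "http://localhost:8080") -> str:
    """Send an OpenAI-compatible chat request; llama-swap routes on 'model'."""
    payload = json.dumps({
        "model": model_tag,  # llama-swap loads/unloads based on this field
        "messages": [{"role": "user", "content": prompt}],
    }).encode()
    req = urllib.request.Request(
        f"{base_url}/v1/chat/completions",
        data=payload,
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req) as resp:
        return json.load(resp)["choices"][0]["message"]["content"]

# The first call loads qwen2.5-vl-7b; the second swaps in gemma-3-4b automatically.
print(chat("qwen2.5-vl-7b", "Hello"))
print(chat("gemma-3-4b", "Hello"))
```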
## Installation
|
||||||
|
|
||||||
|
### Docker (Recommended)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Pull the CUDA image (or cpu, vulkan, intel depending on your hardware)
|
||||||
|
docker pull ghcr.io/mostlygeek/llama-swap:cuda
|
||||||
|
```
|
||||||
|
|
||||||
|
### Homebrew (macOS/Linux)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
brew tap mostlygeek/llama-swap
|
||||||
|
brew install llama-swap
|
||||||
|
```
|
||||||
|
|
||||||
|
### Pre-built Binaries
|
||||||
|
|
||||||
|
Download from the [releases page](https://github.com/mostlygeek/llama-swap/releases).
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
A configuration file `llama-swap-config.yaml` is provided with 8 pre-configured vision models:
|
||||||
|
|
||||||
|
### Small Models (1-4B parameters)
|
||||||
|
- `lfm2-vl-1.6b` - LiquidAI LFM2-VL 1.6B (F16)
|
||||||
|
- `gemma-3-4b` - Gemma 3 4B Instruct (F16)
|
||||||
|
- `kimi-vl-3b` - Kimi VL A3B Thinking (F16)
|
||||||
|
|
||||||
|
### Medium Models (7-12B parameters)
|
||||||
|
- `qwen2.5-vl-7b` - Qwen2.5-VL 7B Instruct (F16)
|
||||||
|
- `gemma-3-12b` - Gemma 3 12B Instruct (F16)
|
||||||
|
|
||||||
|
### Large Models (24-27B parameters)
|
||||||
|
- `mistral-small-24b-q8` - Mistral Small 3.2 24B (Q8_K_XL)
|
||||||
|
- `mistral-small-24b-q4` - Mistral Small 3.2 24B (Q4_K_XL)
|
||||||
|
- `gemma-3-27b` - Gemma 3 27B Instruct (Q8_0)
|
||||||
|
|
||||||
|
## Starting llama-swap
|
||||||
|
|
||||||
|
### Using Docker
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker run -it --rm --runtime nvidia -p 8080:8080 \
|
||||||
|
-v $(pwd)/llama-swap-config.yaml:/app/config.yaml \
|
||||||
|
-v /path/to/hf/cache:/root/.cache/huggingface \
|
||||||
|
ghcr.io/mostlygeek/llama-swap:cuda
|
||||||
|
```
|
||||||
|
|
||||||
|
### Using Binary
|
||||||
|
|
||||||
|
```bash
|
||||||
|
llama-swap --config llama-swap-config.yaml --listen localhost:8080
|
||||||
|
```
|
||||||
|
|
||||||
|
## Testing with Jersey Detection Script
|
||||||
|
|
||||||
|
Once llama-swap is running, you can test different models by specifying the `--model-tag` parameter:
|
||||||
|
|
||||||
|
### Test a Single Model
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Test Qwen2.5-VL 7B with resizing
|
||||||
|
python test_jersey_detection.py ./images jersey_prompt.txt \
|
||||||
|
--model-tag "qwen2.5-vl-7b" \
|
||||||
|
--resize 1024
|
||||||
|
```
|
||||||
|
|
||||||
|
### Test Multiple Models Sequentially
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Test small models
|
||||||
|
python test_jersey_detection.py ./images jersey_prompt.txt --model-tag "lfm2-vl-1.6b" --resize 1024
|
||||||
|
python test_jersey_detection.py ./images jersey_prompt.txt --model-tag "gemma-3-4b" --resize 1024
|
||||||
|
python test_jersey_detection.py ./images jersey_prompt.txt --model-tag "kimi-vl-3b" --resize 1024
|
||||||
|
|
||||||
|
# Test medium models
|
||||||
|
python test_jersey_detection.py ./images jersey_prompt.txt --model-tag "qwen2.5-vl-7b" --resize 1024
|
||||||
|
python test_jersey_detection.py ./images jersey_prompt.txt --model-tag "gemma-3-12b" --resize 1024
|
||||||
|
|
||||||
|
# Test large models
|
||||||
|
python test_jersey_detection.py ./images jersey_prompt.txt --model-tag "mistral-small-24b-q4" --resize 1024
|
||||||
|
python test_jersey_detection.py ./images jersey_prompt.txt --model-tag "gemma-3-27b" --resize 1024
|
||||||
|
```
|
||||||
|
|
||||||
|
### Automated Testing Scripts
|
||||||
|
|
||||||
|
Two bash scripts are provided for automated testing:
|
||||||
|
|
||||||
|
#### 1. Full Test Suite (`test_all_models.sh`)
|
||||||
|
|
||||||
|
Tests **all models** defined in `llama-swap-config.yaml`:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Basic usage (uses defaults)
|
||||||
|
./test_all_models.sh ./test_images
|
||||||
|
|
||||||
|
# Customize configuration with environment variables
|
||||||
|
RESIZE=2048 ./test_all_models.sh ./test_images
|
||||||
|
OUTPUT_FILE=custom_results.jsonl ./test_all_models.sh ./test_images
|
||||||
|
PROMPT_FILE=custom_prompt.txt ./test_all_models.sh ./test_images
|
||||||
|
|
||||||
|
# Disable resize
|
||||||
|
RESIZE= ./test_all_models.sh ./test_images
|
||||||
|
```
|
||||||
|
|
||||||
|
**Features:**
|
||||||
|
- Automatically extracts all model tags from YAML config
|
||||||
|
- Color-coded output with progress tracking
|
||||||
|
- Confirms before starting tests
|
||||||
|
- Shows summary with success/failure counts
|
||||||
|
- Asks to continue if a model fails
|
||||||
|
|
||||||
|
**Default Configuration:**
|
||||||
|
- Images: `./test_images`
|
||||||
|
- Prompt: `jersey_prompt_with_confidence.txt`
|
||||||
|
- Resize: `1024px`
|
||||||
|
- Output: `jersey_detection_results.jsonl`
|
||||||
|
|
||||||
|
#### 2. Quick Test (`test_quick.sh`)
|
||||||
|
|
||||||
|
Tests a **small subset** of models for rapid iteration:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Test default selection (small, medium, large)
|
||||||
|
./test_quick.sh ./test_images
|
||||||
|
|
||||||
|
# Test custom models
|
||||||
|
MODELS="lfm2-vl-1.6b qwen2.5-vl-7b" ./test_quick.sh ./test_images
|
||||||
|
|
||||||
|
# Customize settings
|
||||||
|
RESIZE=512 MODELS="gemma-3-4b" ./test_quick.sh ./test_images
|
||||||
|
```
|
||||||
|
|
||||||
|
**Default Models:**
|
||||||
|
- `lfm2-vl-1.6b` (Small - 1.6B)
|
||||||
|
- `qwen2.5-vl-7b` (Medium - 7B)
|
||||||
|
- `mistral-small-24b-q4` (Large - 24B Q4)
|
||||||
|
|
||||||
|
**Use Cases:**
|
||||||
|
- Quick validation after prompt changes
|
||||||
|
- Testing configuration adjustments
|
||||||
|
- Rapid prototyping before full test run
|
||||||
|
|
||||||
|
## Analyzing Results
|
||||||
|
|
||||||
|
After testing multiple models, use the analysis script to compare performance:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python analyze_jersey_results.py
|
||||||
|
```
|
||||||
|
|
||||||
|
This will show:
|
||||||
|
- Comparison table of all models tested
|
||||||
|
- Performance charts with hallucination rates
|
||||||
|
- Best performers by speed and accuracy
|
||||||
|
- Confidence distribution (if applicable)
|
||||||
|
|
||||||
|
## Model Swapping Behavior

llama-swap will:
1. **Automatically load** the requested model when you specify `--model-tag`
2. **Automatically unload** the previous model (if it differs from the current request)
3. **Keep the model loaded** if you test the same model multiple times

You can **monitor** model loading/unloading in the web UI at `http://localhost:8080/ui`.

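Under the hood, the test script simply sets the `model` field of the OpenAI-compatible request to the model tag, and llama-swap swaps on that field. A hand-rolled equivalent might look like (payload trimmed to the relevant fields):

```bash
# Requesting a chat completion with a "model" field makes llama-swap
# load that model first, swapping out whatever is currently running.
curl http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "qwen2.5-vl-7b", "messages": [{"role": "user", "content": "ping"}]}'
```
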
## Optional: Model Auto-Unloading

To automatically unload models after 5 minutes of inactivity, uncomment this line in `llama-swap-config.yaml`:

```yaml
ttl: 300
```

## Optional: Preload Model on Startup

To preload a specific model when llama-swap starts, uncomment and modify this section:

```yaml
hooks:
  onStartup:
    - loadModel: qwen2.5-vl-7b
```

## Customizing Models

To add or modify models, edit `llama-swap-config.yaml`:

```yaml
models:
  my-custom-model:
    name: "My Custom Model Description"
    cmd: llama-server --no-mmap -ngl 999 -fa on --host 0.0.0.0 --port ${PORT} -hf user/model-name:quantization
```

Then test with:

```bash
python test_jersey_detection.py ./images jersey_prompt.txt --model-tag "my-custom-model"
```

## Troubleshooting

### Model not loading
- Check the llama-swap logs at `http://localhost:8080/log` or via `curl http://localhost:8080/log/stream`
- Verify that the model name in the config matches the `--model-tag` parameter
- Ensure there is sufficient GPU memory for the model

### Connection refused
- Verify llama-swap is running: `curl http://localhost:8080/health`
- Check that the server URL matches: the default is `http://192.168.1.126:8080` (read from scan.ini)

### Slow model switching
- The first load downloads the model from HuggingFace and can be slow
- Subsequent loads are faster (the model is cached locally)
- Use quantized models (Q4, Q8) for faster loading and lower memory usage

## Web UI

llama-swap includes a web interface for monitoring:
- **Dashboard**: `http://localhost:8080/ui` - View loaded models and logs
- **Activity**: See recent API requests
- **Logs**: Real-time log monitoring

## References

- [llama-swap GitHub](https://github.com/mostlygeek/llama-swap)
- [llama-swap Documentation](https://github.com/mostlygeek/llama-swap/tree/main/docs)
- [llama.cpp Documentation](https://github.com/ggerganov/llama.cpp)
6
jersey_detection_results.jsonl
Normal file
@ -0,0 +1,6 @@
{"timestamp": "2025-10-19T19:30:44.272849", "model_name": "LFM2-VL", "prompt_file": "jersey_prompt_with_confidence.txt", "prompt_length": 2134, "total_images": 198, "images_with_jerseys": 88, "images_without_jerseys": 110, "images_with_errors": 0, "total_raw_detections": 470, "total_valid_jerseys": 235, "total_hallucinated": 235, "avg_processing_time": 4.607636096501591, "total_processing_time": 912.3119471073151, "confidence_stats": {"avg": 84.14893617021276, "min": 0, "max": 100, "count": 235, "distribution": {"90-100": 138, "70-89": 70, "50-69": 8, "30-49": 8, "0-29": 11}}, "empty_response_capable": true}
{"timestamp": "2025-10-19T22:10:05.135029", "model_name": "ggml-org_Kimi-VL-A3B-Thinking-2506-GGUF_Kimi-VL-A3B-Thinking-2506-bf16", "prompt_file": "jersey_prompt_with_confidence.txt", "prompt_length": 2134, "total_images": 198, "images_with_jerseys": 28, "images_without_jerseys": 163, "images_with_errors": 7, "total_raw_detections": 49, "total_valid_jerseys": 49, "total_hallucinated": 0, "avg_processing_time": 29.11009831259949, "total_processing_time": 5763.799465894699, "confidence_stats": {"avg": 88.85714285714286, "min": 60, "max": 95, "count": 49, "distribution": {"90-100": 37, "70-89": 9, "50-69": 3, "30-49": 0, "0-29": 0}}, "empty_response_capable": true}
{"timestamp": "2025-10-20T01:20:31.076468", "model_name": "unsloth_Mistral-Small-3.2-24B-Instruct-2506-GGUF_Mistral-Small-3.2-24B-Instruct-2506-BF16", "prompt_file": "jersey_prompt_with_confidence.txt", "prompt_length": 2134, "total_images": 198, "images_with_jerseys": 197, "images_without_jerseys": 1, "images_with_errors": 0, "total_raw_detections": 494, "total_valid_jerseys": 494, "total_hallucinated": 0, "avg_processing_time": 37.221905313356956, "total_processing_time": 7369.937252044678, "confidence_stats": {"avg": 90.81983805668017, "min": 70, "max": 95, "count": 494, "distribution": {"90-100": 362, "70-89": 132, "50-69": 0, "30-49": 0, "0-29": 0}}, "empty_response_capable": true}
{"timestamp": "2025-10-20T12:04:37.833650", "model_name": "unsloth_Mistral-Small-3.2-24B-Instruct-2506-GGUF_Mistral-Small-3.2-24B-Instruct-2506-UD-Q8_K_XL", "prompt_file": "jersey_prompt_with_confidence.txt", "prompt_length": 2134, "total_images": 198, "images_with_jerseys": 197, "images_without_jerseys": 1, "images_with_errors": 0, "total_raw_detections": 496, "total_valid_jerseys": 496, "total_hallucinated": 0, "avg_processing_time": 20.684308366342023, "total_processing_time": 4095.493056535721, "confidence_stats": {"avg": 90.76612903225806, "min": 70, "max": 95, "count": 496, "distribution": {"90-100": 363, "70-89": 133, "50-69": 0, "30-49": 0, "0-29": 0}}, "empty_response_capable": true}
{"timestamp": "2025-10-20T13:01:42.747694", "model_name": "unsloth_Mistral-Small-3.2-24B-Instruct-2506-GGUF_Mistral-Small-3.2-24B-Instruct-2506-UD-Q4_K_XL", "prompt_file": "jersey_prompt_with_confidence.txt", "prompt_length": 2134, "total_images": 198, "images_with_jerseys": 197, "images_without_jerseys": 1, "images_with_errors": 0, "total_raw_detections": 494, "total_valid_jerseys": 494, "total_hallucinated": 0, "avg_processing_time": 14.196594772916852, "total_processing_time": 2810.9257650375366, "confidence_stats": {"avg": 92.09514170040485, "min": 80, "max": 95, "count": 494, "distribution": {"90-100": 415, "70-89": 79, "50-69": 0, "30-49": 0, "0-29": 0}}, "empty_response_capable": true}
{"timestamp": "2025-10-20T15:01:25.669340", "model_name": "unsloth_gemma-3-27b-it-GGUF_gemma-3-27b-it-Q8_0", "prompt_file": "jersey_prompt_with_confidence.txt", "prompt_length": 2134, "total_images": 198, "images_with_jerseys": 185, "images_without_jerseys": 13, "images_with_errors": 0, "total_raw_detections": 428, "total_valid_jerseys": 428, "total_hallucinated": 0, "avg_processing_time": 18.127051142731098, "total_processing_time": 3589.1561262607574, "confidence_stats": {"avg": 87.14953271028037, "min": 55, "max": 100, "count": 428, "distribution": {"90-100": 250, "70-89": 166, "50-69": 12, "30-49": 0, "0-29": 0}}, "empty_response_capable": true}
43
jersey_prompt.txt
Normal file
@ -0,0 +1,43 @@
You are an expert at detecting sports jerseys in images. Carefully examine the provided image and identify all visible sports jerseys.

CRITICAL INSTRUCTIONS:
1. ONLY detect jerseys that are CLEARLY VISIBLE in the image
2. ONLY include jersey numbers that you can ACTUALLY READ in the image
3. If you CANNOT see any jerseys, you MUST return {"jerseys": []}
4. DO NOT make up, imagine, or guess jersey numbers that aren't visible
5. DO NOT include jerseys if you cannot clearly see the number

RESPONSE FORMAT:
Respond ONLY with a valid JSON object. No explanations, no markdown, no extra text.

Use DOUBLE QUOTES (") for all JSON keys and string values.

The JSON must have a single key "jerseys" with an array of dictionaries.

Each dictionary must have exactly these three keys:
- "jersey_number": The number on the jersey (as a string, only if clearly visible)
- "jersey_color": The primary color of the jersey
- "number_color": The color of the number on the jersey

Example response for an image WITH visible jerseys:
{
  "jerseys": [
    {
      "jersey_number": "101",
      "jersey_color": "red",
      "number_color": "white"
    },
    {
      "jersey_number": "142",
      "jersey_color": "blue",
      "number_color": "yellow"
    }
  ]
}

Example response for an image WITHOUT jerseys or with unclear numbers:
{"jerseys": []}

REMEMBER: Only include jerseys with numbers you can ACTUALLY SEE in the image. When in doubt, return empty array.

Now analyze the image and return the JSON object.
53
jersey_prompt_with_confidence.txt
Normal file
@ -0,0 +1,53 @@
You are an expert at detecting sports jerseys in images. Carefully examine the provided image and identify all visible sports jerseys.

CRITICAL INSTRUCTIONS:
1. ONLY detect jerseys that are CLEARLY VISIBLE in the image
2. ONLY include jersey numbers that you can ACTUALLY READ in the image
3. If you CANNOT see any jerseys, you MUST return {"jerseys": []}
4. DO NOT make up, imagine, or guess jersey numbers that aren't visible
5. DO NOT include jerseys if you cannot clearly see the number

RESPONSE FORMAT:
Respond ONLY with a valid JSON object. No explanations, no markdown, no extra text.

Use DOUBLE QUOTES (") for all JSON keys and string values.

The JSON must have a single key "jerseys" with an array of dictionaries.

Each dictionary must have exactly these four keys:
- "jersey_number": The number on the jersey (as a string, only if clearly visible)
- "jersey_color": The primary color of the jersey
- "number_color": The color of the number on the jersey
- "confidence": A number from 0 to 100 representing your confidence in this detection (0 = no confidence, 100 = absolutely certain)

CONFIDENCE SCORING GUIDELINES:
- 90-100: Jersey number is extremely clear and unambiguous
- 70-89: Jersey number is clear but might have minor occlusion or angle issues
- 50-69: Jersey number is partially visible or somewhat unclear
- 30-49: Jersey number is difficult to read but you can make it out
- 0-29: Very uncertain, number is barely visible

Example response for an image WITH visible jerseys:
{
  "jerseys": [
    {
      "jersey_number": "101",
      "jersey_color": "red",
      "number_color": "white",
      "confidence": 95
    },
    {
      "jersey_number": "142",
      "jersey_color": "blue",
      "number_color": "yellow",
      "confidence": 78
    }
  ]
}

Example response for an image WITHOUT jerseys or with unclear numbers:
{"jerseys": []}

REMEMBER: Only include jerseys with numbers you can ACTUALLY SEE in the image. When in doubt, return empty array. Always provide a confidence score that honestly reflects how certain you are about each detection.

Now analyze the image and return the JSON object.
59
llama-swap-config.yaml
Normal file
@ -0,0 +1,59 @@
# llama-swap configuration for jersey detection testing
# ==================================================
# This configuration allows automatic model switching for testing
# different vision language models with the jersey detection test script.
#
# Usage:
#   llama-swap --config llama-swap-config.yaml --listen localhost:8080
#
# Then use the test script with --model-tag:
#   python test_jersey_detection.py ./images jersey_prompt.txt --model-tag "lfm2-vl-1.6b"
#
# llama-swap will automatically load the requested model and swap models
# as needed when you run tests with different --model-tag values.

models:
  # Small vision models (1-4B parameters)
  lfm2-vl-1.6b:
    name: "LiquidAI LFM2-VL 1.6B (F16)"
    cmd: llama-server --no-mmap -ngl 999 -fa on --host 0.0.0.0 --port ${PORT} -hf LiquidAI/LFM2-VL-1.6B-GGUF:F16

  gemma-3-4b:
    name: "Gemma 3 4B Instruct (F16)"
    cmd: llama-server --no-mmap -ngl 999 -fa on --host 0.0.0.0 --port ${PORT} -hf unsloth/gemma-3-4b-it-GGUF:F16

  kimi-vl-3b:
    name: "Kimi VL A3B Thinking (F16)"
    cmd: llama-server --no-mmap -ngl 999 -fa on --host 0.0.0.0 --port ${PORT} -hf ggml-org/Kimi-VL-A3B-Thinking-2506-GGUF:F16

  # Medium vision models (7-12B parameters)
  qwen2.5-vl-7b:
    name: "Qwen2.5-VL 7B Instruct (F16)"
    cmd: llama-server --no-mmap -ngl 999 -fa on --host 0.0.0.0 --port ${PORT} -hf unsloth/Qwen2.5-VL-7B-Instruct-GGUF:F16

  gemma-3-12b:
    name: "Gemma 3 12B Instruct (F16)"
    cmd: llama-server --no-mmap -ngl 999 -fa on --host 0.0.0.0 --port ${PORT} -hf unsloth/gemma-3-12b-it-GGUF:F16

  # Large models (24-27B parameters)
  mistral-small-24b-q8:
    name: "Mistral Small 3.2 24B Instruct (Q8_K_XL)"
    cmd: llama-server --no-mmap -ngl 999 -fa on --host 0.0.0.0 --port ${PORT} -hf unsloth/Mistral-Small-3.2-24B-Instruct-2506-GGUF:Q8_K_XL

  mistral-small-24b-q4:
    name: "Mistral Small 3.2 24B Instruct (Q4_K_XL)"
    cmd: llama-server --no-mmap -ngl 999 -fa on --host 0.0.0.0 --port ${PORT} -hf unsloth/Mistral-Small-3.2-24B-Instruct-2506-GGUF:Q4_K_XL

  gemma-3-27b:
    name: "Gemma 3 27B Instruct (Q8_0)"
    cmd: llama-server --no-mmap -ngl 999 -fa on --host 0.0.0.0 --port ${PORT} -hf unsloth/gemma-3-27b-it-GGUF:Q8_0

# Optional: Automatically unload models after 5 minutes of inactivity
# Uncomment to enable
# ttl: 300

# Optional: Preload a specific model on startup
# Uncomment to enable
# hooks:
#   onStartup:
#     - loadModel: qwen2.5-vl-7b
9
requirements.txt
Normal file
@ -0,0 +1,9 @@
# Jersey Detection Test Dependencies
# Install with: pip install -r requirements.txt

# HTTP client for llama.cpp server communication
requests>=2.28.0

# Image processing
opencv-python>=4.8.0
numpy>=1.24.0
1
scan_utils/__init__.py
Normal file
@ -0,0 +1 @@
# Jersey detection scan utilities
149
scan_utils/jersey_detection.py
Normal file
@ -0,0 +1,149 @@
import json
import cv2
import numpy as np
from typing import Dict, Any, Optional
import logging

# Read the default jersey detection prompt
try:
    with open('jersey_prompt.txt', 'r') as f:
        DEFAULT_JERSEY_PROMPT = f.read()
except FileNotFoundError:
    # Fallback prompt if file is not found
    DEFAULT_JERSEY_PROMPT = """You are an expert at detecting sports jerseys in images. Carefully examine the provided image and identify all visible sports jerseys.

CRITICAL INSTRUCTIONS:
1. ONLY detect jerseys that are CLEARLY VISIBLE in the image
2. ONLY include jersey numbers that you can ACTUALLY READ in the image
3. If you CANNOT see any jerseys, you MUST return {"jerseys": []}
4. DO NOT make up, imagine, or guess jersey numbers that aren't visible
5. DO NOT include jerseys if you cannot clearly see the number

RESPONSE FORMAT:
Respond ONLY with a valid JSON object. No explanations, no markdown, no extra text.

Use DOUBLE QUOTES (") for all JSON keys and string values.

The JSON must have a single key "jerseys" with an array of dictionaries.

Each dictionary must have exactly these three keys:
- "jersey_number": The number on the jersey (as a string, only if clearly visible)
- "jersey_color": The primary color of the jersey
- "number_color": The color of the number on the jersey

Example response for an image WITH visible jerseys:
{
  "jerseys": [
    {
      "jersey_number": "101",
      "jersey_color": "red",
      "number_color": "white"
    }
  ]
}

Example response for an image WITHOUT jerseys or with unclear numbers:
{"jerseys": []}

REMEMBER: Only include jerseys with numbers you can ACTUALLY SEE in the image. When in doubt, return empty array.

Now analyze the image and return the JSON object."""


class DetectJerseys:
    """A class for detecting sports jerseys using a vision language model."""

    def __init__(self, llama_cpp_base_url: str = "http://192.168.1.34:8080", logger: Optional[logging.Logger] = None, prompt: Optional[str] = None):
        """
        Initialize the jersey detection class.

        Args:
            llama_cpp_base_url: Base URL for the llama.cpp server
            logger: Logger instance for logging messages
            prompt: Custom prompt to use for jersey detection (optional)
        """
        self.logger = logger or logging.getLogger(__name__)
        self.prompt = prompt or DEFAULT_JERSEY_PROMPT

        # Import here to avoid circular dependencies
        try:
            from scan_utils.llama_cpp_client import LlamaCppClient
            self.client = LlamaCppClient(base_url=llama_cpp_base_url)
            self.logger.info(f"Jersey detection initialized with llama.cpp server at {llama_cpp_base_url}")
        except ImportError as e:
            self.logger.error(f"Failed to import LlamaCppClient: {e}")
            raise

    def detect(self, image: np.ndarray, temperature: float = 0.1) -> Dict[str, Any]:
        """
        Detect jerseys in an image using the vision language model.

        Args:
            image: OpenCV image (numpy array) to analyze
            temperature: Temperature value for the model (default: 0.1)

        Returns:
            Dictionary containing detected jerseys or empty dict if invalid
        """
        try:
            # Create multimodal message with image and prompt
            message = self.client.create_multimodal_message(
                role="user",
                content=self.prompt,
                images=[image]
            )

            # Send chat completion request
            response = self.client.chat_completion(
                messages=[message],
                temperature=temperature,
                max_tokens=1000
            )

            # Extract the response text
            if 'choices' in response and len(response['choices']) > 0:
                response_text = response['choices'][0]['message']['content']

                # Log the raw response for debugging
                self.logger.debug(f"Raw VLM response: {response_text}")

                # Parse JSON response
                try:
                    result = json.loads(response_text)

                    # Process jerseys to ensure they have all required fields
                    jerseys = result.get('jerseys', [])

                    # Hallucination detection: filter out example numbers from the prompt
                    # Using numbers > 100 as examples to avoid filtering valid jersey numbers
                    HALLUCINATION_NUMBERS = {'101', '102', '103', '142', '199'}

                    processed_jerseys = []
                    for jersey in jerseys:
                        jersey_number = jersey.get('jersey_number', '')

                        # Check for hallucination (model returning example numbers)
                        if jersey_number in HALLUCINATION_NUMBERS:
                            self.logger.warning(f"Possible hallucination detected - jersey number {jersey_number} matches example pattern. Filtering out.")
                            continue

                        # Ensure all required fields are present
                        processed_jersey = {
                            'jersey_number': jersey_number,
                            'jersey_color': jersey.get('jersey_color', ''),
                            'number_color': jersey.get('number_color', 'unknown')  # Default to 'unknown' if missing
                        }
                        processed_jerseys.append(processed_jersey)

                    return {"jerseys": processed_jerseys}
                except json.JSONDecodeError as e:
                    self.logger.error(f"Failed to parse JSON response: {e}")
                    self.logger.debug(f"Response text was: {response_text}")
                    return {"jerseys": []}
            else:
                self.logger.warning("Empty response from VLM")
                return {"jerseys": []}

        except Exception as e:
            self.logger.error(f"Error during jersey detection: {e}")
            return {"jerseys": []}
237
scan_utils/llama_cpp_client.py
Normal file
@ -0,0 +1,237 @@
import base64
import json
import cv2
import numpy as np
import requests
from typing import List, Dict, Any, Optional, Union


class LlamaCppClient:
    """A Python client for interacting with a llama.cpp server."""

    def __init__(self, base_url: str = "http://192.168.1.34:8080"):
        """
        Initialize the client with the base URL of the llama.cpp server.

        Args:
            base_url: The base URL of the llama.cpp server (default: http://192.168.1.34:8080)
        """
        self.base_url = base_url.rstrip('/')

    def health_check(self) -> Dict[str, Any]:
        """
        Check the health status of the server.

        Returns:
            Health status response from the server
        """
        response = requests.get(f"{self.base_url}/health")
        response.raise_for_status()
        return response.json()

    def get_models(self) -> Dict[str, Any]:
        """
        Get information about loaded models.

        Returns:
            Model information from the server
        """
        response = requests.get(f"{self.base_url}/v1/models")
        response.raise_for_status()
        return response.json()

    def chat_completion(
        self,
        messages: List[Dict[str, Any]],
        temperature: float = 0.1,
        min_p: float = 0.15,
        repetition_penalty: float = 1.05,
        min_image_tokens: int = 64,
        max_image_tokens: int = 256,
        do_image_splitting: bool = True,
        max_tokens: int = -1,
        stream: bool = False,
        **kwargs
    ) -> Union[Dict[str, Any], requests.Response]:
        """
        Generate a chat completion using the OpenAI-compatible API.

        Args:
            messages: List of message dictionaries with role and content
            temperature: Sampling temperature (default: 0.1)
            min_p: Minimum probability for sampling (default: 0.15)
            repetition_penalty: Repetition penalty factor (default: 1.05)
            min_image_tokens: Minimum image tokens (default: 64)
            max_image_tokens: Maximum image tokens (default: 256)
            do_image_splitting: Whether to split images (default: True)
            max_tokens: Maximum tokens to generate (default: -1 for infinity)
            stream: Whether to stream the response (default: False)
            **kwargs: Additional parameters for the completion

        Returns:
            Completion response or streaming response
        """
        payload = {
            "messages": messages,
            "temperature": temperature,
            "min_p": min_p,
            "repetition_penalty": repetition_penalty,
            "min_image_tokens": min_image_tokens,
            "max_image_tokens": max_image_tokens,
            "do_image_splitting": do_image_splitting,
            "max_tokens": max_tokens,
            "cache_prompt": True,
            "stream": stream,
            **kwargs
        }

        # Debug: Show model parameter if present (for llama-swap debugging)
        if 'model' in payload and payload['model']:
            import os
            if os.environ.get('DEBUG_LLAMA_SWAP'):
                print(f"[DEBUG] Requesting model: {payload['model']}")

        response = requests.post(
            f"{self.base_url}/v1/chat/completions",
            headers={"Content-Type": "application/json"},
            json=payload,
            stream=stream
        )
        response.raise_for_status()

        if stream:
            return response

        return response.json()

    def completion(
        self,
        prompt: Union[str, List[Union[str, int]]],
        temperature: float = 0.1,
        min_p: float = 0.15,
        repetition_penalty: float = 1.05,
        min_image_tokens: int = 64,
        max_image_tokens: int = 256,
        do_image_splitting: bool = True,
        max_tokens: int = -1,
        stream: bool = False,
        **kwargs
    ) -> Union[Dict[str, Any], requests.Response]:
        """
        Generate a completion using the non-OAI compatible API.

        Args:
            prompt: The prompt string or list of tokens
            temperature: Sampling temperature (default: 0.1)
            min_p: Minimum probability for sampling (default: 0.15)
            repetition_penalty: Repetition penalty factor (default: 1.05)
            min_image_tokens: Minimum image tokens (default: 64)
            max_image_tokens: Maximum image tokens (default: 256)
            do_image_splitting: Whether to split images (default: True)
            max_tokens: Maximum tokens to generate (default: -1 for infinity)
            stream: Whether to stream the response (default: False)
            **kwargs: Additional parameters for the completion

        Returns:
            Completion response or streaming response
        """
        payload = {
            "prompt": prompt,
            "temperature": temperature,
            "min_p": min_p,
            "repeat_penalty": repetition_penalty,
            "min_image_tokens": min_image_tokens,
            "max_image_tokens": max_image_tokens,
            "do_image_splitting": do_image_splitting,
            "cache_prompt": True,
            "n_predict": max_tokens,
            "stream": stream,
            **kwargs
        }

        response = requests.post(
            f"{self.base_url}/completion",
            headers={"Content-Type": "application/json"},
            json=payload,
            stream=stream
        )
        response.raise_for_status()

        if stream:
            return response

        return response.json()

    @staticmethod
    def _encode_image_to_base64(image_path: str) -> str:
        """
        Encode an image file to base64 string.

        Args:
            image_path: Path to the image file

        Returns:
            Base64 encoded image string
        """
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')

    @staticmethod
    def _encode_cv2_image_to_base64(image: np.ndarray) -> str:
        """
        Encode an OpenCV image to base64 string.

        Args:
            image: OpenCV image (numpy array)

        Returns:
            Base64 encoded image string
        """
        _, buffer = cv2.imencode('.jpg', image)
        return base64.b64encode(buffer).decode('utf-8')

    def create_multimodal_message(
        self,
        role: str,
        content: str,
        images: Optional[List[Union[str, np.ndarray]]] = None
    ) -> Dict[str, Any]:
        """
        Create a multimodal message with text and images.

        Args:
            role: Role of the message (system, user, assistant)
            content: Text content of the message
            images: List of image paths or OpenCV images (numpy arrays)

        Returns:
            Formatted message dictionary
        """
        if not images:
            return {"role": role, "content": content}

        # Process images
        image_data = []
        for img in images:
            if isinstance(img, str):
                # Image path
                encoded_image = self._encode_image_to_base64(img)
            else:
                # OpenCV image
                encoded_image = self._encode_cv2_image_to_base64(img)
            image_data.append(encoded_image)

        # Create multimodal content
        multimodal_content = [
            {"type": "text", "text": content}
        ]

        for img_data in image_data:
            multimodal_content.append({
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/jpeg;base64,{img_data}"
                }
            })

        return {"role": role, "content": multimodal_content}
263
test_all_models.sh
Executable file
@ -0,0 +1,263 @@
#!/bin/bash
# ==============================================================================
# Test All Models Script for Jersey Detection
# ==============================================================================
# This script automatically tests all models defined in llama-swap-config.yaml
# with the jersey detection test suite.
#
# Usage:
#   ./test_all_models.sh
#   ./test_all_models.sh /path/to/images
#   RESIZE=2048 ./test_all_models.sh
#   OUTPUT_FILE=custom_results.jsonl ./test_all_models.sh
# ==============================================================================

# Note: We don't use 'set -e' here because we have explicit error handling
# in the test loop and want to give the user the option to continue on failures

# ==============================================================================
# Configuration Variables
# ==============================================================================

# Image directory containing test images
IMAGES_DIR="${1:-./test_images}"

# Prompt file to use for testing
PROMPT_FILE="${PROMPT_FILE:-jersey_prompt_with_confidence.txt}"

# Resize images to this max dimension (set to empty string to disable)
RESIZE="${RESIZE:-1024}"

# Output file for results
OUTPUT_FILE="${OUTPUT_FILE:-jersey_detection_results.jsonl}"

# llama-swap configuration file
LLAMA_SWAP_CONFIG="${LLAMA_SWAP_CONFIG:-llama-swap-config.yaml}"

# Server URL
SERVER_URL="${SERVER_URL:-http://localhost:8080}"

# ==============================================================================
# Color codes for output
# ==============================================================================
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
NC='\033[0m' # No Color

# ==============================================================================
# Helper Functions
# ==============================================================================

print_header() {
    echo -e "${CYAN}============================================================================${NC}"
    echo -e "${CYAN}$1${NC}"
    echo -e "${CYAN}============================================================================${NC}"
}

print_info() {
    echo -e "${BLUE}[INFO]${NC} $1"
}

print_success() {
    echo -e "${GREEN}[SUCCESS]${NC} $1"
}

print_error() {
    echo -e "${RED}[ERROR]${NC} $1"
}

print_warning() {
    echo -e "${YELLOW}[WARNING]${NC} $1"
}

# ==============================================================================
# Validation
# ==============================================================================

print_header "Jersey Detection - Test All Models"

# Check if images directory exists
if [ ! -d "$IMAGES_DIR" ]; then
    print_error "Image directory not found: $IMAGES_DIR"
    echo "Usage: $0 <image_directory>"
    exit 1
fi

# Check if prompt file exists
if [ ! -f "$PROMPT_FILE" ]; then
    print_error "Prompt file not found: $PROMPT_FILE"
    exit 1
fi

# Check if llama-swap config exists
if [ ! -f "$LLAMA_SWAP_CONFIG" ]; then
    print_error "llama-swap config not found: $LLAMA_SWAP_CONFIG"
    exit 1
fi

# Check if test script exists
if [ ! -f "test_jersey_detection.py" ]; then
    print_error "test_jersey_detection.py not found in current directory"
    exit 1
fi

# Check if server is running
print_info "Checking if llama-swap server is running at $SERVER_URL..."
if ! curl -s "$SERVER_URL/health" > /dev/null 2>&1; then
    print_error "Cannot connect to llama-swap at $SERVER_URL"
    echo ""
    echo "Please start llama-swap first:"
    echo "  llama-swap --config $LLAMA_SWAP_CONFIG --listen localhost:8080"
    echo ""
    exit 1
fi
print_success "Server is running"

# ==============================================================================
# Extract model tags from YAML
# ==============================================================================

print_info "Extracting model tags from $LLAMA_SWAP_CONFIG..."

# Extract model IDs (keys under 'models:')
# This uses grep and sed to parse the YAML (simple parser, works for our format)
MODEL_TAGS=$(grep "^  [a-z]" "$LLAMA_SWAP_CONFIG" | \
    grep -v "    " | \
    sed 's/:.*//' | \
    sed 's/^  //')

if [ -z "$MODEL_TAGS" ]; then
    print_error "No model tags found in $LLAMA_SWAP_CONFIG"
    exit 1
fi

# Convert to array
readarray -t MODELS <<< "$MODEL_TAGS"

MODEL_COUNT=${#MODELS[@]}
print_success "Found $MODEL_COUNT models to test"

# ==============================================================================
# Display Configuration
# ==============================================================================

echo ""
print_info "Test Configuration:"
echo "  Images directory: $IMAGES_DIR"
echo "  Prompt file: $PROMPT_FILE"
echo "  Resize: ${RESIZE:-Disabled}"
echo "  Output file: $OUTPUT_FILE"
echo "  Server URL: $SERVER_URL"
echo "  Models to test: $MODEL_COUNT"
echo ""

# List all models
print_info "Models:"
for i in "${!MODELS[@]}"; do
    echo "  $((i+1)). ${MODELS[$i]}"
done
echo ""

# ==============================================================================
# Confirmation
# ==============================================================================

read -p "Continue with testing? (y/N) " -n 1 -r
echo
if [[ ! $REPLY =~ ^[Yy]$ ]]; then
    print_warning "Testing cancelled"
    exit 0
fi

# ==============================================================================
# Run Tests
# ==============================================================================

print_header "Starting Tests"

START_TIME=$(date +%s)
SUCCESSFUL=0
FAILED=0

for i in "${!MODELS[@]}"; do
    MODEL="${MODELS[$i]}"
    MODEL_NUM=$((i+1))

    echo ""
    print_header "Testing Model $MODEL_NUM/$MODEL_COUNT: $MODEL"

    # Build command
    CMD="python test_jersey_detection.py \"$IMAGES_DIR\" \"$PROMPT_FILE\""
    CMD="$CMD --model-tag \"$MODEL\""
    CMD="$CMD --output-file \"$OUTPUT_FILE\""
    CMD="$CMD --server-url \"$SERVER_URL\""

    # Add resize if configured
    if [ -n "$RESIZE" ]; then
        CMD="$CMD --resize $RESIZE"
    fi

    print_info "Running: $CMD"
    echo ""

    # Run the test
    if eval "$CMD"; then
        print_success "Model $MODEL completed successfully"
        SUCCESSFUL=$((SUCCESSFUL + 1))
    else
        print_error "Model $MODEL failed"
        FAILED=$((FAILED + 1))

        # Ask if user wants to continue
        echo ""
        read -p "Continue with remaining models? (Y/n) " -n 1 -r
        echo
        if [[ $REPLY =~ ^[Nn]$ ]]; then
            print_warning "Testing stopped by user"
            break
        fi
    fi

    # Show progress
    if [ $MODEL_NUM -lt $MODEL_COUNT ]; then
        print_info "Progress: $MODEL_NUM/$MODEL_COUNT models completed"
    fi
done

# ==============================================================================
# Summary
# ==============================================================================

END_TIME=$(date +%s)
DURATION=$((END_TIME - START_TIME))
MINUTES=$((DURATION / 60))
SECONDS=$((DURATION % 60))

echo ""
print_header "Testing Complete"
echo ""
print_info "Summary:"
echo "  Total models: $MODEL_COUNT"
echo "  Successful: $SUCCESSFUL"
echo "  Failed: $FAILED"
echo "  Total time: ${MINUTES}m ${SECONDS}s"
echo ""

if [ $SUCCESSFUL -gt 0 ]; then
    print_success "Results saved to: $OUTPUT_FILE"
    echo ""
    print_info "Analyze results with:"
    echo "  python analyze_jersey_results.py $OUTPUT_FILE"
fi

echo ""

# Exit with error code if any tests failed
if [ $FAILED -gt 0 ]; then
    exit 1
fi

exit 0
971
test_jersey_detection.py
Executable file
971
test_jersey_detection.py
Executable file
@ -0,0 +1,971 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Test script for evaluating jersey detection performance with different models and prompts.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python test_jersey_detection.py <image_directory> <prompt_file> [options]
|
||||||
|
|
||||||
|
Arguments:
|
||||||
|
image_directory: Path to directory containing test images
|
||||||
|
prompt_file: Path to text file containing the prompt to use
|
||||||
|
--model-name: Name of the model being tested (optional, auto-detected from server if not provided)
|
||||||
|
--model-tag: Model tag for llama-swap integration (optional)
|
||||||
|
--server-url: Optional llama.cpp server URL (default: read from scan.ini)
|
||||||
|
--output-file: Output file for results (default: jersey_detection_results.jsonl)
|
||||||
|
--resize: Maximum image dimension for resizing before processing
|
||||||
|
|
||||||
|
Ground Truth:
|
||||||
|
Expected jersey numbers are parsed from filenames using dash-separated format:
|
||||||
|
Example: 1122-8-10-29.jpg expects jerseys 8, 10, and 29
|
||||||
|
|
||||||
|
The script calculates precision, recall, F1 score, and confidence calibration metrics
|
||||||
|
to evaluate model accuracy against known correct results.
|
||||||
|
|
||||||
|
Output Files:
|
||||||
|
<output_file>: Summary statistics with ground truth metrics (default: jersey_detection_results.jsonl)
|
||||||
|
|
||||||
|
Example:
|
||||||
|
# Auto-detect model name from server
|
||||||
|
python test_jersey_detection.py ./images jersey_prompt.txt
|
||||||
|
|
||||||
|
# Resize images to 1024px max dimension before processing
|
||||||
|
python test_jersey_detection.py ./images jersey_prompt.txt --resize 1024
|
||||||
|
|
||||||
|
# Use llama-swap to automatically load a specific model
|
||||||
|
python test_jersey_detection.py ./images jersey_prompt.txt --model-tag "qwen2.5-vl-7b" --resize 1024
|
||||||
|
|
||||||
|
# Specify custom model name (for tracking in results)
|
||||||
|
python test_jersey_detection.py ./images jersey_prompt.txt --model-name "llama-3.2-vision"
|
||||||
|
python test_jersey_detection.py ./images jersey_prompt_with_confidence.txt --model-name "qwen2-vl" --resize 1024
|
||||||
|
|
||||||
|
After running tests, analyze results with:
|
||||||
|
python analyze_jersey_results.py # Performance and accuracy analysis
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import configparser
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import requests
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict, List, Any, Optional
|
||||||
|
import cv2
|
||||||
|
|
||||||
|
# Add parent directory to path for imports
|
||||||
|
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||||
|
|
||||||
|
from scan_utils.llama_cpp_client import LlamaCppClient
|
||||||
|
|
||||||
|
|
||||||
|
# Hallucination detection: filter out example numbers from prompts
|
||||||
|
# Using numbers > 100 as examples to avoid filtering valid jersey numbers
|
||||||
|
HALLUCINATION_NUMBERS = {'101', '102', '103', '142', '199'}
|
||||||
|
|
||||||
|
|
||||||
|
def parse_expected_jerseys(filename: str) -> List[str]:
|
||||||
|
"""
|
||||||
|
Parse expected jersey numbers from filename.
|
||||||
|
|
||||||
|
Format: prefix-number1-number2-number3.ext
|
||||||
|
Example: 1122-8-10-29.jpg -> ['8', '10', '29']
|
||||||
|
|
||||||
|
Args:
|
||||||
|
filename: Image filename
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of expected jersey numbers as strings
|
||||||
|
"""
|
||||||
|
# Remove extension
|
||||||
|
name_without_ext = Path(filename).stem
|
||||||
|
|
||||||
|
# Split by dash
|
||||||
|
parts = name_without_ext.split('-')
|
||||||
|
|
||||||
|
# First part is typically a prefix/identifier, rest are jersey numbers
|
||||||
|
# Skip the first part and collect numeric parts
|
||||||
|
expected = []
|
||||||
|
for i, part in enumerate(parts[1:], 1): # Skip first part
|
||||||
|
# Check if part is numeric (jersey number)
|
||||||
|
if part.isdigit():
|
||||||
|
expected.append(part)
|
||||||
|
|
||||||
|
return expected
|
||||||
|
|
||||||
|
|
||||||
|
def clean_response(text: str) -> str:
|
||||||
|
"""
|
||||||
|
Clean the response by removing think tags and markdown code blocks.
|
||||||
|
Some models use <think> tags for chain-of-thought reasoning and wrap JSON in markdown.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
text: Raw response text
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Cleaned text ready for JSON parsing
|
||||||
|
"""
|
||||||
|
# Remove <think>...</think> tags and their content (standard angle brackets)
|
||||||
|
cleaned = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL | re.IGNORECASE)
|
||||||
|
# Remove ◁think▷...◁/think▷ tags (unicode triangle brackets)
|
||||||
|
cleaned = re.sub(r'◁think▷.*?◁/think▷', '', cleaned, flags=re.DOTALL | re.IGNORECASE)
|
||||||
|
# Also remove any standalone think tags (both formats)
|
||||||
|
cleaned = re.sub(r'</?think>', '', cleaned, flags=re.IGNORECASE)
|
||||||
|
cleaned = re.sub(r'◁/?think▷', '', cleaned, flags=re.IGNORECASE)
|
||||||
|
|
||||||
|
# Remove markdown code blocks (```json ... ``` or ``` ... ```)
|
||||||
|
# First try to extract content from ```json blocks
|
||||||
|
json_block_match = re.search(r'```(?:json)?\s*\n?(.*?)\n?```', cleaned, flags=re.DOTALL | re.IGNORECASE)
|
||||||
|
if json_block_match:
|
||||||
|
# Extract just the content inside the code block
|
||||||
|
cleaned = json_block_match.group(1)
|
||||||
|
else:
|
||||||
|
# If no code block, just remove any stray ``` markers
|
||||||
|
cleaned = re.sub(r'```(?:json)?', '', cleaned, flags=re.IGNORECASE)
|
||||||
|
|
||||||
|
return cleaned.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def get_llama_server_url_from_config() -> Optional[str]:
|
||||||
|
"""
|
||||||
|
Read the LLAMA_CPP_SERVER_URL from scan.ini.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Server URL from config or None if not found
|
||||||
|
"""
|
||||||
|
config_path = os.path.join(os.path.dirname(__file__), 'scan.ini')
|
||||||
|
|
||||||
|
if not os.path.exists(config_path):
|
||||||
|
return None
|
||||||
|
|
||||||
|
try:
|
||||||
|
config = configparser.ConfigParser()
|
||||||
|
config.read(config_path)
|
||||||
|
|
||||||
|
if 'DEFAULT' in config and 'LLAMA_CPP_SERVER_URL' in config['DEFAULT']:
|
||||||
|
return config['DEFAULT']['LLAMA_CPP_SERVER_URL']
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Warning: Failed to read scan.ini: {e}")
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
class JerseyDetectionTester:
|
||||||
|
"""Test runner for jersey detection evaluation."""
|
||||||
|
|
||||||
|
def __init__(self, server_url: str, prompt: str, model_name: Optional[str] = None, resize_max: Optional[int] = None, model_tag: Optional[str] = None):
|
||||||
|
"""
|
||||||
|
Initialize the tester.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
server_url: Base URL for the llama.cpp server
|
||||||
|
prompt: Prompt text to use for detection
|
||||||
|
model_name: Name of the model being tested (optional)
|
||||||
|
resize_max: Maximum image dimension (resize if larger, None = no resize)
|
||||||
|
model_tag: Model tag for llama-swap integration (optional)
|
||||||
|
"""
|
||||||
|
self.client = LlamaCppClient(base_url=server_url)
|
||||||
|
self.prompt = prompt
|
||||||
|
self.model_name = model_name or "unknown"
|
||||||
|
self.resize_max = resize_max
|
||||||
|
self.model_tag = model_tag
|
||||||
|
self.results = []
|
||||||
|
|
||||||
|
def test_image(self, image_path: str) -> Dict[str, Any]:
|
||||||
|
"""
|
||||||
|
Test jersey detection on a single image.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
image_path: Path to the image file
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dictionary containing test results for this image
|
||||||
|
"""
|
||||||
|
start_time = time.time()
|
||||||
|
|
||||||
|
# Load image
|
||||||
|
image = cv2.imread(image_path)
|
||||||
|
if image is None:
|
||||||
|
filename = Path(image_path).name
|
||||||
|
expected_jerseys = parse_expected_jerseys(filename)
|
||||||
|
return {
|
||||||
|
'image_path': image_path,
|
||||||
|
'error': 'Failed to load image',
|
||||||
|
'jerseys': [],
|
||||||
|
'processing_time': 0,
|
||||||
|
'resized': False,
|
||||||
|
'original_size': None,
|
||||||
|
'final_size': None,
|
||||||
|
'expected_jerseys': expected_jerseys,
|
||||||
|
'detected_jerseys': [],
|
||||||
|
'true_positives': [],
|
||||||
|
'false_positives': [],
|
||||||
|
'false_negatives': expected_jerseys,
|
||||||
|
'precision': 0.0,
|
||||||
|
'recall': 0.0,
|
||||||
|
'f1_score': 0.0,
|
||||||
|
'avg_confidence_correct': None,
|
||||||
|
'avg_confidence_incorrect': None,
|
||||||
|
'confidence_correct_count': 0,
|
||||||
|
'confidence_incorrect_count': 0
|
||||||
|
}
|
||||||
|
|
||||||
|
# Track original size
|
||||||
|
original_height, original_width = image.shape[:2]
|
||||||
|
original_size = (original_width, original_height)
|
||||||
|
resized = False
|
||||||
|
|
||||||
|
# Resize if needed
|
||||||
|
if self.resize_max and (original_width > self.resize_max or original_height > self.resize_max):
|
||||||
|
# Calculate new dimensions maintaining aspect ratio
|
||||||
|
if original_width > original_height:
|
||||||
|
new_width = self.resize_max
|
||||||
|
new_height = int(original_height * (self.resize_max / original_width))
|
||||||
|
else:
|
||||||
|
new_height = self.resize_max
|
||||||
|
new_width = int(original_width * (self.resize_max / original_height))
|
||||||
|
|
||||||
|
# Resize image
|
||||||
|
image = cv2.resize(image, (new_width, new_height), interpolation=cv2.INTER_AREA)
|
||||||
|
resized = True
|
||||||
|
|
||||||
|
final_height, final_width = image.shape[:2]
|
||||||
|
final_size = (final_width, final_height)
|
||||||
|
|
||||||
|
# Create multimodal message
|
||||||
|
message = self.client.create_multimodal_message(
|
||||||
|
role="user",
|
||||||
|
content=self.prompt,
|
||||||
|
images=[image]
|
||||||
|
)
|
||||||
|
|
||||||
|
# Send to LLM
|
||||||
|
try:
|
||||||
|
# Prepare kwargs for chat completion
|
||||||
|
completion_kwargs = {
|
||||||
|
'messages': [message],
|
||||||
|
'temperature': 0.1,
|
||||||
|
'max_tokens': 1000
|
||||||
|
}
|
||||||
|
|
||||||
|
# Add model parameter if model_tag is specified (for llama-swap)
|
||||||
|
if self.model_tag:
|
||||||
|
completion_kwargs['model'] = self.model_tag
|
||||||
|
# Note: We don't print this for every image to avoid spam, but it's being sent
|
||||||
|
|
||||||
|
response = self.client.chat_completion(**completion_kwargs)
|
||||||
|
|
||||||
|
processing_time = time.time() - start_time
|
||||||
|
|
||||||
|
# Extract response text
|
||||||
|
if 'choices' in response and len(response['choices']) > 0:
|
||||||
|
response_text = response['choices'][0]['message']['content']
|
||||||
|
|
||||||
|
# Clean response (remove think tags and markdown code blocks)
|
||||||
|
cleaned_text = clean_response(response_text)
|
||||||
|
|
||||||
|
# Parse JSON response
|
||||||
|
try:
|
||||||
|
result = json.loads(cleaned_text)
|
||||||
|
jerseys = result.get('jerseys', [])
|
||||||
|
|
||||||
|
# Apply hallucination detection
|
||||||
|
filtered_jerseys = []
|
||||||
|
hallucinated_count = 0
|
||||||
|
|
||||||
|
for jersey in jerseys:
|
||||||
|
jersey_number = jersey.get('jersey_number', '')
|
||||||
|
|
||||||
|
# Check for hallucination (model returning example numbers)
|
||||||
|
if jersey_number in HALLUCINATION_NUMBERS:
|
||||||
|
hallucinated_count += 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
filtered_jerseys.append(jersey)
|
||||||
|
|
||||||
|
# Ground truth comparison
|
||||||
|
filename = Path(image_path).name
|
||||||
|
expected_jerseys = set(parse_expected_jerseys(filename))
|
||||||
|
detected_jerseys = set(jersey.get('jersey_number', '') for jersey in filtered_jerseys if jersey.get('jersey_number', ''))
|
||||||
|
|
||||||
|
# Calculate ground truth metrics
|
||||||
|
true_positives = expected_jerseys & detected_jerseys # Correctly detected
|
||||||
|
false_positives = detected_jerseys - expected_jerseys # Detected but not expected
|
||||||
|
false_negatives = expected_jerseys - detected_jerseys # Expected but not detected
|
||||||
|
|
||||||
|
# Calculate precision, recall, F1
|
||||||
|
tp_count = len(true_positives)
|
||||||
|
fp_count = len(false_positives)
|
||||||
|
fn_count = len(false_negatives)
|
||||||
|
|
||||||
|
precision = tp_count / (tp_count + fp_count) if (tp_count + fp_count) > 0 else 0.0
|
||||||
|
recall = tp_count / (tp_count + fn_count) if (tp_count + fn_count) > 0 else 0.0
|
||||||
|
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
|
||||||
|
|
||||||
|
# Handle edge case: if no expected jerseys, precision is 1.0 if no detections, else 0.0
|
||||||
|
if len(expected_jerseys) == 0:
|
||||||
|
precision = 1.0 if len(detected_jerseys) == 0 else 0.0
|
||||||
|
recall = 1.0 # No jerseys to detect
|
||||||
|
f1_score = 1.0 if len(detected_jerseys) == 0 else 0.0
|
||||||
|
|
||||||
|
# Calculate confidence scores for correct vs incorrect detections
|
||||||
|
confidence_correct = [] # Confidence for true positives
|
||||||
|
confidence_incorrect = [] # Confidence for false positives
|
||||||
|
|
||||||
|
for jersey in filtered_jerseys:
|
||||||
|
jersey_number = jersey.get('jersey_number', '')
|
||||||
|
confidence = jersey.get('confidence')
|
||||||
|
|
||||||
|
if confidence is not None:
|
||||||
|
if jersey_number in true_positives:
|
||||||
|
confidence_correct.append(confidence)
|
||||||
|
elif jersey_number in false_positives:
|
||||||
|
confidence_incorrect.append(confidence)
|
||||||
|
|
||||||
|
avg_confidence_correct = sum(confidence_correct) / len(confidence_correct) if confidence_correct else None
|
||||||
|
avg_confidence_incorrect = sum(confidence_incorrect) / len(confidence_incorrect) if confidence_incorrect else None
|
||||||
|
|
||||||
|
return {
|
||||||
|
'image_path': image_path,
|
||||||
|
'jerseys': filtered_jerseys,
|
||||||
|
'hallucinated_count': hallucinated_count,
|
||||||
|
'raw_response': cleaned_text,
|
||||||
|
'processing_time': processing_time,
|
||||||
|
'error': None,
|
||||||
|
'resized': resized,
|
||||||
|
'original_size': original_size,
|
||||||
|
'final_size': final_size,
|
||||||
|
# Ground truth metrics
|
||||||
|
'expected_jerseys': sorted(expected_jerseys),
|
||||||
|
'detected_jerseys': sorted(detected_jerseys),
|
||||||
|
'true_positives': sorted(true_positives),
|
||||||
|
'false_positives': sorted(false_positives),
|
||||||
|
'false_negatives': sorted(false_negatives),
|
||||||
|
'precision': precision,
|
||||||
|
'recall': recall,
|
||||||
|
'f1_score': f1_score,
|
||||||
|
# Confidence calibration metrics
|
||||||
|
'avg_confidence_correct': avg_confidence_correct,
|
||||||
|
'avg_confidence_incorrect': avg_confidence_incorrect,
|
||||||
|
'confidence_correct_count': len(confidence_correct),
|
||||||
|
'confidence_incorrect_count': len(confidence_incorrect)
|
||||||
|
}
|
||||||
|
except json.JSONDecodeError as e:
|
||||||
|
filename = Path(image_path).name
|
||||||
|
expected_jerseys = parse_expected_jerseys(filename)
|
||||||
|
return {
|
||||||
|
'image_path': image_path,
|
||||||
|
'error': f'JSON parse error: {e}',
|
||||||
|
'raw_response': cleaned_text,
|
||||||
|
'original_response': response_text if cleaned_text != response_text else None,
|
||||||
|
'jerseys': [],
|
||||||
|
'processing_time': processing_time,
|
||||||
|
'resized': resized,
|
||||||
|
'original_size': original_size,
|
||||||
|
'final_size': final_size,
|
||||||
|
'expected_jerseys': expected_jerseys,
|
||||||
|
'detected_jerseys': [],
|
||||||
|
'true_positives': [],
|
||||||
|
'false_positives': [],
|
||||||
|
'false_negatives': expected_jerseys,
|
||||||
|
'precision': 0.0,
|
||||||
|
'recall': 0.0,
|
||||||
|
'f1_score': 0.0
|
||||||
|
}
|
||||||
|
            else:
                filename = Path(image_path).name
                expected_jerseys = parse_expected_jerseys(filename)
                return {
                    'image_path': image_path,
                    'error': 'Empty response from model',
                    'jerseys': [],
                    'processing_time': processing_time,
                    'resized': resized,
                    'original_size': original_size,
                    'final_size': final_size,
                    'expected_jerseys': expected_jerseys,
                    'detected_jerseys': [],
                    'true_positives': [],
                    'false_positives': [],
                    'false_negatives': expected_jerseys,
                    'precision': 0.0,
                    'recall': 0.0,
                    'f1_score': 0.0
                }

        except Exception as e:
            processing_time = time.time() - start_time
            filename = Path(image_path).name
            expected_jerseys = parse_expected_jerseys(filename)
            return {
                'image_path': image_path,
                'error': f'Request error: {e}',
                'jerseys': [],
                'processing_time': processing_time,
                'resized': resized,
                'original_size': original_size,
                'final_size': final_size,
                'expected_jerseys': expected_jerseys,
                'detected_jerseys': [],
                'true_positives': [],
                'false_positives': [],
                'false_negatives': expected_jerseys,
                'precision': 0.0,
                'recall': 0.0,
                'f1_score': 0.0,
                'avg_confidence_correct': None,
                'avg_confidence_incorrect': None,
                'confidence_correct_count': 0,
                'confidence_incorrect_count': 0
            }

    def test_directory(self, directory_path: str) -> List[Dict[str, Any]]:
        """
        Test all images in a directory.

        Args:
            directory_path: Path to directory containing images

        Returns:
            List of results for all images
        """
        # Get all image files
        image_extensions = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp'}
        image_files = []

        for ext in image_extensions:
            image_files.extend(Path(directory_path).glob(f'*{ext}'))
            image_files.extend(Path(directory_path).glob(f'*{ext.upper()}'))

        # De-duplicate before sorting: on case-insensitive filesystems the
        # lower- and upper-case globs can match the same file twice.
        image_files = sorted(set(image_files))

        if not image_files:
            print(f"No image files found in {directory_path}")
            return []

        print(f"Found {len(image_files)} images to process\n")

        # Process each image
        results = []
        for i, image_path in enumerate(image_files, 1):
            # Show model tag in progress if using llama-swap
            model_info = f" [{self.model_tag}]" if self.model_tag else ""
            print(f"[{i}/{len(image_files)}]{model_info} Processing {image_path.name}...")
            result = self.test_image(str(image_path))
            results.append(result)

            # Display result
            self._display_result(result)
            print()

        return results

    def _display_result(self, result: Dict[str, Any]):
        """Display the result for a single image."""
        if result.get('error'):
            print(f"  ❌ Error: {result['error']}")
            if 'raw_response' in result:
                print(f"  Cleaned response: {result['raw_response']}...")
                if result.get('original_response'):
                    print(f"  (Think tags and/or markdown were filtered from response)")
        else:
            jerseys = result.get('jerseys', [])
            hallucinated_count = result.get('hallucinated_count', 0)

            if jerseys:
                print(f"  ✓ Found {len(jerseys)} jersey(s):")
                for jersey in jerseys:
                    number = jersey.get('jersey_number', 'N/A')
                    jersey_color = jersey.get('jersey_color', 'N/A')
                    number_color = jersey.get('number_color', 'N/A')
                    confidence = jersey.get('confidence', None)

                    conf_str = f" (confidence: {confidence})" if confidence is not None else ""
                    print(f"    - #{number}: {jersey_color} jersey, {number_color} number{conf_str}")
            else:
                print(f"  ○ No jerseys detected")

            if hallucinated_count > 0:
                print(f"  ⚠ Filtered {hallucinated_count} hallucinated detection(s)")

            # Display ground truth comparison
            expected = result.get('expected_jerseys', [])
            detected = result.get('detected_jerseys', [])
            true_positives = result.get('true_positives', [])
            false_positives = result.get('false_positives', [])
            false_negatives = result.get('false_negatives', [])

            if expected:
                print(f"  Ground truth: Expected {expected}, Detected {detected}")
                if true_positives:
                    print(f"    ✓ Correct: {true_positives}")
                if false_positives:
                    print(f"    ✗ False positives: {false_positives}")
                if false_negatives:
                    print(f"    ✗ Missed: {false_negatives}")
                precision = result.get('precision', 0.0)
                recall = result.get('recall', 0.0)
                f1 = result.get('f1_score', 0.0)
                print(f"  Precision: {precision:.2%}, Recall: {recall:.2%}, F1: {f1:.2%}")

        print(f"  Processing time: {result['processing_time']:.2f}s")

    def save_results_to_file(self, results: List[Dict[str, Any]], prompt_file: str, output_file: str = "jersey_detection_results.jsonl"):
        """
        Save test results to a JSON Lines file for later analysis.

        Args:
            results: List of all test results
            prompt_file: Path to the prompt file used
            output_file: Path to output file (default: jersey_detection_results.jsonl)
        """
        # Calculate summary statistics
        total_images = len(results)
        images_with_errors = sum(1 for r in results if r.get('error'))
        images_with_jerseys = sum(1 for r in results if not r.get('error') and len(r.get('jerseys', [])) > 0)
        images_without_jerseys = sum(1 for r in results if not r.get('error') and len(r.get('jerseys', [])) == 0)
        total_jerseys = sum(len(r.get('jerseys', [])) for r in results if not r.get('error'))
        total_hallucinated = sum(r.get('hallucinated_count', 0) for r in results if not r.get('error'))
        total_raw_detections = total_jerseys + total_hallucinated
        total_processing_time = sum(r.get('processing_time', 0) for r in results)
        avg_processing_time = total_processing_time / total_images if total_images > 0 else 0

        # Collect confidence statistics if available
        confidences = [
            jersey.get('confidence')
            for r in results if not r.get('error')
            for jersey in r.get('jerseys', [])
            if 'confidence' in jersey and jersey.get('confidence') is not None
        ]

        confidence_stats = None
        if confidences:
            buckets = {
                '90-100': sum(1 for c in confidences if 90 <= c <= 100),
                '70-89': sum(1 for c in confidences if 70 <= c <= 89),
                '50-69': sum(1 for c in confidences if 50 <= c <= 69),
                '30-49': sum(1 for c in confidences if 30 <= c <= 49),
                '0-29': sum(1 for c in confidences if 0 <= c <= 29)
            }
            confidence_stats = {
                'avg': sum(confidences) / len(confidences),
                'min': min(confidences),
                'max': max(confidences),
                'count': len(confidences),
                'distribution': buckets
            }

        # Calculate resize statistics
        images_resized = sum(1 for r in results if r.get('resized', False))

        # Calculate ground truth statistics
        results_without_errors = [r for r in results if not r.get('error')]
        total_expected_jerseys = sum(len(r.get('expected_jerseys', [])) for r in results_without_errors)
        total_true_positives = sum(len(r.get('true_positives', [])) for r in results_without_errors)
        total_false_positives = sum(len(r.get('false_positives', [])) for r in results_without_errors)
        total_false_negatives = sum(len(r.get('false_negatives', [])) for r in results_without_errors)

        # Calculate overall precision, recall, F1
        overall_precision = total_true_positives / (total_true_positives + total_false_positives) if (total_true_positives + total_false_positives) > 0 else 0.0
        overall_recall = total_true_positives / (total_true_positives + total_false_negatives) if (total_true_positives + total_false_negatives) > 0 else 0.0
        overall_f1 = 2 * (overall_precision * overall_recall) / (overall_precision + overall_recall) if (overall_precision + overall_recall) > 0 else 0.0

        # Average per-image metrics
        avg_precision = sum(r.get('precision', 0.0) for r in results_without_errors) / len(results_without_errors) if results_without_errors else 0.0
        avg_recall = sum(r.get('recall', 0.0) for r in results_without_errors) / len(results_without_errors) if results_without_errors else 0.0
        avg_f1 = sum(r.get('f1_score', 0.0) for r in results_without_errors) / len(results_without_errors) if results_without_errors else 0.0

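        # Note: the "overall" metrics are micro-averaged (pooled over every
        # jersey in the run), while the "avg" metrics are macro-averaged
        # (mean of the per-image scores), so the two can legitimately differ.
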
        # Calculate confidence calibration metrics (correct vs incorrect detections)
        all_confidence_correct = []
        all_confidence_incorrect = []
        for r in results_without_errors:
            if r.get('avg_confidence_correct') is not None:
                # Weight by the count of correct detections in this image
                count = r.get('confidence_correct_count', 0)
                avg_conf = r.get('avg_confidence_correct')
                all_confidence_correct.extend([avg_conf] * count)
            if r.get('avg_confidence_incorrect') is not None:
                # Weight by the count of incorrect detections in this image
                count = r.get('confidence_incorrect_count', 0)
                avg_conf = r.get('avg_confidence_incorrect')
                all_confidence_incorrect.extend([avg_conf] * count)

        overall_avg_confidence_correct = sum(all_confidence_correct) / len(all_confidence_correct) if all_confidence_correct else None
        overall_avg_confidence_incorrect = sum(all_confidence_incorrect) / len(all_confidence_incorrect) if all_confidence_incorrect else None

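        # extend([avg_conf] * count) rebuilds a per-detection list from the
        # stored per-image averages, so the overall mean above is weighted by
        # how many detections each image contributed.
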
        # Create summary record
        summary_record = {
            'timestamp': datetime.now().isoformat(),
            'model_name': self.model_name,
            'model_tag': self.model_tag,
            'prompt_file': prompt_file,
            'prompt_length': len(self.prompt),
            'total_images': total_images,
            'images_with_jerseys': images_with_jerseys,
            'images_without_jerseys': images_without_jerseys,
            'images_with_errors': images_with_errors,
            'total_raw_detections': total_raw_detections,
            'total_valid_jerseys': total_jerseys,
            'total_hallucinated': total_hallucinated,
            'avg_processing_time': avg_processing_time,
            'total_processing_time': total_processing_time,
            'confidence_stats': confidence_stats,
            'empty_response_capable': images_without_jerseys > 0,
            'resize_enabled': self.resize_max is not None,
            'resize_max': self.resize_max,
            'images_resized': images_resized,
            # Ground truth statistics
            'ground_truth': {
                'total_expected': total_expected_jerseys,
                'total_true_positives': total_true_positives,
                'total_false_positives': total_false_positives,
                'total_false_negatives': total_false_negatives,
                'overall_precision': overall_precision,
                'overall_recall': overall_recall,
                'overall_f1': overall_f1,
                'avg_precision': avg_precision,
                'avg_recall': avg_recall,
                'avg_f1': avg_f1,
                # Confidence calibration
                'avg_confidence_correct': overall_avg_confidence_correct,
                'avg_confidence_incorrect': overall_avg_confidence_incorrect,
                'confidence_correct_count': len(all_confidence_correct),
                'confidence_incorrect_count': len(all_confidence_incorrect)
            }
        }

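        # Each run appends one summary line, so the JSONL file accumulates a
        # history of runs that the companion analysis script can compare
        # across models and prompts.
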
        # Append to file
        try:
            with open(output_file, 'a') as f:
                f.write(json.dumps(summary_record) + '\n')
            print(f"\n✓ Results saved to {output_file}")
        except Exception as e:
            print(f"\n❌ Failed to save results: {e}")

    def print_summary(self, results: List[Dict[str, Any]]):
        """
        Print summary statistics for all results.

        Args:
            results: List of all test results
        """
        print("=" * 70)
        print("SUMMARY")
        print("=" * 70)
        print(f"\nModel: {self.model_name}")
        if self.model_tag:
            print(f"Model tag: {self.model_tag}")

        # Display resize info
        if self.resize_max:
            images_resized = sum(1 for r in results if r.get('resized', False))
            print(f"Resize: Enabled (max: {self.resize_max}px, {images_resized} images resized)")
        else:
            print(f"Resize: Disabled")

        total_images = len(results)
        images_with_errors = sum(1 for r in results if r.get('error'))
        images_with_jerseys = sum(1 for r in results if not r.get('error') and len(r.get('jerseys', [])) > 0)
        images_without_jerseys = sum(1 for r in results if not r.get('error') and len(r.get('jerseys', [])) == 0)
        total_jerseys = sum(len(r.get('jerseys', [])) for r in results if not r.get('error'))
        total_hallucinated = sum(r.get('hallucinated_count', 0) for r in results if not r.get('error'))
        total_raw_detections = total_jerseys + total_hallucinated
        total_processing_time = sum(r.get('processing_time', 0) for r in results)
        avg_processing_time = total_processing_time / total_images if total_images > 0 else 0

print(f"\nTotal images processed: {total_images}")
|
||||||
|
print(f" - Images with jerseys: {images_with_jerseys} ({images_with_jerseys/total_images*100:.1f}%)")
|
||||||
|
print(f" - Images without jerseys: {images_without_jerseys} ({images_without_jerseys/total_images*100:.1f}%)")
|
||||||
|
print(f" - Images with errors: {images_with_errors} ({images_with_errors/total_images*100:.1f}%)")
|
||||||
|
print(f"\nJersey detections:")
|
||||||
|
print(f" - Total raw detections: {total_raw_detections}")
|
||||||
|
print(f" - Valid jerseys (after filtering): {total_jerseys}")
|
||||||
|
print(f" - Hallucinations filtered out: {total_hallucinated}")
|
||||||
|
if images_with_jerseys > 0:
|
||||||
|
print(f" - Average valid jerseys per image (when detected): {total_jerseys/images_with_jerseys:.2f}")
|
||||||
|
|
||||||
|
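        # A model that cannot return an empty result will hallucinate numbers
        # on jersey-free frames, inflating false positives; the block below
        # checks whether any empty responses were produced.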
        # Empty response capability (important for evaluating model's ability to return empty results)
        print(f"\nEmpty response capability:")
        print(f"  - Empty responses returned: {images_without_jerseys}")
        print(f"  - Percentage of images: {images_without_jerseys/total_images*100:.1f}%")
        print(f"  - Model can return empty results: {'✓ Yes' if images_without_jerseys > 0 else '✗ No (potential issue)'}")

        if total_hallucinated > 0:
            print(f"\nHallucination detection:")
            print(f"  - Total hallucinated detections filtered: {total_hallucinated}")
            images_with_hallucinations = sum(1 for r in results if not r.get('error') and r.get('hallucinated_count', 0) > 0)
            print(f"  - Images with hallucinations: {images_with_hallucinations} ({images_with_hallucinations/total_images*100:.1f}%)")

        # Ground truth statistics
        results_without_errors = [r for r in results if not r.get('error')]
        total_expected_jerseys = sum(len(r.get('expected_jerseys', [])) for r in results_without_errors)

        if total_expected_jerseys > 0:
            total_true_positives = sum(len(r.get('true_positives', [])) for r in results_without_errors)
            total_false_positives = sum(len(r.get('false_positives', [])) for r in results_without_errors)
            total_false_negatives = sum(len(r.get('false_negatives', [])) for r in results_without_errors)

            # Calculate overall metrics
            overall_precision = total_true_positives / (total_true_positives + total_false_positives) if (total_true_positives + total_false_positives) > 0 else 0.0
            overall_recall = total_true_positives / (total_true_positives + total_false_negatives) if (total_true_positives + total_false_negatives) > 0 else 0.0
            overall_f1 = 2 * (overall_precision * overall_recall) / (overall_precision + overall_recall) if (overall_precision + overall_recall) > 0 else 0.0

            # Calculate average per-image metrics
            avg_precision = sum(r.get('precision', 0.0) for r in results_without_errors) / len(results_without_errors) if results_without_errors else 0.0
            avg_recall = sum(r.get('recall', 0.0) for r in results_without_errors) / len(results_without_errors) if results_without_errors else 0.0
            avg_f1 = sum(r.get('f1_score', 0.0) for r in results_without_errors) / len(results_without_errors) if results_without_errors else 0.0

            print(f"\nGround truth performance:")
            print(f"  - Total expected jerseys: {total_expected_jerseys}")
            print(f"  - True positives: {total_true_positives}")
            print(f"  - False positives: {total_false_positives}")
            print(f"  - False negatives: {total_false_negatives}")
            print(f"\n  Overall metrics (across all jerseys):")
            print(f"  - Precision: {overall_precision:.2%}")
            print(f"  - Recall: {overall_recall:.2%}")
            print(f"  - F1 Score: {overall_f1:.2%}")
            print(f"\n  Average per-image metrics:")
            print(f"  - Avg Precision: {avg_precision:.2%}")
            print(f"  - Avg Recall: {avg_recall:.2%}")
            print(f"  - Avg F1 Score: {avg_f1:.2%}")

            # Confidence calibration metrics
            all_confidence_correct = []
            all_confidence_incorrect = []
            for r in results_without_errors:
                if r.get('avg_confidence_correct') is not None:
                    count = r.get('confidence_correct_count', 0)
                    avg_conf = r.get('avg_confidence_correct')
                    all_confidence_correct.extend([avg_conf] * count)
                if r.get('avg_confidence_incorrect') is not None:
                    count = r.get('confidence_incorrect_count', 0)
                    avg_conf = r.get('avg_confidence_incorrect')
                    all_confidence_incorrect.extend([avg_conf] * count)

            if all_confidence_correct or all_confidence_incorrect:
                print(f"\n  Confidence calibration:")
                if all_confidence_correct:
                    avg_conf_correct = sum(all_confidence_correct) / len(all_confidence_correct)
                    print(f"  - Avg confidence (correct detections): {avg_conf_correct:.2f} ({len(all_confidence_correct)} detections)")
                else:
                    print(f"  - Avg confidence (correct detections): N/A (no correct detections with confidence)")

                if all_confidence_incorrect:
                    avg_conf_incorrect = sum(all_confidence_incorrect) / len(all_confidence_incorrect)
                    print(f"  - Avg confidence (incorrect detections): {avg_conf_incorrect:.2f} ({len(all_confidence_incorrect)} detections)")

                    # Show confidence difference
                    if all_confidence_correct:
                        avg_conf_correct = sum(all_confidence_correct) / len(all_confidence_correct)
                        diff = avg_conf_correct - avg_conf_incorrect
                        if diff > 0:
                            print(f"  - Confidence difference: +{diff:.2f} (correct > incorrect, good calibration)")
                        else:
                            print(f"  - Confidence difference: {diff:.2f} (⚠ incorrect ≥ correct, poor calibration)")
                else:
                    print(f"  - Avg confidence (incorrect detections): N/A (no incorrect detections with confidence)")

print(f"\nProcessing time:")
|
||||||
|
print(f" - Total: {total_processing_time:.2f}s")
|
||||||
|
print(f" - Average per image: {avg_processing_time:.2f}s")
|
||||||
|
|
||||||
|
# Check for confidence values
|
||||||
|
has_confidence = any(
|
||||||
|
any('confidence' in jersey for jersey in r.get('jerseys', []))
|
||||||
|
for r in results if not r.get('error')
|
||||||
|
)
|
||||||
|
|
||||||
|
if has_confidence:
|
||||||
|
print(f"\nConfidence statistics:")
|
||||||
|
confidences = [
|
||||||
|
jersey.get('confidence')
|
||||||
|
for r in results if not r.get('error')
|
||||||
|
for jersey in r.get('jerseys', [])
|
||||||
|
if 'confidence' in jersey and jersey.get('confidence') is not None
|
||||||
|
]
|
||||||
|
if confidences:
|
||||||
|
avg_confidence = sum(confidences) / len(confidences)
|
||||||
|
min_confidence = min(confidences)
|
||||||
|
max_confidence = max(confidences)
|
||||||
|
print(f" - Total detections with confidence: {len(confidences)}")
|
||||||
|
print(f" - Average confidence: {avg_confidence:.2f}")
|
||||||
|
print(f" - Min confidence: {min_confidence:.2f}")
|
||||||
|
print(f" - Max confidence: {max_confidence:.2f}")
|
||||||
|
|
||||||
|
# Confidence distribution by bucket
|
||||||
|
print(f"\n Confidence distribution:")
|
||||||
|
buckets = {
|
||||||
|
'90-100 (Extremely clear)': (90, 100),
|
||||||
|
'70-89 (Clear, minor issues)': (70, 89),
|
||||||
|
'50-69 (Partially visible)': (50, 69),
|
||||||
|
'30-49 (Difficult to read)': (30, 49),
|
||||||
|
'0-29 (Very uncertain)': (0, 29)
|
||||||
|
}
|
||||||
|
|
||||||
|
for bucket_name, (min_val, max_val) in buckets.items():
|
||||||
|
count = sum(1 for c in confidences if min_val <= c <= max_val)
|
||||||
|
percentage = (count / len(confidences) * 100) if len(confidences) > 0 else 0
|
||||||
|
bar_length = int(percentage / 2) # Scale to max 50 chars
|
||||||
|
bar = '█' * bar_length
|
||||||
|
print(f" {bucket_name}: {count:3d} ({percentage:5.1f}%) {bar}")
|
||||||
|
|
||||||
|
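        # The bucket labels are assumed to mirror the rubric in the
        # confidence-scoring prompt; keep them in sync if that prompt changes.
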
        # List errors if any
        if images_with_errors > 0:
            print(f"\nErrors encountered:")
            for r in results:
                if r.get('error'):
                    print(f"  - {Path(r['image_path']).name}: {r['error']}")

        print()


def main():
    """Main entry point for the test script."""
    # Get default server URL from config
    default_server_url = get_llama_server_url_from_config() or 'http://192.168.1.34:8080'

    parser = argparse.ArgumentParser(
        description='Test jersey detection with different models and prompts',
        formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument('image_directory', help='Path to directory containing test images')
    parser.add_argument('prompt_file', help='Path to text file containing the prompt')
    parser.add_argument('--model-name', default=None,
                        help='Name of the model being tested (auto-detected from server if not provided)')
    parser.add_argument('--server-url', default=default_server_url,
                        help=f'llama.cpp server URL (default: {default_server_url})')
    parser.add_argument('--output-file', default='jersey_detection_results.jsonl',
                        help='Output file for results (default: jersey_detection_results.jsonl)')
    parser.add_argument('--resize', type=int, default=None, metavar='MAX_SIZE',
                        help='Resize images to maximum dimension (e.g., 1024) before processing')
    parser.add_argument('--model-tag', default=None,
                        help='Model tag for llama-swap (e.g., "qwen2.5-vl-7b"). If not specified, uses whatever model is loaded.')

    args = parser.parse_args()

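    # Example invocation (paths, URL, and model tag are illustrative):
    #   python test_jersey_detection.py ./test_images jersey_prompt_with_confidence.txt \
    #       --server-url http://localhost:8080 --resize 1024 --model-tag qwen2.5-vl-7b
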
    # Validate inputs
    if not os.path.isdir(args.image_directory):
        print(f"Error: Directory not found: {args.image_directory}")
        sys.exit(1)

    if not os.path.isfile(args.prompt_file):
        print(f"Error: Prompt file not found: {args.prompt_file}")
        sys.exit(1)

    # Load prompt
    try:
        with open(args.prompt_file, 'r') as f:
            prompt = f.read()
    except Exception as e:
        print(f"Error reading prompt file: {e}")
        sys.exit(1)

    # Print test configuration
    print("=" * 70)
    print("JERSEY DETECTION TEST")
    print("=" * 70)
    print(f"Model name: {args.model_name if args.model_name else '(auto-detect)'}")
    print(f"Model tag: {args.model_tag if args.model_tag else 'None (use loaded model)'}")
    print(f"Server URL: {args.server_url}")
    print(f"Image directory: {args.image_directory}")
    print(f"Prompt file: {args.prompt_file}")
    print(f"Prompt length: {len(prompt)} characters")
    print(f"Output file: {args.output_file}")
    print(f"Resize images: {f'Yes (max: {args.resize}px)' if args.resize else 'No'}")
    print("=" * 70)
    print()

    # Check server health
    print("Checking server health...")
    try:
        client = LlamaCppClient(base_url=args.server_url)

        # Try health check (handle both JSON and plain text responses)
        try:
            health = client.health_check()
            print(f"✓ Server is healthy: {health}")
        except json.JSONDecodeError:
            # llama-swap returns plain text "OK" instead of JSON
            # (timeout added so a dead server can't hang the health check)
            response = requests.get(f"{args.server_url}/health", timeout=10)
            response.raise_for_status()
            print(f"✓ Server is healthy: {response.text}")

        # Determine model name to use
        model_name = args.model_name

        # If model_tag is provided, use it as the model name (unless user explicitly provided a model_name)
        if args.model_tag and not args.model_name:
            model_name = args.model_tag
            print(f"✓ Using model tag as model name: {model_name}")
        elif not model_name:
            # Only auto-detect if neither model_tag nor model_name was provided
            detected_model_name = None
            try:
                models = client.get_models()
                if 'data' in models and len(models['data']) > 0:
                    model_id = models['data'][0].get('id', 'unknown')
                    print(f"✓ Active model: {model_id}")

                    # Extract just the model filename (without path)
                    if model_id and model_id != 'unknown':
                        # Remove path and get base filename
                        model_filename = os.path.basename(model_id)
                        # Remove common extensions (.gguf, .bin, etc.)
                        model_name_no_ext = os.path.splitext(model_filename)[0]
                        detected_model_name = model_name_no_ext
            except Exception:
                pass

            if detected_model_name:
                model_name = detected_model_name
                print(f"✓ Using auto-detected model name: {model_name}")
            else:
                model_name = "unknown"
                print(f"⚠ Could not detect model name, using 'unknown'")
        else:
            # User explicitly provided model_name
            print(f"✓ Using provided model name: {model_name}")

    except Exception as e:
        print(f"❌ Failed to connect to server: {e}")
        print(f"Make sure llama.cpp server is running at {args.server_url}")
        sys.exit(1)

    print()

    # Show model tag info if using llama-swap
    if args.model_tag:
        print(f"Requesting model from llama-swap: {args.model_tag}")

        # Check currently running models on llama-swap
        try:
            running_response = requests.get(f"{args.server_url}/running", timeout=10)
            if running_response.status_code == 200:
                try:
                    running_models = running_response.json()
                    if running_models:
                        print(f"Currently running models: {running_models}")
                except Exception:
                    pass
        except Exception:
            pass

        print()

    # Run tests
    tester = JerseyDetectionTester(args.server_url, prompt, model_name, args.resize, args.model_tag)
    results = tester.test_directory(args.image_directory)

    # Print summary
    if results:
        tester.print_summary(results)

        # Save results to file
        tester.save_results_to_file(results, args.prompt_file, args.output_file)


if __name__ == '__main__':
    main()