Initial commit: Jersey detection test suite
Test scripts and utilities for evaluating vision-language models on jersey number detection using llama.cpp server.
663
analyze_jersey_results.py
Executable file
@ -0,0 +1,663 @@
#!/usr/bin/env python3
"""
Analyze jersey detection test results and compare model performance.

Usage:
    python analyze_jersey_results.py [results_file]
    python analyze_jersey_results.py [results_file] --csv output.csv
    python analyze_jersey_results.py [results_file] --csv-only output.csv

Arguments:
    results_file: Path to the results file (default: jersey_detection_results.jsonl)
    --csv: Also export results to CSV file
    --csv-only: Export to CSV only, skip analysis display
"""

import argparse
import csv
import json
import sys
from pathlib import Path
from typing import List, Dict, Any
from datetime import datetime

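
# Illustrative sketch of one record in the JSONL results file (one JSON object per
# line). The real records are written by the test harness (test_jersey_detection.py);
# the keys below are simply the ones this script reads, and the values are made up:
#
#   {"timestamp": "2024-01-01T12:00:00", "model_name": "example-vlm",
#    "prompt_file": "prompts/basic.txt", "total_images": 100,
#    "images_with_jerseys": 60, "images_without_jerseys": 40,
#    "total_valid_jerseys": 85, "total_hallucinated": 5,
#    "avg_processing_time": 2.5, "resize_enabled": true, "resize_max": 1024,
#    "confidence_stats": {"avg": 82.0, "min": 40, "max": 99, "count": 90,
#                         "distribution": {"90-100": 30, "70-89": 40, "50-69": 15,
#                                          "30-49": 5, "0-29": 0}},
#    "ground_truth": {"overall_precision": 0.9, "overall_recall": 0.85,
#                     "overall_f1": 0.87, "avg_confidence_correct": 88.0,
#                     "avg_confidence_incorrect": 65.0}}
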
def load_results(results_file: str) -> List[Dict[str, Any]]:
    """Load test results from a JSON Lines file."""
    results = []
    try:
        with open(results_file, 'r') as f:
            for line in f:
                line = line.strip()
                if line:
                    results.append(json.loads(line))
        return results
    except FileNotFoundError:
        print(f"Error: Results file not found: {results_file}")
        sys.exit(1)
    except json.JSONDecodeError as e:
        print(f"Error: Invalid JSON in results file: {e}")
        sys.exit(1)

def calculate_confidence_stdev(conf_stats: Dict[str, Any]) -> tuple:
    """
    Calculate standard deviation of confidence scores from distribution.

    Returns:
        Tuple of (stdev, quality_rating)
        quality_rating: "Excellent", "Good", "Fair", "Poor", or "N/A"
    """
    if not conf_stats or 'distribution' not in conf_stats:
        return None, "N/A"

    dist = conf_stats['distribution']

    # Reconstruct approximate confidence values from buckets
    # Use midpoint of each bucket
    values = []
    bucket_midpoints = {
        '90-100': 95,
        '70-89': 79.5,
        '50-69': 59.5,
        '30-49': 39.5,
        '0-29': 14.5
    }

    for bucket, count in dist.items():
        midpoint = bucket_midpoints.get(bucket, 50)
        values.extend([midpoint] * count)

    if len(values) < 2:
        return None, "N/A"

    # Calculate standard deviation
    import math
    mean = sum(values) / len(values)
    variance = sum((x - mean) ** 2 for x in values) / len(values)
    stdev = math.sqrt(variance)

    # Assign quality rating based on StDev
    if stdev < 5:
        quality = "Poor"
    elif stdev < 10:
        quality = "Fair"
    elif stdev < 15:
        quality = "Good"
    else:
        quality = "Excel"  # Shortened for table

    return stdev, quality

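
# Worked example (illustrative, made-up counts): for a distribution of
# {'90-100': 2, '0-29': 2} the reconstructed values are [95, 95, 14.5, 14.5],
# the mean is 54.75, and the population standard deviation is 40.25, which maps
# to the "Excel" rating. A distribution concentrated in a single bucket, e.g.
# {'90-100': 4}, yields a stdev of 0 and is rated "Poor", because the model never
# varies its reported confidence.
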
def print_ascii_comparison_table(results: List[Dict[str, Any]]):
    """Print a detailed ASCII comparison table of all test runs."""
    if not results:
        print("No results to display.")
        return

    print("=" * 280)
    print("DETAILED MODEL COMPARISON TABLE")
    print("=" * 280)
    print()
    print("Confidence Quality: Excellent (>=15), Good (10-15), Fair (5-10), Poor (<5)")
    print("Confidence Calibration: Conf✓ = avg confidence on correct detections, Conf✗ = avg confidence on incorrect detections")
    print()

    # Table headers with ground truth and confidence calibration columns
    print("┌" + "─" * 22 + "┬" + "─" * 10 + "┬" + "─" * 8 + "┬" + "─" * 8 + "┬" + "─" * 8 + "┬" + "─" * 10 + "┬" + "─" * 10 + "┬" + "─" * 10 + "┬" + "─" * 8 + "┬" + "─" * 8 + "┬" + "─" * 12 + "┬" + "─" * 10 + "┬" + "─" * 10 + "┬" + "─" * 10 + "┬" + "─" * 10 + "┬" + "─" * 10 + "┬" + "─" * 21 + "┐")
    print("│ {:<20} │ {:^8} │ {:^6} │ {:^6} │ {:^6} │ {:^8} │ {:^8} │ {:^8} │ {:^6} │ {:^6} │ {:^10} │ {:^8} │ {:^8} │ {:^8} │ {:^8} │ {:^8} │ {:^19} │".format(
        "Model", "Prompt", "Images", "Valid", "Hallu", "Empty%", "Hallu%", "AvgTime", "Resize", "Conf?", "Conf Qual", "Prec%", "Recall%", "F1%", "Conf✓", "Conf✗", "Date"
    ))
    print("├" + "─" * 22 + "┼" + "─" * 10 + "┼" + "─" * 8 + "┼" + "─" * 8 + "┼" + "─" * 8 + "┼" + "─" * 10 + "┼" + "─" * 10 + "┼" + "─" * 10 + "┼" + "─" * 8 + "┼" + "─" * 8 + "┼" + "─" * 12 + "┼" + "─" * 10 + "┼" + "─" * 10 + "┼" + "─" * 10 + "┼" + "─" * 10 + "┼" + "─" * 10 + "┼" + "─" * 21 + "┤")

    # Data rows
    for i, result in enumerate(results):
        model = result.get('model_name', 'unknown')[:20]
        prompt = Path(result.get('prompt_file', 'unknown')).stem[:8]
        total_images = result.get('total_images', 0)
        valid_jerseys = result.get('total_valid_jerseys', 0)
        hallucinated = result.get('total_hallucinated', 0)
        total_detections = valid_jerseys + hallucinated
        empty_pct = (result.get('images_without_jerseys', 0) / total_images * 100) if total_images > 0 else 0
        hallu_pct = (hallucinated / total_detections * 100) if total_detections > 0 else 0
        avg_time = result.get('avg_processing_time', 0)

        # Calculate confidence quality
        conf_stats = result.get('confidence_stats')
        has_conf = 'Yes' if conf_stats else 'No'
        stdev, quality = calculate_confidence_stdev(conf_stats)

        # Format confidence quality display
        if stdev is not None:
            conf_qual_str = f"{quality} ({stdev:.1f})"
        else:
            conf_qual_str = "N/A"

        # Ground truth metrics
        gt = result.get('ground_truth', {})
        precision = gt.get('overall_precision', 0) * 100
        recall = gt.get('overall_recall', 0) * 100
        f1 = gt.get('overall_f1', 0) * 100

        # Confidence calibration
        conf_correct = gt.get('avg_confidence_correct')
        conf_incorrect = gt.get('avg_confidence_incorrect')
        conf_correct_str = f"{conf_correct:.1f}" if conf_correct is not None else "N/A"
        conf_incorrect_str = f"{conf_incorrect:.1f}" if conf_incorrect is not None else "N/A"

        resize_max = result.get('resize_max')
        resize_str = f"{resize_max}px" if resize_max else "No"
        timestamp = datetime.fromisoformat(result.get('timestamp', '')).strftime('%Y-%m-%d %H:%M')

        print("│ {:<20} │ {:>8} │ {:>6} │ {:>6} │ {:>6} │ {:>7.1f}% │ {:>7.1f}% │ {:>7.2f}s │ {:>6} │ {:>6} │ {:>10} │ {:>7.1f}% │ {:>7.1f}% │ {:>7.1f}% │ {:>8} │ {:>8} │ {:>19} │".format(
            model, prompt, total_images, valid_jerseys, hallucinated, empty_pct, hallu_pct, avg_time, resize_str, has_conf, conf_qual_str, precision, recall, f1, conf_correct_str, conf_incorrect_str, timestamp
        ))

    # Bottom border
    print("└" + "─" * 22 + "┴" + "─" * 10 + "┴" + "─" * 8 + "┴" + "─" * 8 + "┴" + "─" * 8 + "┴" + "─" * 10 + "┴" + "─" * 10 + "┴" + "─" * 10 + "┴" + "─" * 8 + "┴" + "─" * 8 + "┴" + "─" * 12 + "┴" + "─" * 10 + "┴" + "─" * 10 + "┴" + "─" * 10 + "┴" + "─" * 10 + "┴" + "─" * 10 + "┴" + "─" * 21 + "┘")

    print()

def print_comparison_table(results: List[Dict[str, Any]]):
    """Print a simple comparison table of all test runs."""
    if not results:
        print("No results to display.")
        return

    print("=" * 140)
    print("MODEL COMPARISON TABLE")
    print("=" * 140)
    print()

    # Header
    header = f"{'Model':<25} {'Prompt':<30} {'Images':<8} {'Valid':<8} {'Hallu':<8} {'Empty%':<9} {'AvgTime':<9} {'Resize':<8} {'Conf?':<7} {'Date':<20}"
    print(header)
    print("-" * 150)

    # Data rows
    for result in results:
        model = result.get('model_name', 'unknown')[:24]
        prompt = Path(result.get('prompt_file', 'unknown')).stem[:29]
        total_images = result.get('total_images', 0)
        valid_jerseys = result.get('total_valid_jerseys', 0)
        hallucinated = result.get('total_hallucinated', 0)
        empty_pct = (result.get('images_without_jerseys', 0) / total_images * 100) if total_images > 0 else 0
        avg_time = result.get('avg_processing_time', 0)
        has_conf = 'Yes' if result.get('confidence_stats') else 'No'
        resize_max = result.get('resize_max')
        resize_str = f"{resize_max}px" if resize_max else "No"
        timestamp = datetime.fromisoformat(result.get('timestamp', '')).strftime('%Y-%m-%d %H:%M:%S')

        row = f"{model:<25} {prompt:<30} {total_images:<8} {valid_jerseys:<8} {hallucinated:<8} {empty_pct:<8.1f}% {avg_time:<8.2f}s {resize_str:<8} {has_conf:<7} {timestamp:<20}"
        print(row)

    print()

def print_model_performance_chart(results: List[Dict[str, Any]]):
    """Print a performance chart showing key metrics for each model."""
    if not results:
        return

    print("=" * 140)
    print("MODEL PERFORMANCE CHART")
    print("=" * 140)
    print()

    # Group results by model
    models = {}
    for result in results:
        model_name = result.get('model_name', 'unknown')
        if model_name not in models:
            models[model_name] = []
        models[model_name].append(result)

    # Calculate aggregate statistics for each model
    for model_name, model_results in models.items():
        print(f"\n{model_name}")
        print("-" * 100)

        total_runs = len(model_results)
        total_images = sum(r.get('total_images', 0) for r in model_results)
        total_valid = sum(r.get('total_valid_jerseys', 0) for r in model_results)
        total_hallu = sum(r.get('total_hallucinated', 0) for r in model_results)
        avg_empty_pct = sum((r.get('images_without_jerseys', 0) / r.get('total_images', 1) * 100) for r in model_results) / total_runs if total_runs > 0 else 0
        avg_time = sum(r.get('avg_processing_time', 0) for r in model_results) / total_runs if total_runs > 0 else 0

        # Check if any runs have confidence stats
        has_confidence = any(r.get('confidence_stats') for r in model_results)

        # Check resize status
        resize_enabled = any(r.get('resize_enabled', False) for r in model_results)
        resize_max_values = [r.get('resize_max') for r in model_results if r.get('resize_max')]
        resize_info = f"{resize_max_values[0]}px" if resize_max_values else "Disabled"

        print(f" Total test runs: {total_runs}")
        print(f" Total images processed: {total_images}")
        print(f" Total valid detections: {total_valid}")
        print(f" Total hallucinations: {total_hallu}")
        print(f" Average empty response rate: {avg_empty_pct:.1f}%")
        print(f" Average processing time: {avg_time:.2f}s/image")
        print(f" Resize: {resize_info}")
        print(f" Confidence support: {'Yes' if has_confidence else 'No'}")

        # Show hallucination rate
        if total_valid + total_hallu > 0:
            hallu_rate = (total_hallu / (total_valid + total_hallu) * 100)
            print(f" Hallucination rate: {hallu_rate:.1f}%")

            # Visual bar
            bar_length = int(hallu_rate / 2)  # Scale to max 50 chars
            bar = '█' * bar_length
            print(f" Hallucination chart: {bar} ({hallu_rate:.1f}%)")

        # Ground truth performance
        gt_runs = [r for r in model_results if r.get('ground_truth')]
        if gt_runs:
            avg_precision = sum(r['ground_truth'].get('overall_precision', 0) for r in gt_runs) / len(gt_runs)
            avg_recall = sum(r['ground_truth'].get('overall_recall', 0) for r in gt_runs) / len(gt_runs)
            avg_f1 = sum(r['ground_truth'].get('overall_f1', 0) for r in gt_runs) / len(gt_runs)
            total_expected = sum(r['ground_truth'].get('total_expected', 0) for r in gt_runs)
            total_tp = sum(r['ground_truth'].get('total_true_positives', 0) for r in gt_runs)
            total_fp = sum(r['ground_truth'].get('total_false_positives', 0) for r in gt_runs)
            total_fn = sum(r['ground_truth'].get('total_false_negatives', 0) for r in gt_runs)

            print(f"\n Ground truth performance:")
            print(f" Total expected jerseys: {total_expected}")
            print(f" True positives: {total_tp}")
            print(f" False positives: {total_fp}")
            print(f" False negatives: {total_fn}")
            print(f" Average Precision: {avg_precision:.1%}")
            print(f" Average Recall: {avg_recall:.1%}")
            print(f" Average F1 Score: {avg_f1:.1%}")

            # Visual F1 bar
            bar_length = int(avg_f1 * 50)  # Scale to max 50 chars
            bar = '█' * bar_length
            print(f" F1 Score chart: {bar} ({avg_f1:.1%})")

            # Confidence calibration
            conf_correct_vals = [r['ground_truth'].get('avg_confidence_correct') for r in gt_runs if r['ground_truth'].get('avg_confidence_correct') is not None]
            conf_incorrect_vals = [r['ground_truth'].get('avg_confidence_incorrect') for r in gt_runs if r['ground_truth'].get('avg_confidence_incorrect') is not None]

            if conf_correct_vals or conf_incorrect_vals:
                print(f"\n Confidence calibration:")
                if conf_correct_vals:
                    avg_conf_correct = sum(conf_correct_vals) / len(conf_correct_vals)
                    print(f" Avg confidence (correct detections): {avg_conf_correct:.2f}")
                if conf_incorrect_vals:
                    avg_conf_incorrect = sum(conf_incorrect_vals) / len(conf_incorrect_vals)
                    print(f" Avg confidence (incorrect detections): {avg_conf_incorrect:.2f}")
                if conf_correct_vals and conf_incorrect_vals:
                    diff = sum(conf_correct_vals) / len(conf_correct_vals) - sum(conf_incorrect_vals) / len(conf_incorrect_vals)
                    if diff > 0:
                        print(f" Confidence difference: +{diff:.2f} (good calibration)")
                    else:
                        print(f" Confidence difference: {diff:.2f} (⚠ poor calibration)")

        # Confidence distribution if available
        if has_confidence:
            print(f"\n Confidence distribution (across all runs):")
            all_dist = {'90-100': 0, '70-89': 0, '50-69': 0, '30-49': 0, '0-29': 0}
            total_conf_count = 0

            for result in model_results:
                conf_stats = result.get('confidence_stats')
                if conf_stats and 'distribution' in conf_stats:
                    for bucket, count in conf_stats['distribution'].items():
                        all_dist[bucket] += count
                        total_conf_count += count

            if total_conf_count > 0:
                for bucket, count in all_dist.items():
                    pct = (count / total_conf_count * 100) if total_conf_count > 0 else 0
                    bar_length = int(pct / 2)
                    bar = '█' * bar_length
                    print(f" {bucket}: {count:4d} ({pct:5.1f}%) {bar}")

    print()

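
# Note (for reference only; not computed here): the precision/recall/F1 values read
# above come precomputed from the test harness and are assumed to follow the standard
# definitions: precision = TP / (TP + FP), recall = TP / (TP + FN), and
# F1 = 2 * precision * recall / (precision + recall).
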
def print_best_performers(results: List[Dict[str, Any]]):
    """Print summary of best performing models."""
    if not results:
        return

    print("=" * 140)
    print("BEST PERFORMERS")
    print("=" * 140)
    print()

    # Group by model and calculate averages
    models = {}
    for result in results:
        model_name = result.get('model_name', 'unknown')
        if model_name not in models:
            models[model_name] = {
                'runs': 0,
                'total_hallu': 0,
                'total_detections': 0,
                'avg_time': [],
                'empty_capable': []
            }

        models[model_name]['runs'] += 1
        models[model_name]['total_hallu'] += result.get('total_hallucinated', 0)
        models[model_name]['total_detections'] += result.get('total_valid_jerseys', 0) + result.get('total_hallucinated', 0)
        models[model_name]['avg_time'].append(result.get('avg_processing_time', 0))
        models[model_name]['empty_capable'].append(result.get('empty_response_capable', False))

    # Calculate scores
    model_scores = []
    for model_name, stats in models.items():
        hallu_rate = (stats['total_hallu'] / stats['total_detections'] * 100) if stats['total_detections'] > 0 else 0
        avg_time = sum(stats['avg_time']) / len(stats['avg_time']) if stats['avg_time'] else 0
        empty_capable = any(stats['empty_capable'])

        model_scores.append({
            'model': model_name,
            'hallu_rate': hallu_rate,
            'avg_time': avg_time,
            'empty_capable': empty_capable,
            'runs': stats['runs']
        })

    # Sort by hallucination rate (lower is better)
    model_scores.sort(key=lambda x: x['hallu_rate'])

    print("Lowest hallucination rate:")
    for i, score in enumerate(model_scores[:3], 1):
        capable = "✓" if score['empty_capable'] else "✗"
        print(f" {i}. {score['model']}: {score['hallu_rate']:.1f}% (empty capable: {capable}, avg time: {score['avg_time']:.2f}s)")

    print()

    # Sort by speed (lower is better)
    model_scores.sort(key=lambda x: x['avg_time'])

    print("Fastest processing:")
    for i, score in enumerate(model_scores[:3], 1):
        capable = "✓" if score['empty_capable'] else "✗"
        print(f" {i}. {score['model']}: {score['avg_time']:.2f}s/image (hallu rate: {score['hallu_rate']:.1f}%, empty capable: {capable})")

    print()

    # Models with empty response capability
    empty_models = [s for s in model_scores if s['empty_capable']]
    print(f"Models with empty response capability: {len(empty_models)}/{len(model_scores)}")
    for score in empty_models:
        print(f" - {score['model']}")

    print()

    # Best F1 scores (ground truth accuracy)
    models_with_gt = {}
    for result in results:
        if result.get('ground_truth'):
            model_name = result.get('model_name', 'unknown')
            if model_name not in models_with_gt:
                models_with_gt[model_name] = {
                    'f1_scores': [],
                    'precision_scores': [],
                    'recall_scores': []
                }
            gt = result['ground_truth']
            models_with_gt[model_name]['f1_scores'].append(gt.get('overall_f1', 0))
            models_with_gt[model_name]['precision_scores'].append(gt.get('overall_precision', 0))
            models_with_gt[model_name]['recall_scores'].append(gt.get('overall_recall', 0))

    if models_with_gt:
        gt_scores = []
        for model_name, stats in models_with_gt.items():
            avg_f1 = sum(stats['f1_scores']) / len(stats['f1_scores']) if stats['f1_scores'] else 0
            avg_precision = sum(stats['precision_scores']) / len(stats['precision_scores']) if stats['precision_scores'] else 0
            avg_recall = sum(stats['recall_scores']) / len(stats['recall_scores']) if stats['recall_scores'] else 0
            gt_scores.append({
                'model': model_name,
                'avg_f1': avg_f1,
                'avg_precision': avg_precision,
                'avg_recall': avg_recall
            })

        # Sort by F1 score (higher is better)
        gt_scores.sort(key=lambda x: x['avg_f1'], reverse=True)

        print("Highest ground truth F1 scores:")
        for i, score in enumerate(gt_scores[:3], 1):
            print(f" {i}. {score['model']}: F1={score['avg_f1']:.1%} (Precision={score['avg_precision']:.1%}, Recall={score['avg_recall']:.1%})")

        print()

def export_to_csv(results: List[Dict[str, Any]], csv_file: str):
    """Export results to CSV file for spreadsheet import."""
    if not results:
        print("No results to export.")
        return

    try:
        with open(csv_file, 'w', newline='') as f:
            # Define CSV columns
            fieldnames = [
                'timestamp',
                'model_name',
                'model_tag',
                'prompt_file',
                'prompt_length',
                'total_images',
                'images_with_jerseys',
                'images_without_jerseys',
                'images_with_errors',
                'total_raw_detections',
                'total_valid_jerseys',
                'total_hallucinated',
                'hallucination_rate_pct',
                'empty_response_rate_pct',
                'avg_processing_time',
                'total_processing_time',
                'resize_enabled',
                'resize_max',
                'images_resized',
                'has_confidence',
                'confidence_avg',
                'confidence_min',
                'confidence_max',
                'confidence_count',
                'confidence_stdev',
                'confidence_quality',
                'conf_90_100',
                'conf_70_89',
                'conf_50_69',
                'conf_30_49',
                'conf_0_29',
                # Ground truth columns
                'gt_total_expected',
                'gt_total_true_positives',
                'gt_total_false_positives',
                'gt_total_false_negatives',
                'gt_overall_precision',
                'gt_overall_recall',
                'gt_overall_f1',
                'gt_avg_precision',
                'gt_avg_recall',
                'gt_avg_f1',
                # Confidence calibration
                'gt_avg_confidence_correct',
                'gt_avg_confidence_incorrect',
                'gt_confidence_correct_count',
                'gt_confidence_incorrect_count'
            ]

            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()

            # Write data rows
            for result in results:
                # Calculate derived values
                total_images = result.get('total_images', 0)
                valid_jerseys = result.get('total_valid_jerseys', 0)
                hallucinated = result.get('total_hallucinated', 0)
                total_detections = valid_jerseys + hallucinated
                hallu_rate = (hallucinated / total_detections * 100) if total_detections > 0 else 0
                empty_rate = (result.get('images_without_jerseys', 0) / total_images * 100) if total_images > 0 else 0

                # Extract confidence stats
                conf_stats = result.get('confidence_stats')
                has_confidence = conf_stats is not None
                conf_avg = conf_stats.get('avg', '') if conf_stats else ''
                conf_min = conf_stats.get('min', '') if conf_stats else ''
                conf_max = conf_stats.get('max', '') if conf_stats else ''
                conf_count = conf_stats.get('count', '') if conf_stats else ''

                # Calculate confidence standard deviation and quality
                conf_stdev, conf_quality = calculate_confidence_stdev(conf_stats)

                # Extract confidence distribution
                conf_dist = conf_stats.get('distribution', {}) if conf_stats else {}
                conf_90_100 = conf_dist.get('90-100', '')
                conf_70_89 = conf_dist.get('70-89', '')
                conf_50_69 = conf_dist.get('50-69', '')
                conf_30_49 = conf_dist.get('30-49', '')
                conf_0_29 = conf_dist.get('0-29', '')

                # Extract ground truth stats
                gt = result.get('ground_truth', {})
                gt_total_expected = gt.get('total_expected', '')
                gt_total_tp = gt.get('total_true_positives', '')
                gt_total_fp = gt.get('total_false_positives', '')
                gt_total_fn = gt.get('total_false_negatives', '')
                gt_overall_precision = gt.get('overall_precision', '')
                gt_overall_recall = gt.get('overall_recall', '')
                gt_overall_f1 = gt.get('overall_f1', '')
                gt_avg_precision = gt.get('avg_precision', '')
                gt_avg_recall = gt.get('avg_recall', '')
                gt_avg_f1 = gt.get('avg_f1', '')
                gt_avg_conf_correct = gt.get('avg_confidence_correct', '')
                gt_avg_conf_incorrect = gt.get('avg_confidence_incorrect', '')
                gt_conf_correct_count = gt.get('confidence_correct_count', '')
                gt_conf_incorrect_count = gt.get('confidence_incorrect_count', '')

                row = {
                    'timestamp': result.get('timestamp', ''),
                    'model_name': result.get('model_name', ''),
                    'model_tag': result.get('model_tag', ''),
                    'prompt_file': result.get('prompt_file', ''),
                    'prompt_length': result.get('prompt_length', ''),
                    'total_images': total_images,
                    'images_with_jerseys': result.get('images_with_jerseys', ''),
                    'images_without_jerseys': result.get('images_without_jerseys', ''),
                    'images_with_errors': result.get('images_with_errors', ''),
                    'total_raw_detections': result.get('total_raw_detections', ''),
                    'total_valid_jerseys': valid_jerseys,
                    'total_hallucinated': hallucinated,
                    'hallucination_rate_pct': f"{hallu_rate:.2f}",
                    'empty_response_rate_pct': f"{empty_rate:.2f}",
                    'avg_processing_time': f"{result.get('avg_processing_time', 0):.4f}",
                    'total_processing_time': f"{result.get('total_processing_time', 0):.2f}",
                    'resize_enabled': result.get('resize_enabled', False),
                    'resize_max': result.get('resize_max', ''),
                    'images_resized': result.get('images_resized', ''),
                    'has_confidence': has_confidence,
                    'confidence_avg': f"{conf_avg:.2f}" if conf_avg != '' else '',
                    'confidence_min': conf_min,
                    'confidence_max': conf_max,
                    'confidence_count': conf_count,
                    'confidence_stdev': f"{conf_stdev:.2f}" if conf_stdev is not None else '',
                    'confidence_quality': conf_quality if conf_quality != 'N/A' else '',
                    'conf_90_100': conf_90_100,
                    'conf_70_89': conf_70_89,
                    'conf_50_69': conf_50_69,
                    'conf_30_49': conf_30_49,
                    'conf_0_29': conf_0_29,
                    # Ground truth data
                    'gt_total_expected': gt_total_expected,
                    'gt_total_true_positives': gt_total_tp,
                    'gt_total_false_positives': gt_total_fp,
                    'gt_total_false_negatives': gt_total_fn,
                    'gt_overall_precision': f"{gt_overall_precision:.4f}" if gt_overall_precision != '' else '',
                    'gt_overall_recall': f"{gt_overall_recall:.4f}" if gt_overall_recall != '' else '',
                    'gt_overall_f1': f"{gt_overall_f1:.4f}" if gt_overall_f1 != '' else '',
                    'gt_avg_precision': f"{gt_avg_precision:.4f}" if gt_avg_precision != '' else '',
                    'gt_avg_recall': f"{gt_avg_recall:.4f}" if gt_avg_recall != '' else '',
                    'gt_avg_f1': f"{gt_avg_f1:.4f}" if gt_avg_f1 != '' else '',
                    'gt_avg_confidence_correct': f"{gt_avg_conf_correct:.2f}" if gt_avg_conf_correct != '' else '',
                    'gt_avg_confidence_incorrect': f"{gt_avg_conf_incorrect:.2f}" if gt_avg_conf_incorrect != '' else '',
                    'gt_confidence_correct_count': gt_conf_correct_count,
                    'gt_confidence_incorrect_count': gt_conf_incorrect_count
                }

                writer.writerow(row)

        print(f"✓ Results exported to CSV: {csv_file}")
        print(f" Rows: {len(results)}")
        print(f" Columns: {len(fieldnames)}")

    except Exception as e:
        print(f"❌ Failed to export to CSV: {e}")
        sys.exit(1)

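
# Example (illustrative): the helpers above can also be used from another script,
# assuming this module is importable and a results file exists at the default path.
# The output filename is hypothetical:
#
#   from analyze_jersey_results import load_results, export_to_csv
#   runs = load_results("jersey_detection_results.jsonl")
#   export_to_csv(runs, "jersey_detection_results.csv")
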
def main():
    """Main entry point for the analysis script."""
    parser = argparse.ArgumentParser(
        description='Analyze jersey detection test results',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Show analysis
  python analyze_jersey_results.py

  # Show analysis and export to CSV
  python analyze_jersey_results.py --csv results.csv

  # Export to CSV only (no analysis display)
  python analyze_jersey_results.py --csv-only results.csv

  # Analyze custom results file
  python analyze_jersey_results.py custom_results.jsonl --csv custom.csv
"""
    )
    parser.add_argument('results_file', nargs='?', default='jersey_detection_results.jsonl',
                        help='Path to results file (default: jersey_detection_results.jsonl)')
    parser.add_argument('--csv', metavar='FILE', dest='csv_file',
                        help='Export results to CSV file (in addition to showing analysis)')
    parser.add_argument('--csv-only', metavar='FILE', dest='csv_only',
                        help='Export to CSV file only, skip analysis display')

    args = parser.parse_args()

    # Check if file exists
    if not Path(args.results_file).exists():
        print(f"Error: Results file not found: {args.results_file}")
        print("Run some tests first with test_jersey_detection.py to generate results.")
        sys.exit(1)

    # Load results
    results = load_results(args.results_file)

    if not results:
        print(f"No results found in {args.results_file}")
        sys.exit(0)

    print(f"Loaded {len(results)} test run(s) from {args.results_file}\n")

    # Handle CSV-only mode
    if args.csv_only:
        export_to_csv(results, args.csv_only)
        return

    # Print analyses (unless CSV-only mode)
    print_ascii_comparison_table(results)
    print_model_performance_chart(results)
    print_best_performers(results)

    # Export to CSV if requested
    if args.csv_file:
        print()
        export_to_csv(results, args.csv_file)


if __name__ == '__main__':
    main()