jersey_test/analyze_jersey_results.py
Test scripts and utilities for evaluating vision-language models on jersey number detection using llama.cpp server.
#!/usr/bin/env python3
"""
Analyze jersey detection test results and compare model performance.
Usage:
python analyze_jersey_results.py [results_file]
python analyze_jersey_results.py [results_file] --csv output.csv
python analyze_jersey_results.py [results_file] --csv-only output.csv
Arguments:
results_file: Path to the results file (default: jersey_detection_results.jsonl)
--csv: Also export results to CSV file
--csv-only: Export to CSV only, skip analysis display
"""
import argparse
import csv
import json
import math
import sys
from pathlib import Path
from typing import List, Dict, Any
from datetime import datetime
def load_results(results_file: str) -> List[Dict[str, Any]]:
"""Load test results from a JSON Lines file."""
results = []
try:
with open(results_file, 'r') as f:
for line in f:
line = line.strip()
if line:
results.append(json.loads(line))
return results
except FileNotFoundError:
print(f"Error: Results file not found: {results_file}")
sys.exit(1)
except json.JSONDecodeError as e:
print(f"Error: Invalid JSON in results file: {e}")
sys.exit(1)
def calculate_confidence_stdev(conf_stats: Dict[str, Any]) -> tuple:
"""
Calculate the standard deviation of confidence scores from a bucketed distribution.
Returns:
Tuple of (stdev, quality_rating)
quality_rating: "Excel" (Excellent), "Good", "Fair", "Poor", or "N/A"
"""
if not conf_stats or 'distribution' not in conf_stats:
return None, "N/A"
dist = conf_stats['distribution']
# Reconstruct approximate confidence values from buckets
# Use midpoint of each bucket
values = []
bucket_midpoints = {
'90-100': 95,
'70-89': 79.5,
'50-69': 59.5,
'30-49': 39.5,
'0-29': 14.5
}
for bucket, count in dist.items():
midpoint = bucket_midpoints.get(bucket, 50)
values.extend([midpoint] * count)
if len(values) < 2:
return None, "N/A"
# Calculate population standard deviation (math imported at module level)
mean = sum(values) / len(values)
variance = sum((x - mean) ** 2 for x in values) / len(values)
stdev = math.sqrt(variance)
# Assign quality rating based on StDev
if stdev < 5:
quality = "Poor"
elif stdev < 10:
quality = "Fair"
elif stdev < 15:
quality = "Good"
else:
quality = "Excel" # Shortened for table
return stdev, quality
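# Worked example (illustrative, not from a real run): a distribution spread across
# buckets yields a larger stdev and a better rating than one collapsed into a single
# bucket, since values are reconstructed from the bucket midpoints above.
#
#   calculate_confidence_stdev({'distribution': {'90-100': 10, '50-69': 10}})
#   # -> (17.75, 'Excel')   midpoints 95 and 59.5, mean 77.25, stdev 17.75
#   calculate_confidence_stdev({'distribution': {'90-100': 20}})
#   # -> (0.0, 'Poor')      every reconstructed value is 95, so stdev is 0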
def print_ascii_comparison_table(results: List[Dict[str, Any]]):
"""Print a detailed ASCII comparison table of all test runs."""
if not results:
print("No results to display.")
return
print("=" * 280)
print("DETAILED MODEL COMPARISON TABLE")
print("=" * 280)
print()
print("Confidence Quality: Excellent (>15), Good (10-15), Fair (5-10), Poor (<5)")
print("Confidence Calibration: Conf✓ = avg confidence on correct detections, Conf✗ = avg confidence on incorrect detections")
print()
# Table headers with ground truth and confidence calibration columns
print("" + "" * 22 + "" + "" * 10 + "" + "" * 8 + "" + "" * 8 + "" + "" * 8 + "" + "" * 10 + "" + "" * 10 + "" + "" * 10 + "" + "" * 8 + "" + "" * 8 + "" + "" * 12 + "" + "" * 10 + "" + "" * 10 + "" + "" * 10 + "" + "" * 10 + "" + "" * 10 + "" + "" * 21 + "")
print("{:<20}{:^8}{:^6}{:^6}{:^6}{:^8}{:^8}{:^8}{:^6}{:^6}{:^10}{:^8}{:^8}{:^8}{:^8}{:^8}{:^19}".format(
"Model", "Prompt", "Images", "Valid", "Hallu", "Empty%", "Hallu%", "AvgTime", "Resize", "Conf?", "Conf Qual", "Prec%", "Recall%", "F1%", "Conf✓", "Conf✗", "Date"
))
print("" + "" * 22 + "" + "" * 10 + "" + "" * 8 + "" + "" * 8 + "" + "" * 8 + "" + "" * 10 + "" + "" * 10 + "" + "" * 10 + "" + "" * 8 + "" + "" * 8 + "" + "" * 12 + "" + "" * 10 + "" + "" * 10 + "" + "" * 10 + "" + "" * 10 + "" + "" * 10 + "" + "" * 21 + "")
# Data rows
for i, result in enumerate(results):
model = result.get('model_name', 'unknown')[:20]
prompt = Path(result.get('prompt_file', 'unknown')).stem[:8]
total_images = result.get('total_images', 0)
valid_jerseys = result.get('total_valid_jerseys', 0)
hallucinated = result.get('total_hallucinated', 0)
total_detections = valid_jerseys + hallucinated
empty_pct = (result.get('images_without_jerseys', 0) / total_images * 100) if total_images > 0 else 0
hallu_pct = (hallucinated / total_detections * 100) if total_detections > 0 else 0
avg_time = result.get('avg_processing_time', 0)
# Calculate confidence quality
conf_stats = result.get('confidence_stats')
has_conf = 'Yes' if conf_stats else 'No'
stdev, quality = calculate_confidence_stdev(conf_stats)
# Format confidence quality display
if stdev is not None:
conf_qual_str = f"{quality} ({stdev:.1f})"
else:
conf_qual_str = "N/A"
# Ground truth metrics
gt = result.get('ground_truth', {})
precision = gt.get('overall_precision', 0) * 100
recall = gt.get('overall_recall', 0) * 100
f1 = gt.get('overall_f1', 0) * 100
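# Precision/recall/F1 arrive pre-computed in the results file; presumably the test
# script derives them in the standard way (precision = TP / (TP + FP),
# recall = TP / (TP + FN), F1 = 2 * P * R / (P + R)) - this script only reports them.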
# Confidence calibration
conf_correct = gt.get('avg_confidence_correct')
conf_incorrect = gt.get('avg_confidence_incorrect')
conf_correct_str = f"{conf_correct:.1f}" if conf_correct is not None else "N/A"
conf_incorrect_str = f"{conf_incorrect:.1f}" if conf_incorrect is not None else "N/A"
resize_max = result.get('resize_max')
resize_str = f"{resize_max}px" if resize_max else "No"
timestamp = datetime.fromisoformat(result.get('timestamp', '')).strftime('%Y-%m-%d %H:%M')
print("{:<20}{:>8}{:>6}{:>6}{:>6}{:>7.1f}%{:>7.1f}%{:>7.2f}s │ {:>6}{:>6}{:>10}{:>7.1f}%{:>7.1f}%{:>7.1f}%{:>8}{:>8}{:>19}".format(
model, prompt, total_images, valid_jerseys, hallucinated, empty_pct, hallu_pct, avg_time, resize_str, has_conf, conf_qual_str, precision, recall, f1, conf_correct_str, conf_incorrect_str, timestamp
))
# Bottom border
print("" + "" * 22 + "" + "" * 10 + "" + "" * 8 + "" + "" * 8 + "" + "" * 8 + "" + "" * 10 + "" + "" * 10 + "" + "" * 10 + "" + "" * 8 + "" + "" * 8 + "" + "" * 12 + "" + "" * 10 + "" + "" * 10 + "" + "" * 10 + "" + "" * 10 + "" + "" * 10 + "" + "" * 21 + "")
print()
def print_comparison_table(results: List[Dict[str, Any]]):
"""Print a simple comparison table of all test runs."""
if not results:
print("No results to display.")
return
print("=" * 140)
print("MODEL COMPARISON TABLE")
print("=" * 140)
print()
# Header
header = f"{'Model':<25} {'Prompt':<30} {'Images':<8} {'Valid':<8} {'Hallu':<8} {'Empty%':<9} {'AvgTime':<9} {'Resize':<8} {'Conf?':<7} {'Date':<20}"
print(header)
print("-" * 150)
# Data rows
for result in results:
model = result.get('model_name', 'unknown')[:24]
prompt = Path(result.get('prompt_file', 'unknown')).stem[:29]
total_images = result.get('total_images', 0)
valid_jerseys = result.get('total_valid_jerseys', 0)
hallucinated = result.get('total_hallucinated', 0)
empty_pct = (result.get('images_without_jerseys', 0) / total_images * 100) if total_images > 0 else 0
avg_time = result.get('avg_processing_time', 0)
has_conf = 'Yes' if result.get('confidence_stats') else 'No'
resize_max = result.get('resize_max')
resize_str = f"{resize_max}px" if resize_max else "No"
timestamp = datetime.fromisoformat(result.get('timestamp', '')).strftime('%Y-%m-%d %H:%M:%S')
row = f"{model:<25} {prompt:<30} {total_images:<8} {valid_jerseys:<8} {hallucinated:<8} {empty_pct:<8.1f}% {avg_time:<8.2f}s {resize_str:<8} {has_conf:<7} {timestamp:<20}"
print(row)
print()
def print_model_performance_chart(results: List[Dict[str, Any]]):
"""Print a performance chart showing key metrics for each model."""
if not results:
return
print("=" * 140)
print("MODEL PERFORMANCE CHART")
print("=" * 140)
print()
# Group results by model
models = {}
for result in results:
model_name = result.get('model_name', 'unknown')
if model_name not in models:
models[model_name] = []
models[model_name].append(result)
# Calculate aggregate statistics for each model
for model_name, model_results in models.items():
print(f"\n{model_name}")
print("-" * 100)
total_runs = len(model_results)
total_images = sum(r.get('total_images', 0) for r in model_results)
total_valid = sum(r.get('total_valid_jerseys', 0) for r in model_results)
total_hallu = sum(r.get('total_hallucinated', 0) for r in model_results)
avg_empty_pct = sum((r.get('images_without_jerseys', 0) / r.get('total_images', 1) * 100) for r in model_results) / total_runs if total_runs > 0 else 0
avg_time = sum(r.get('avg_processing_time', 0) for r in model_results) / total_runs if total_runs > 0 else 0
# Check if any runs have confidence stats
has_confidence = any(r.get('confidence_stats') for r in model_results)
# Check resize status
resize_enabled = any(r.get('resize_enabled', False) for r in model_results)
resize_max_values = [r.get('resize_max') for r in model_results if r.get('resize_max')]
resize_info = f"{resize_max_values[0]}px" if resize_max_values else "Disabled"
print(f" Total test runs: {total_runs}")
print(f" Total images processed: {total_images}")
print(f" Total valid detections: {total_valid}")
print(f" Total hallucinations: {total_hallu}")
print(f" Average empty response rate: {avg_empty_pct:.1f}%")
print(f" Average processing time: {avg_time:.2f}s/image")
print(f" Resize: {resize_info}")
print(f" Confidence support: {'Yes' if has_confidence else 'No'}")
# Show hallucination rate
if total_valid + total_hallu > 0:
hallu_rate = (total_hallu / (total_valid + total_hallu) * 100)
print(f" Hallucination rate: {hallu_rate:.1f}%")
# Visual bar
bar_length = int(hallu_rate / 2) # Scale to max 50 chars
bar = '█' * bar_length
print(f" Hallucination chart: {bar} ({hallu_rate:.1f}%)")
# Ground truth performance
gt_runs = [r for r in model_results if r.get('ground_truth')]
if gt_runs:
avg_precision = sum(r['ground_truth'].get('overall_precision', 0) for r in gt_runs) / len(gt_runs)
avg_recall = sum(r['ground_truth'].get('overall_recall', 0) for r in gt_runs) / len(gt_runs)
avg_f1 = sum(r['ground_truth'].get('overall_f1', 0) for r in gt_runs) / len(gt_runs)
total_expected = sum(r['ground_truth'].get('total_expected', 0) for r in gt_runs)
total_tp = sum(r['ground_truth'].get('total_true_positives', 0) for r in gt_runs)
total_fp = sum(r['ground_truth'].get('total_false_positives', 0) for r in gt_runs)
total_fn = sum(r['ground_truth'].get('total_false_negatives', 0) for r in gt_runs)
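# Note: avg_precision/avg_recall/avg_f1 above are macro averages of each run's
# pre-computed overall_* values. Pooled (micro) metrics could instead be derived
# from the summed counts, e.g. total_tp / (total_tp + total_fp) for precision.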
print(f"\n Ground truth performance:")
print(f" Total expected jerseys: {total_expected}")
print(f" True positives: {total_tp}")
print(f" False positives: {total_fp}")
print(f" False negatives: {total_fn}")
print(f" Average Precision: {avg_precision:.1%}")
print(f" Average Recall: {avg_recall:.1%}")
print(f" Average F1 Score: {avg_f1:.1%}")
# Visual F1 bar
bar_length = int(avg_f1 * 50) # Scale to max 50 chars
bar = '█' * bar_length
print(f" F1 Score chart: {bar} ({avg_f1:.1%})")
# Confidence calibration
conf_correct_vals = [r['ground_truth'].get('avg_confidence_correct') for r in gt_runs if r['ground_truth'].get('avg_confidence_correct') is not None]
conf_incorrect_vals = [r['ground_truth'].get('avg_confidence_incorrect') for r in gt_runs if r['ground_truth'].get('avg_confidence_incorrect') is not None]
if conf_correct_vals or conf_incorrect_vals:
print(f"\n Confidence calibration:")
if conf_correct_vals:
avg_conf_correct = sum(conf_correct_vals) / len(conf_correct_vals)
print(f" Avg confidence (correct detections): {avg_conf_correct:.2f}")
if conf_incorrect_vals:
avg_conf_incorrect = sum(conf_incorrect_vals) / len(conf_incorrect_vals)
print(f" Avg confidence (incorrect detections): {avg_conf_incorrect:.2f}")
if conf_correct_vals and conf_incorrect_vals:
diff = sum(conf_correct_vals) / len(conf_correct_vals) - sum(conf_incorrect_vals) / len(conf_incorrect_vals)
if diff > 0:
print(f" Confidence difference: +{diff:.2f} (good calibration)")
else:
print(f" Confidence difference: {diff:.2f} (⚠ poor calibration)")
# Confidence distribution if available
if has_confidence:
print(f"\n Confidence distribution (across all runs):")
all_dist = {'90-100': 0, '70-89': 0, '50-69': 0, '30-49': 0, '0-29': 0}
total_conf_count = 0
for result in model_results:
conf_stats = result.get('confidence_stats')
if conf_stats and 'distribution' in conf_stats:
for bucket, count in conf_stats['distribution'].items():
all_dist[bucket] += count
total_conf_count += count
if total_conf_count > 0:
for bucket, count in all_dist.items():
pct = (count / total_conf_count * 100) if total_conf_count > 0 else 0
bar_length = int(pct / 2)
bar = '█' * bar_length
print(f" {bucket}: {count:4d} ({pct:5.1f}%) {bar}")
print()
def print_best_performers(results: List[Dict[str, Any]]):
"""Print summary of best performing models."""
if not results:
return
print("=" * 140)
print("BEST PERFORMERS")
print("=" * 140)
print()
# Group by model and calculate averages
models = {}
for result in results:
model_name = result.get('model_name', 'unknown')
if model_name not in models:
models[model_name] = {
'runs': 0,
'total_hallu': 0,
'total_detections': 0,
'avg_time': [],
'empty_capable': []
}
models[model_name]['runs'] += 1
models[model_name]['total_hallu'] += result.get('total_hallucinated', 0)
models[model_name]['total_detections'] += result.get('total_valid_jerseys', 0) + result.get('total_hallucinated', 0)
models[model_name]['avg_time'].append(result.get('avg_processing_time', 0))
models[model_name]['empty_capable'].append(result.get('empty_response_capable', False))
# Calculate scores
model_scores = []
for model_name, stats in models.items():
hallu_rate = (stats['total_hallu'] / stats['total_detections'] * 100) if stats['total_detections'] > 0 else 0
avg_time = sum(stats['avg_time']) / len(stats['avg_time']) if stats['avg_time'] else 0
empty_capable = any(stats['empty_capable'])
model_scores.append({
'model': model_name,
'hallu_rate': hallu_rate,
'avg_time': avg_time,
'empty_capable': empty_capable,
'runs': stats['runs']
})
# Sort by hallucination rate (lower is better)
model_scores.sort(key=lambda x: x['hallu_rate'])
print("Lowest hallucination rate:")
for i, score in enumerate(model_scores[:3], 1):
capable = "" if score['empty_capable'] else ""
print(f" {i}. {score['model']}: {score['hallu_rate']:.1f}% (empty capable: {capable}, avg time: {score['avg_time']:.2f}s)")
print()
# Sort by speed (lower is better)
model_scores.sort(key=lambda x: x['avg_time'])
print("Fastest processing:")
for i, score in enumerate(model_scores[:3], 1):
capable = "" if score['empty_capable'] else ""
print(f" {i}. {score['model']}: {score['avg_time']:.2f}s/image (hallu rate: {score['hallu_rate']:.1f}%, empty capable: {capable})")
print()
# Models with empty response capability
empty_models = [s for s in model_scores if s['empty_capable']]
print(f"Models with empty response capability: {len(empty_models)}/{len(model_scores)}")
for score in empty_models:
print(f" - {score['model']}")
print()
# Best F1 scores (ground truth accuracy)
models_with_gt = {}
for result in results:
if result.get('ground_truth'):
model_name = result.get('model_name', 'unknown')
if model_name not in models_with_gt:
models_with_gt[model_name] = {
'f1_scores': [],
'precision_scores': [],
'recall_scores': []
}
gt = result['ground_truth']
models_with_gt[model_name]['f1_scores'].append(gt.get('overall_f1', 0))
models_with_gt[model_name]['precision_scores'].append(gt.get('overall_precision', 0))
models_with_gt[model_name]['recall_scores'].append(gt.get('overall_recall', 0))
if models_with_gt:
gt_scores = []
for model_name, stats in models_with_gt.items():
avg_f1 = sum(stats['f1_scores']) / len(stats['f1_scores']) if stats['f1_scores'] else 0
avg_precision = sum(stats['precision_scores']) / len(stats['precision_scores']) if stats['precision_scores'] else 0
avg_recall = sum(stats['recall_scores']) / len(stats['recall_scores']) if stats['recall_scores'] else 0
gt_scores.append({
'model': model_name,
'avg_f1': avg_f1,
'avg_precision': avg_precision,
'avg_recall': avg_recall
})
# Sort by F1 score (higher is better)
gt_scores.sort(key=lambda x: x['avg_f1'], reverse=True)
print("Highest ground truth F1 scores:")
for i, score in enumerate(gt_scores[:3], 1):
print(f" {i}. {score['model']}: F1={score['avg_f1']:.1%} (Precision={score['avg_precision']:.1%}, Recall={score['avg_recall']:.1%})")
print()
def export_to_csv(results: List[Dict[str, Any]], csv_file: str):
"""Export results to CSV file for spreadsheet import."""
if not results:
print("No results to export.")
return
try:
with open(csv_file, 'w', newline='') as f:
# Define CSV columns
fieldnames = [
'timestamp',
'model_name',
'model_tag',
'prompt_file',
'prompt_length',
'total_images',
'images_with_jerseys',
'images_without_jerseys',
'images_with_errors',
'total_raw_detections',
'total_valid_jerseys',
'total_hallucinated',
'hallucination_rate_pct',
'empty_response_rate_pct',
'avg_processing_time',
'total_processing_time',
'resize_enabled',
'resize_max',
'images_resized',
'has_confidence',
'confidence_avg',
'confidence_min',
'confidence_max',
'confidence_count',
'confidence_stdev',
'confidence_quality',
'conf_90_100',
'conf_70_89',
'conf_50_69',
'conf_30_49',
'conf_0_29',
# Ground truth columns
'gt_total_expected',
'gt_total_true_positives',
'gt_total_false_positives',
'gt_total_false_negatives',
'gt_overall_precision',
'gt_overall_recall',
'gt_overall_f1',
'gt_avg_precision',
'gt_avg_recall',
'gt_avg_f1',
# Confidence calibration
'gt_avg_confidence_correct',
'gt_avg_confidence_incorrect',
'gt_confidence_correct_count',
'gt_confidence_incorrect_count'
]
writer = csv.DictWriter(f, fieldnames=fieldnames)
writer.writeheader()
# Write data rows
for result in results:
# Calculate derived values
total_images = result.get('total_images', 0)
valid_jerseys = result.get('total_valid_jerseys', 0)
hallucinated = result.get('total_hallucinated', 0)
total_detections = valid_jerseys + hallucinated
hallu_rate = (hallucinated / total_detections * 100) if total_detections > 0 else 0
empty_rate = (result.get('images_without_jerseys', 0) / total_images * 100) if total_images > 0 else 0
# Extract confidence stats
conf_stats = result.get('confidence_stats')
has_confidence = conf_stats is not None
conf_avg = conf_stats.get('avg', '') if conf_stats else ''
conf_min = conf_stats.get('min', '') if conf_stats else ''
conf_max = conf_stats.get('max', '') if conf_stats else ''
conf_count = conf_stats.get('count', '') if conf_stats else ''
# Calculate confidence standard deviation and quality
conf_stdev, conf_quality = calculate_confidence_stdev(conf_stats)
# Extract confidence distribution
conf_dist = conf_stats.get('distribution', {}) if conf_stats else {}
conf_90_100 = conf_dist.get('90-100', '')
conf_70_89 = conf_dist.get('70-89', '')
conf_50_69 = conf_dist.get('50-69', '')
conf_30_49 = conf_dist.get('30-49', '')
conf_0_29 = conf_dist.get('0-29', '')
# Extract ground truth stats
gt = result.get('ground_truth', {})
gt_total_expected = gt.get('total_expected', '')
gt_total_tp = gt.get('total_true_positives', '')
gt_total_fp = gt.get('total_false_positives', '')
gt_total_fn = gt.get('total_false_negatives', '')
gt_overall_precision = gt.get('overall_precision', '')
gt_overall_recall = gt.get('overall_recall', '')
gt_overall_f1 = gt.get('overall_f1', '')
gt_avg_precision = gt.get('avg_precision', '')
gt_avg_recall = gt.get('avg_recall', '')
gt_avg_f1 = gt.get('avg_f1', '')
gt_avg_conf_correct = gt.get('avg_confidence_correct', '')
gt_avg_conf_incorrect = gt.get('avg_confidence_incorrect', '')
gt_conf_correct_count = gt.get('confidence_correct_count', '')
gt_conf_incorrect_count = gt.get('confidence_incorrect_count', '')
row = {
'timestamp': result.get('timestamp', ''),
'model_name': result.get('model_name', ''),
'model_tag': result.get('model_tag', ''),
'prompt_file': result.get('prompt_file', ''),
'prompt_length': result.get('prompt_length', ''),
'total_images': total_images,
'images_with_jerseys': result.get('images_with_jerseys', ''),
'images_without_jerseys': result.get('images_without_jerseys', ''),
'images_with_errors': result.get('images_with_errors', ''),
'total_raw_detections': result.get('total_raw_detections', ''),
'total_valid_jerseys': valid_jerseys,
'total_hallucinated': hallucinated,
'hallucination_rate_pct': f"{hallu_rate:.2f}",
'empty_response_rate_pct': f"{empty_rate:.2f}",
'avg_processing_time': f"{result.get('avg_processing_time', 0):.4f}",
'total_processing_time': f"{result.get('total_processing_time', 0):.2f}",
'resize_enabled': result.get('resize_enabled', False),
'resize_max': result.get('resize_max', ''),
'images_resized': result.get('images_resized', ''),
'has_confidence': has_confidence,
'confidence_avg': f"{conf_avg:.2f}" if conf_avg != '' else '',
'confidence_min': conf_min,
'confidence_max': conf_max,
'confidence_count': conf_count,
'confidence_stdev': f"{conf_stdev:.2f}" if conf_stdev is not None else '',
'confidence_quality': conf_quality if conf_quality != 'N/A' else '',
'conf_90_100': conf_90_100,
'conf_70_89': conf_70_89,
'conf_50_69': conf_50_69,
'conf_30_49': conf_30_49,
'conf_0_29': conf_0_29,
# Ground truth data
'gt_total_expected': gt_total_expected,
'gt_total_true_positives': gt_total_tp,
'gt_total_false_positives': gt_total_fp,
'gt_total_false_negatives': gt_total_fn,
'gt_overall_precision': f"{gt_overall_precision:.4f}" if gt_overall_precision != '' else '',
'gt_overall_recall': f"{gt_overall_recall:.4f}" if gt_overall_recall != '' else '',
'gt_overall_f1': f"{gt_overall_f1:.4f}" if gt_overall_f1 != '' else '',
'gt_avg_precision': f"{gt_avg_precision:.4f}" if gt_avg_precision != '' else '',
'gt_avg_recall': f"{gt_avg_recall:.4f}" if gt_avg_recall != '' else '',
'gt_avg_f1': f"{gt_avg_f1:.4f}" if gt_avg_f1 != '' else '',
'gt_avg_confidence_correct': f"{gt_avg_conf_correct:.2f}" if gt_avg_conf_correct not in ('', None) else '',
'gt_avg_confidence_incorrect': f"{gt_avg_conf_incorrect:.2f}" if gt_avg_conf_incorrect not in ('', None) else '',
'gt_confidence_correct_count': gt_conf_correct_count,
'gt_confidence_incorrect_count': gt_conf_incorrect_count
}
writer.writerow(row)
print(f"✓ Results exported to CSV: {csv_file}")
print(f" Rows: {len(results)}")
print(f" Columns: {len(fieldnames)}")
except Exception as e:
print(f"❌ Failed to export to CSV: {e}")
sys.exit(1)
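# Quick sanity check of an exported CSV (illustrative; adjust the filename to
# whatever was passed to --csv / --csv-only):
#
#   import csv
#   with open('results.csv', newline='') as f:
#       rows = list(csv.DictReader(f))
#   print(len(rows), rows[0]['model_name'], rows[0]['hallucination_rate_pct'])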
def main():
"""Main entry point for the analysis script."""
parser = argparse.ArgumentParser(
description='Analyze jersey detection test results',
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
# Show analysis
python analyze_jersey_results.py
# Show analysis and export to CSV
python analyze_jersey_results.py --csv results.csv
# Export to CSV only (no analysis display)
python analyze_jersey_results.py --csv-only results.csv
# Analyze custom results file
python analyze_jersey_results.py custom_results.jsonl --csv custom.csv
"""
)
parser.add_argument('results_file', nargs='?', default='jersey_detection_results.jsonl',
help='Path to results file (default: jersey_detection_results.jsonl)')
parser.add_argument('--csv', metavar='FILE', dest='csv_file',
help='Export results to CSV file (in addition to showing analysis)')
parser.add_argument('--csv-only', metavar='FILE', dest='csv_only',
help='Export to CSV file only, skip analysis display')
args = parser.parse_args()
# Check if file exists
if not Path(args.results_file).exists():
print(f"Error: Results file not found: {args.results_file}")
print(f"Run some tests first with test_jersey_detection.py to generate results.")
sys.exit(1)
# Load results
results = load_results(args.results_file)
if not results:
print(f"No results found in {args.results_file}")
sys.exit(0)
print(f"Loaded {len(results)} test run(s) from {args.results_file}\n")
# Handle CSV-only mode
if args.csv_only:
export_to_csv(results, args.csv_only)
return
# Print analyses (unless CSV-only mode)
print_ascii_comparison_table(results)
print_model_performance_chart(results)
print_best_performers(results)
# Export to CSV if requested
if args.csv_file:
print()
export_to_csv(results, args.csv_file)
if __name__ == '__main__':
main()