jersey_test/analyze_jersey_results.py
Test scripts and utilities for evaluating vision-language models on jersey number detection using llama.cpp server.
#!/usr/bin/env python3
"""
Analyze jersey detection test results and compare model performance.
Usage:
python analyze_jersey_results.py [results_file]
python analyze_jersey_results.py [results_file] --csv output.csv
python analyze_jersey_results.py [results_file] --csv-only output.csv
Arguments:
results_file: Path to the results file (default: jersey_detection_results.jsonl)
--csv: Also export results to CSV file
--csv-only: Export to CSV only, skip analysis display
"""
import argparse
import csv
import json
import math
import sys
from pathlib import Path
from typing import List, Dict, Any
from datetime import datetime
def load_results(results_file: str) -> List[Dict[str, Any]]:
"""Load test results from a JSON Lines file."""
results = []
try:
with open(results_file, 'r') as f:
for line in f:
line = line.strip()
if line:
results.append(json.loads(line))
return results
except FileNotFoundError:
print(f"Error: Results file not found: {results_file}")
sys.exit(1)
except json.JSONDecodeError as e:
print(f"Error: Invalid JSON in results file: {e}")
sys.exit(1)
def calculate_confidence_stdev(conf_stats: Dict[str, Any]) -> tuple:
"""
Calculate the standard deviation of confidence scores from a bucketed distribution.
Returns:
Tuple of (stdev, quality_rating)
quality_rating: "Excel" (Excellent), "Good", "Fair", "Poor", or "N/A"
"""
if not conf_stats or 'distribution' not in conf_stats:
return None, "N/A"
dist = conf_stats['distribution']
# Reconstruct approximate confidence values from buckets
# Use midpoint of each bucket
values = []
bucket_midpoints = {
'90-100': 95,
'70-89': 79.5,
'50-69': 59.5,
'30-49': 39.5,
'0-29': 14.5
}
for bucket, count in dist.items():
midpoint = bucket_midpoints.get(bucket, 50)
values.extend([midpoint] * count)
if len(values) < 2:
return None, "N/A"
# Calculate population standard deviation (math imported at module level)
mean = sum(values) / len(values)
variance = sum((x - mean) ** 2 for x in values) / len(values)
stdev = math.sqrt(variance)
# Assign quality rating based on StDev
if stdev < 5:
quality = "Poor"
elif stdev < 10:
quality = "Fair"
elif stdev < 15:
quality = "Good"
else:
quality = "Excel" # Shortened for table
return stdev, quality
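# Worked example (illustrative, not from a real run): a distribution spread across
# buckets yields a larger stdev and a better rating than one collapsed into a single
# bucket, since values are reconstructed from the bucket midpoints above.
#
#   calculate_confidence_stdev({'distribution': {'90-100': 10, '50-69': 10}})
#   # -> (17.75, 'Excel')   midpoints 95 and 59.5, mean 77.25, stdev 17.75
#   calculate_confidence_stdev({'distribution': {'90-100': 20}})
#   # -> (0.0, 'Poor')      every reconstructed value is 95, so stdev is 0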
def print_ascii_comparison_table(results: List[Dict[str, Any]]):
"""Print a detailed ASCII comparison table of all test runs."""
if not results:
print("No results to display.")
return
print("=" * 280)
print("DETAILED MODEL COMPARISON TABLE")
print("=" * 280)
print()
print("Confidence Quality: Excellent (>15), Good (10-15), Fair (5-10), Poor (<5)")
print("Confidence Calibration: Conf✓ = avg confidence on correct detections, Conf✗ = avg confidence on incorrect detections")
print()
# Table headers with ground truth and confidence calibration columns
print("" + "" * 22 + "" + "" * 10 + "" + "" * 8 + "" + "" * 8 + "" + "" * 8 + "" + "" * 10 + "" + "" * 10 + "" + "" * 10 + "" + "" * 8 + "" + "" * 8 + "" + "" * 12 + "" + "" * 10 + "" + "" * 10 + "" + "" * 10 + "" + "" * 10 + "" + "" * 10 + "" + "" * 21 + "")
print("{:<20}{:^8}{:^6}{:^6}{:^6}{:^8}{:^8}{:^8}{:^6}{:^6}{:^10}{:^8}{:^8}{:^8}{:^8}{:^8}{:^19}".format(
"Model", "Prompt", "Images", "Valid", "Hallu", "Empty%", "Hallu%", "AvgTime", "Resize", "Conf?", "Conf Qual", "Prec%", "Recall%", "F1%", "Conf✓", "Conf✗", "Date"
))
print("" + "" * 22 + "" + "" * 10 + "" + "" * 8 + "" + "" * 8 + "" + "" * 8 + "" + "" * 10 + "" + "" * 10 + "" + "" * 10 + "" + "" * 8 + "" + "" * 8 + "" + "" * 12 + "" + "" * 10 + "" + "" * 10 + "" + "" * 10 + "" + "" * 10 + "" + "" * 10 + "" + "" * 21 + "")
# Data rows
for i, result in enumerate(results):
model = result.get('model_name', 'unknown')[:20]
prompt = Path(result.get('prompt_file', 'unknown')).stem[:8]
total_images = result.get('total_images', 0)
valid_jerseys = result.get('total_valid_jerseys', 0)
hallucinated = result.get('total_hallucinated', 0)
total_detections = valid_jerseys + hallucinated
empty_pct = (result.get('images_without_jerseys', 0) / total_images * 100) if total_images > 0 else 0
hallu_pct = (hallucinated / total_detections * 100) if total_detections > 0 else 0
avg_time = result.get('avg_processing_time', 0)
# Calculate confidence quality
conf_stats = result.get('confidence_stats')
has_conf = 'Yes' if conf_stats else 'No'
stdev, quality = calculate_confidence_stdev(conf_stats)
# Format confidence quality display
if stdev is not None:
conf_qual_str = f"{quality} ({stdev:.1f})"
else:
conf_qual_str = "N/A"
# Ground truth metrics
gt = result.get('ground_truth', {})
precision = gt.get('overall_precision', 0) * 100
recall = gt.get('overall_recall', 0) * 100
f1 = gt.get('overall_f1', 0) * 100
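# Precision/recall/F1 arrive pre-computed in the results file; presumably the test
# script derives them in the standard way (precision = TP / (TP + FP),
# recall = TP / (TP + FN), F1 = 2 * P * R / (P + R)) - this script only reports them.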
# Confidence calibration
conf_correct = gt.get('avg_confidence_correct')
conf_incorrect = gt.get('avg_confidence_incorrect')
conf_correct_str = f"{conf_correct:.1f}" if conf_correct is not None else "N/A"
conf_incorrect_str = f"{conf_incorrect:.1f}" if conf_incorrect is not None else "N/A"
resize_max = result.get('resize_max')
resize_str = f"{resize_max}px" if resize_max else "No"
timestamp = datetime.fromisoformat(result.get('timestamp', '')).strftime('%Y-%m-%d %H:%M')
print("{:<20}{:>8}{:>6}{:>6}{:>6}{:>7.1f}%{:>7.1f}%{:>7.2f}s │ {:>6}{:>6}{:>10}{:>7.1f}%{:>7.1f}%{:>7.1f}%{:>8}{:>8}{:>19}".format(
model, prompt, total_images, valid_jerseys, hallucinated, empty_pct, hallu_pct, avg_time, resize_str, has_conf, conf_qual_str, precision, recall, f1, conf_correct_str, conf_incorrect_str, timestamp
))
# Bottom border
print("" + "" * 22 + "" + "" * 10 + "" + "" * 8 + "" + "" * 8 + "" + "" * 8 + "" + "" * 10 + "" + "" * 10 + "" + "" * 10 + "" + "" * 8 + "" + "" * 8 + "" + "" * 12 + "" + "" * 10 + "" + "" * 10 + "" + "" * 10 + "" + "" * 10 + "" + "" * 10 + "" + "" * 21 + "")
print()
def print_comparison_table(results: List[Dict[str, Any]]):
"""Print a simple comparison table of all test runs."""
if not results:
print("No results to display.")
return
print("=" * 140)
print("MODEL COMPARISON TABLE")
print("=" * 140)
print()
# Header
header = f"{'Model':<25} {'Prompt':<30} {'Images':<8} {'Valid':<8} {'Hallu':<8} {'Empty%':<9} {'AvgTime':<9} {'Resize':<8} {'Conf?':<7} {'Date':<20}"
print(header)
print("-" * 150)
# Data rows
for result in results:
model = result.get('model_name', 'unknown')[:24]
prompt = Path(result.get('prompt_file', 'unknown')).stem[:29]
total_images = result.get('total_images', 0)
valid_jerseys = result.get('total_valid_jerseys', 0)
hallucinated = result.get('total_hallucinated', 0)
empty_pct = (result.get('images_without_jerseys', 0) / total_images * 100) if total_images > 0 else 0
avg_time = result.get('avg_processing_time', 0)
has_conf = 'Yes' if result.get('confidence_stats') else 'No'
resize_max = result.get('resize_max')
resize_str = f"{resize_max}px" if resize_max else "No"
timestamp = datetime.fromisoformat(result.get('timestamp', '')).strftime('%Y-%m-%d %H:%M:%S')
row = f"{model:<25} {prompt:<30} {total_images:<8} {valid_jerseys:<8} {hallucinated:<8} {empty_pct:<8.1f}% {avg_time:<8.2f}s {resize_str:<8} {has_conf:<7} {timestamp:<20}"
print(row)
print()
def print_model_performance_chart(results: List[Dict[str, Any]]):
"""Print a performance chart showing key metrics for each model."""
if not results:
return
print("=" * 140)
print("MODEL PERFORMANCE CHART")
print("=" * 140)
print()
# Group results by model
models = {}
for result in results:
model_name = result.get('model_name', 'unknown')
if model_name not in models:
models[model_name] = []
models[model_name].append(result)
# Calculate aggregate statistics for each model
for model_name, model_results in models.items():
print(f"\n{model_name}")
print("-" * 100)
total_runs = len(model_results)
total_images = sum(r.get('total_images', 0) for r in model_results)
total_valid = sum(r.get('total_valid_jerseys', 0) for r in model_results)
total_hallu = sum(r.get('total_hallucinated', 0) for r in model_results)
avg_empty_pct = sum((r.get('images_without_jerseys', 0) / r.get('total_images', 1) * 100) for r in model_results) / total_runs if total_runs > 0 else 0
avg_time = sum(r.get('avg_processing_time', 0) for r in model_results) / total_runs if total_runs > 0 else 0
# Check if any runs have confidence stats
has_confidence = any(r.get('confidence_stats') for r in model_results)
# Check resize status
resize_enabled = any(r.get('resize_enabled', False) for r in model_results)
resize_max_values = [r.get('resize_max') for r in model_results if r.get('resize_max')]
resize_info = f"{resize_max_values[0]}px" if resize_max_values else "Disabled"
print(f" Total test runs: {total_runs}")
print(f" Total images processed: {total_images}")
print(f" Total valid detections: {total_valid}")
print(f" Total hallucinations: {total_hallu}")
print(f" Average empty response rate: {avg_empty_pct:.1f}%")
print(f" Average processing time: {avg_time:.2f}s/image")
print(f" Resize: {resize_info}")
print(f" Confidence support: {'Yes' if has_confidence else 'No'}")
# Show hallucination rate
if total_valid + total_hallu > 0:
hallu_rate = (total_hallu / (total_valid + total_hallu) * 100)
print(f" Hallucination rate: {hallu_rate:.1f}%")
# Visual bar
bar_length = int(hallu_rate / 2) # Scale to max 50 chars
bar = '█' * bar_length
print(f" Hallucination chart: {bar} ({hallu_rate:.1f}%)")
# Ground truth performance
gt_runs = [r for r in model_results if r.get('ground_truth')]
if gt_runs:
avg_precision = sum(r['ground_truth'].get('overall_precision', 0) for r in gt_runs) / len(gt_runs)
avg_recall = sum(r['ground_truth'].get('overall_recall', 0) for r in gt_runs) / len(gt_runs)
avg_f1 = sum(r['ground_truth'].get('overall_f1', 0) for r in gt_runs) / len(gt_runs)
total_expected = sum(r['ground_truth'].get('total_expected', 0) for r in gt_runs)
total_tp = sum(r['ground_truth'].get('total_true_positives', 0) for r in gt_runs)
total_fp = sum(r['ground_truth'].get('total_false_positives', 0) for r in gt_runs)
total_fn = sum(r['ground_truth'].get('total_false_negatives', 0) for r in gt_runs)
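# Note: avg_precision/avg_recall/avg_f1 above are macro averages of each run's
# pre-computed overall_* values. Pooled (micro) metrics could instead be derived
# from the summed counts, e.g. total_tp / (total_tp + total_fp) for precision.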
print(f"\n Ground truth performance:")
print(f" Total expected jerseys: {total_expected}")
print(f" True positives: {total_tp}")
print(f" False positives: {total_fp}")
print(f" False negatives: {total_fn}")
print(f" Average Precision: {avg_precision:.1%}")
print(f" Average Recall: {avg_recall:.1%}")
print(f" Average F1 Score: {avg_f1:.1%}")
# Visual F1 bar
bar_length = int(avg_f1 * 50) # Scale to max 50 chars
bar = '█' * bar_length
print(f" F1 Score chart: {bar} ({avg_f1:.1%})")
# Confidence calibration
conf_correct_vals = [r['ground_truth'].get('avg_confidence_correct') for r in gt_runs if r['ground_truth'].get('avg_confidence_correct') is not None]
conf_incorrect_vals = [r['ground_truth'].get('avg_confidence_incorrect') for r in gt_runs if r['ground_truth'].get('avg_confidence_incorrect') is not None]
if conf_correct_vals or conf_incorrect_vals:
print(f"\n Confidence calibration:")
if conf_correct_vals:
avg_conf_correct = sum(conf_correct_vals) / len(conf_correct_vals)
print(f" Avg confidence (correct detections): {avg_conf_correct:.2f}")
if conf_incorrect_vals:
avg_conf_incorrect = sum(conf_incorrect_vals) / len(conf_incorrect_vals)
print(f" Avg confidence (incorrect detections): {avg_conf_incorrect:.2f}")
if conf_correct_vals and conf_incorrect_vals:
diff = sum(conf_correct_vals) / len(conf_correct_vals) - sum(conf_incorrect_vals) / len(conf_incorrect_vals)
if diff > 0:
print(f" Confidence difference: +{diff:.2f} (good calibration)")
else:
print(f" Confidence difference: {diff:.2f} (⚠ poor calibration)")
# Confidence distribution if available
if has_confidence:
print(f"\n Confidence distribution (across all runs):")
all_dist = {'90-100': 0, '70-89': 0, '50-69': 0, '30-49': 0, '0-29': 0}
total_conf_count = 0
for result in model_results:
conf_stats = result.get('confidence_stats')
if conf_stats and 'distribution' in conf_stats:
for bucket, count in conf_stats['distribution'].items():
all_dist[bucket] += count
total_conf_count += count
if total_conf_count > 0:
for bucket, count in all_dist.items():
pct = (count / total_conf_count * 100) if total_conf_count > 0 else 0
bar_length = int(pct / 2)
bar = '█' * bar_length
print(f" {bucket}: {count:4d} ({pct:5.1f}%) {bar}")
print()
def print_best_performers(results: List[Dict[str, Any]]):
"""Print summary of best performing models."""
if not results:
return
print("=" * 140)
print("BEST PERFORMERS")
print("=" * 140)
print()
# Group by model and calculate averages
models = {}
for result in results:
model_name = result.get('model_name', 'unknown')
if model_name not in models:
models[model_name] = {
'runs': 0,
'total_hallu': 0,
'total_detections': 0,
'avg_time': [],
'empty_capable': []
}
models[model_name]['runs'] += 1
models[model_name]['total_hallu'] += result.get('total_hallucinated', 0)
models[model_name]['total_detections'] += result.get('total_valid_jerseys', 0) + result.get('total_hallucinated', 0)
models[model_name]['avg_time'].append(result.get('avg_processing_time', 0))
models[model_name]['empty_capable'].append(result.get('empty_response_capable', False))
# Calculate scores
model_scores = []
for model_name, stats in models.items():
hallu_rate = (stats['total_hallu'] / stats['total_detections'] * 100) if stats['total_detections'] > 0 else 0
avg_time = sum(stats['avg_time']) / len(stats['avg_time']) if stats['avg_time'] else 0
empty_capable = any(stats['empty_capable'])
model_scores.append({
'model': model_name,
'hallu_rate': hallu_rate,
'avg_time': avg_time,
'empty_capable': empty_capable,
'runs': stats['runs']
})
# Sort by hallucination rate (lower is better)
model_scores.sort(key=lambda x: x['hallu_rate'])
print("Lowest hallucination rate:")
for i, score in enumerate(model_scores[:3], 1):
capable = "" if score['empty_capable'] else ""
print(f" {i}. {score['model']}: {score['hallu_rate']:.1f}% (empty capable: {capable}, avg time: {score['avg_time']:.2f}s)")
print()
# Sort by speed (lower is better)
model_scores.sort(key=lambda x: x['avg_time'])
print("Fastest processing:")
for i, score in enumerate(model_scores[:3], 1):
capable = "" if score['empty_capable'] else ""
print(f" {i}. {score['model']}: {score['avg_time']:.2f}s/image (hallu rate: {score['hallu_rate']:.1f}%, empty capable: {capable})")
print()
# Models with empty response capability
empty_models = [s for s in model_scores if s['empty_capable']]
print(f"Models with empty response capability: {len(empty_models)}/{len(model_scores)}")
for score in empty_models:
print(f" - {score['model']}")
print()
# Best F1 scores (ground truth accuracy)
models_with_gt = {}
for result in results:
if result.get('ground_truth'):
model_name = result.get('model_name', 'unknown')
if model_name not in models_with_gt:
models_with_gt[model_name] = {
'f1_scores': [],
'precision_scores': [],
'recall_scores': []
}
gt = result['ground_truth']
models_with_gt[model_name]['f1_scores'].append(gt.get('overall_f1', 0))
models_with_gt[model_name]['precision_scores'].append(gt.get('overall_precision', 0))
models_with_gt[model_name]['recall_scores'].append(gt.get('overall_recall', 0))
if models_with_gt:
gt_scores = []
for model_name, stats in models_with_gt.items():
avg_f1 = sum(stats['f1_scores']) / len(stats['f1_scores']) if stats['f1_scores'] else 0
avg_precision = sum(stats['precision_scores']) / len(stats['precision_scores']) if stats['precision_scores'] else 0
avg_recall = sum(stats['recall_scores']) / len(stats['recall_scores']) if stats['recall_scores'] else 0
gt_scores.append({
'model': model_name,
'avg_f1': avg_f1,
'avg_precision': avg_precision,
'avg_recall': avg_recall
})
# Sort by F1 score (higher is better)
gt_scores.sort(key=lambda x: x['avg_f1'], reverse=True)
print("Highest ground truth F1 scores:")
for i, score in enumerate(gt_scores[:3], 1):
print(f" {i}. {score['model']}: F1={score['avg_f1']:.1%} (Precision={score['avg_precision']:.1%}, Recall={score['avg_recall']:.1%})")
print()
def export_to_csv(results: List[Dict[str, Any]], csv_file: str):
"""Export results to CSV file for spreadsheet import."""
if not results:
print("No results to export.")
return
try:
with open(csv_file, 'w', newline='') as f:
# Define CSV columns
fieldnames = [
'timestamp',
'model_name',
'model_tag',
'prompt_file',
'prompt_length',
'total_images',
'images_with_jerseys',
'images_without_jerseys',
'images_with_errors',
'total_raw_detections',
'total_valid_jerseys',
'total_hallucinated',
'hallucination_rate_pct',
'empty_response_rate_pct',
'avg_processing_time',
'total_processing_time',
'resize_enabled',
'resize_max',
'images_resized',
'has_confidence',
'confidence_avg',
'confidence_min',
'confidence_max',
'confidence_count',
'confidence_stdev',
'confidence_quality',
'conf_90_100',
'conf_70_89',
'conf_50_69',
'conf_30_49',
'conf_0_29',
# Ground truth columns
'gt_total_expected',
'gt_total_true_positives',
'gt_total_false_positives',
'gt_total_false_negatives',
'gt_overall_precision',
'gt_overall_recall',
'gt_overall_f1',
'gt_avg_precision',
'gt_avg_recall',
'gt_avg_f1',
# Confidence calibration
'gt_avg_confidence_correct',
'gt_avg_confidence_incorrect',
'gt_confidence_correct_count',
'gt_confidence_incorrect_count'
]
writer = csv.DictWriter(f, fieldnames=fieldnames)
writer.writeheader()
# Write data rows
for result in results:
# Calculate derived values
total_images = result.get('total_images', 0)
valid_jerseys = result.get('total_valid_jerseys', 0)
hallucinated = result.get('total_hallucinated', 0)
total_detections = valid_jerseys + hallucinated
hallu_rate = (hallucinated / total_detections * 100) if total_detections > 0 else 0
empty_rate = (result.get('images_without_jerseys', 0) / total_images * 100) if total_images > 0 else 0
# Extract confidence stats
conf_stats = result.get('confidence_stats')
has_confidence = conf_stats is not None
conf_avg = conf_stats.get('avg', '') if conf_stats else ''
conf_min = conf_stats.get('min', '') if conf_stats else ''
conf_max = conf_stats.get('max', '') if conf_stats else ''
conf_count = conf_stats.get('count', '') if conf_stats else ''
# Calculate confidence standard deviation and quality
conf_stdev, conf_quality = calculate_confidence_stdev(conf_stats)
# Extract confidence distribution
conf_dist = conf_stats.get('distribution', {}) if conf_stats else {}
conf_90_100 = conf_dist.get('90-100', '')
conf_70_89 = conf_dist.get('70-89', '')
conf_50_69 = conf_dist.get('50-69', '')
conf_30_49 = conf_dist.get('30-49', '')
conf_0_29 = conf_dist.get('0-29', '')
# Extract ground truth stats
gt = result.get('ground_truth', {})
gt_total_expected = gt.get('total_expected', '')
gt_total_tp = gt.get('total_true_positives', '')
gt_total_fp = gt.get('total_false_positives', '')
gt_total_fn = gt.get('total_false_negatives', '')
gt_overall_precision = gt.get('overall_precision', '')
gt_overall_recall = gt.get('overall_recall', '')
gt_overall_f1 = gt.get('overall_f1', '')
gt_avg_precision = gt.get('avg_precision', '')
gt_avg_recall = gt.get('avg_recall', '')
gt_avg_f1 = gt.get('avg_f1', '')
gt_avg_conf_correct = gt.get('avg_confidence_correct', '')
gt_avg_conf_incorrect = gt.get('avg_confidence_incorrect', '')
gt_conf_correct_count = gt.get('confidence_correct_count', '')
gt_conf_incorrect_count = gt.get('confidence_incorrect_count', '')
row = {
'timestamp': result.get('timestamp', ''),
'model_name': result.get('model_name', ''),
'model_tag': result.get('model_tag', ''),
'prompt_file': result.get('prompt_file', ''),
'prompt_length': result.get('prompt_length', ''),
'total_images': total_images,
'images_with_jerseys': result.get('images_with_jerseys', ''),
'images_without_jerseys': result.get('images_without_jerseys', ''),
'images_with_errors': result.get('images_with_errors', ''),
'total_raw_detections': result.get('total_raw_detections', ''),
'total_valid_jerseys': valid_jerseys,
'total_hallucinated': hallucinated,
'hallucination_rate_pct': f"{hallu_rate:.2f}",
'empty_response_rate_pct': f"{empty_rate:.2f}",
'avg_processing_time': f"{result.get('avg_processing_time', 0):.4f}",
'total_processing_time': f"{result.get('total_processing_time', 0):.2f}",
'resize_enabled': result.get('resize_enabled', False),
'resize_max': result.get('resize_max', ''),
'images_resized': result.get('images_resized', ''),
'has_confidence': has_confidence,
'confidence_avg': f"{conf_avg:.2f}" if conf_avg != '' else '',
'confidence_min': conf_min,
'confidence_max': conf_max,
'confidence_count': conf_count,
'confidence_stdev': f"{conf_stdev:.2f}" if conf_stdev is not None else '',
'confidence_quality': conf_quality if conf_quality != 'N/A' else '',
'conf_90_100': conf_90_100,
'conf_70_89': conf_70_89,
'conf_50_69': conf_50_69,
'conf_30_49': conf_30_49,
'conf_0_29': conf_0_29,
# Ground truth data
'gt_total_expected': gt_total_expected,
'gt_total_true_positives': gt_total_tp,
'gt_total_false_positives': gt_total_fp,
'gt_total_false_negatives': gt_total_fn,
'gt_overall_precision': f"{gt_overall_precision:.4f}" if gt_overall_precision != '' else '',
'gt_overall_recall': f"{gt_overall_recall:.4f}" if gt_overall_recall != '' else '',
'gt_overall_f1': f"{gt_overall_f1:.4f}" if gt_overall_f1 != '' else '',
'gt_avg_precision': f"{gt_avg_precision:.4f}" if gt_avg_precision != '' else '',
'gt_avg_recall': f"{gt_avg_recall:.4f}" if gt_avg_recall != '' else '',
'gt_avg_f1': f"{gt_avg_f1:.4f}" if gt_avg_f1 != '' else '',
'gt_avg_confidence_correct': f"{gt_avg_conf_correct:.2f}" if gt_avg_conf_correct not in ('', None) else '',
'gt_avg_confidence_incorrect': f"{gt_avg_conf_incorrect:.2f}" if gt_avg_conf_incorrect not in ('', None) else '',
'gt_confidence_correct_count': gt_conf_correct_count,
'gt_confidence_incorrect_count': gt_conf_incorrect_count
}
writer.writerow(row)
print(f"✓ Results exported to CSV: {csv_file}")
print(f" Rows: {len(results)}")
print(f" Columns: {len(fieldnames)}")
except Exception as e:
print(f"❌ Failed to export to CSV: {e}")
sys.exit(1)
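# Quick sanity check of an exported CSV (illustrative; adjust the filename to
# whatever was passed to --csv / --csv-only):
#
#   import csv
#   with open('results.csv', newline='') as f:
#       rows = list(csv.DictReader(f))
#   print(len(rows), rows[0]['model_name'], rows[0]['hallucination_rate_pct'])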
def main():
"""Main entry point for the analysis script."""
parser = argparse.ArgumentParser(
description='Analyze jersey detection test results',
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
# Show analysis
python analyze_jersey_results.py
# Show analysis and export to CSV
python analyze_jersey_results.py --csv results.csv
# Export to CSV only (no analysis display)
python analyze_jersey_results.py --csv-only results.csv
# Analyze custom results file
python analyze_jersey_results.py custom_results.jsonl --csv custom.csv
"""
)
parser.add_argument('results_file', nargs='?', default='jersey_detection_results.jsonl',
help='Path to results file (default: jersey_detection_results.jsonl)')
parser.add_argument('--csv', metavar='FILE', dest='csv_file',
help='Export results to CSV file (in addition to showing analysis)')
parser.add_argument('--csv-only', metavar='FILE', dest='csv_only',
help='Export to CSV file only, skip analysis display')
args = parser.parse_args()
# Check if file exists
if not Path(args.results_file).exists():
print(f"Error: Results file not found: {args.results_file}")
print(f"Run some tests first with test_jersey_detection.py to generate results.")
sys.exit(1)
# Load results
results = load_results(args.results_file)
if not results:
print(f"No results found in {args.results_file}")
sys.exit(0)
print(f"Loaded {len(results)} test run(s) from {args.results_file}\n")
# Handle CSV-only mode
if args.csv_only:
export_to_csv(results, args.csv_only)
return
# Print analyses (unless CSV-only mode)
print_ascii_comparison_table(results)
print_model_performance_chart(results)
print_best_performers(results)
# Export to CSV if requested
if args.csv_file:
print()
export_to_csv(results, args.csv_file)
if __name__ == '__main__':
main()