#!/usr/bin/env python3
"""
Analyze jersey detection test results and compare model performance.

Usage:
    python analyze_jersey_results.py [results_file]
    python analyze_jersey_results.py [results_file] --csv output.csv
    python analyze_jersey_results.py [results_file] --csv-only output.csv

Arguments:
    results_file: Path to the results file (default: jersey_detection_results.jsonl)
    --csv: Also export results to a CSV file
    --csv-only: Export to CSV only, skip analysis display
"""

import argparse
import csv
import json
import math
import sys
from pathlib import Path
from typing import List, Dict, Any
from datetime import datetime


def load_results(results_file: str) -> List[Dict[str, Any]]:
    """Load test results from a JSON Lines file."""
    results = []
    try:
        with open(results_file, 'r') as f:
            for line in f:
                line = line.strip()
                if line:
                    results.append(json.loads(line))
        return results
    except FileNotFoundError:
        print(f"Error: Results file not found: {results_file}")
        sys.exit(1)
    except json.JSONDecodeError as e:
        print(f"Error: Invalid JSON in results file: {e}")
        sys.exit(1)


def calculate_confidence_stdev(conf_stats: Dict[str, Any]) -> tuple:
    """
    Calculate standard deviation of confidence scores from the bucketed distribution.

    Returns:
        Tuple of (stdev, quality_rating)
        quality_rating: "Excel" (shortened "Excellent"), "Good", "Fair", "Poor", or "N/A"
    """
    if not conf_stats or 'distribution' not in conf_stats:
        return None, "N/A"

    dist = conf_stats['distribution']

    # Reconstruct approximate confidence values from buckets,
    # using the midpoint of each bucket.
    values = []
    bucket_midpoints = {
        '90-100': 95,
        '70-89': 79.5,
        '50-69': 59.5,
        '30-49': 39.5,
        '0-29': 14.5
    }
    for bucket, count in dist.items():
        midpoint = bucket_midpoints.get(bucket, 50)
        values.extend([midpoint] * count)

    if len(values) < 2:
        return None, "N/A"

    # Calculate (population) standard deviation over the approximate values
    mean = sum(values) / len(values)
    variance = sum((x - mean) ** 2 for x in values) / len(values)
    stdev = math.sqrt(variance)

    # Assign quality rating based on StDev
    if stdev < 5:
        quality = "Poor"
    elif stdev < 10:
        quality = "Fair"
    elif stdev < 15:
        quality = "Good"
    else:
        quality = "Excel"  # Shortened from "Excellent" to fit the table

    return stdev, quality


def print_ascii_comparison_table(results: List[Dict[str, Any]]):
    """Print a detailed ASCII comparison table of all test runs."""
    if not results:
        print("No results to display.")
        return

    print("=" * 280)
    print("DETAILED MODEL COMPARISON TABLE")
    print("=" * 280)
    print()
    print("Confidence Quality: Excellent (>15), Good (10-15), Fair (5-10), Poor (<5)")
    print("Confidence Calibration: Conf✓ = avg confidence on correct detections, Conf✗ = avg confidence on incorrect detections")
    print()

    # Table headers with ground truth and confidence calibration columns
    print("┌" + "─" * 22 + "┬" + "─" * 10 + "┬" + "─" * 8 + "┬" + "─" * 8 + "┬" + "─" * 8 + "┬" + "─" * 10 + "┬" + "─" * 10 + "┬" + "─" * 10 + "┬" + "─" * 8 + "┬" + "─" * 8 + "┬" + "─" * 12 + "┬" + "─" * 10 + "┬" + "─" * 10 + "┬" + "─" * 10 + "┬" + "─" * 10 + "┬" + "─" * 10 + "┬" + "─" * 21 + "┐")
    print("│ {:<20} │ {:^8} │ {:^6} │ {:^6} │ {:^6} │ {:^8} │ {:^8} │ {:^8} │ {:^6} │ {:^6} │ {:^10} │ {:^8} │ {:^8} │ {:^8} │ {:^8} │ {:^8} │ {:^19} │".format(
        "Model", "Prompt", "Images", "Valid", "Hallu", "Empty%", "Hallu%", "AvgTime",
        "Resize", "Conf?", "Conf Qual", "Prec%", "Recall%", "F1%", "Conf✓", "Conf✗", "Date"
    ))
    print("├" + "─" * 22 + "┼" + "─" * 10 + "┼" + "─" * 8 + "┼" + "─" * 8 + "┼" + "─" * 8 + "┼" + "─" * 10 + "┼" + "─" * 10 + "┼" + "─" * 10 + "┼" + "─" * 8 + "┼" + "─" * 8 + "┼" + "─" * 12 + "┼" + "─" * 10 + "┼" + "─" * 10 + "┼" + "─" * 10 + "┼" + "─" * 10 + "┼" + "─" * 10 + "┼" + "─" * 21 + "┤")

    # Data rows
    for i, result in enumerate(results):
        model = result.get('model_name', 'unknown')[:20]
        prompt = Path(result.get('prompt_file', 'unknown')).stem[:8]
        total_images = result.get('total_images', 0)
        valid_jerseys = result.get('total_valid_jerseys', 0)
        hallucinated = result.get('total_hallucinated', 0)
        total_detections = valid_jerseys + hallucinated
        empty_pct = (result.get('images_without_jerseys', 0) / total_images * 100) if total_images > 0 else 0
        hallu_pct = (hallucinated / total_detections * 100) if total_detections > 0 else 0
        avg_time = result.get('avg_processing_time', 0)

        # Calculate confidence quality
        conf_stats = result.get('confidence_stats')
        has_conf = 'Yes' if conf_stats else 'No'
        stdev, quality = calculate_confidence_stdev(conf_stats)

        # Format confidence quality display
        if stdev is not None:
            conf_qual_str = f"{quality} ({stdev:.1f})"
        else:
            conf_qual_str = "N/A"

        # Ground truth metrics
        gt = result.get('ground_truth', {})
        precision = gt.get('overall_precision', 0) * 100
        recall = gt.get('overall_recall', 0) * 100
        f1 = gt.get('overall_f1', 0) * 100

        # Confidence calibration
        conf_correct = gt.get('avg_confidence_correct')
        conf_incorrect = gt.get('avg_confidence_incorrect')
        conf_correct_str = f"{conf_correct:.1f}" if conf_correct is not None else "N/A"
        conf_incorrect_str = f"{conf_incorrect:.1f}" if conf_incorrect is not None else "N/A"

        resize_max = result.get('resize_max')
        resize_str = f"{resize_max}px" if resize_max else "No"
        timestamp = datetime.fromisoformat(result.get('timestamp', '')).strftime('%Y-%m-%d %H:%M')

        print("│ {:<20} │ {:>8} │ {:>6} │ {:>6} │ {:>6} │ {:>7.1f}% │ {:>7.1f}% │ {:>7.2f}s │ {:>6} │ {:>6} │ {:>10} │ {:>7.1f}% │ {:>7.1f}% │ {:>7.1f}% │ {:>8} │ {:>8} │ {:>19} │".format(
            model, prompt, total_images, valid_jerseys, hallucinated, empty_pct, hallu_pct, avg_time,
            resize_str, has_conf, conf_qual_str, precision, recall, f1,
            conf_correct_str, conf_incorrect_str, timestamp
        ))

    # Bottom border
    print("└" + "─" * 22 + "┴" + "─" * 10 + "┴" + "─" * 8 + "┴" + "─" * 8 + "┴" + "─" * 8 + "┴" + "─" * 10 + "┴" + "─" * 10 + "┴" + "─" * 10 + "┴" + "─" * 8 + "┴" + "─" * 8 + "┴" + "─" * 12 + "┴" + "─" * 10 + "┴" + "─" * 10 + "┴" + "─" * 10 + "┴" + "─" * 10 + "┴" + "─" * 10 + "┴" + "─" * 21 + "┘")
    print()


def print_comparison_table(results: List[Dict[str, Any]]):
    """Print a simple comparison table of all test runs."""
    if not results:
        print("No results to display.")
        return

    print("=" * 140)
    print("MODEL COMPARISON TABLE")
    print("=" * 140)
    print()

    # Header
    header = f"{'Model':<25} {'Prompt':<30} {'Images':<8} {'Valid':<8} {'Hallu':<8} {'Empty%':<9} {'AvgTime':<9} {'Resize':<8} {'Conf?':<7} {'Date':<20}"
    print(header)
    print("-" * 150)

    # Data rows
    for result in results:
        model = result.get('model_name', 'unknown')[:24]
        prompt = Path(result.get('prompt_file', 'unknown')).stem[:29]
        total_images = result.get('total_images', 0)
        valid_jerseys = result.get('total_valid_jerseys', 0)
        hallucinated = result.get('total_hallucinated', 0)
        empty_pct = (result.get('images_without_jerseys', 0) / total_images * 100) if total_images > 0 else 0
        avg_time = result.get('avg_processing_time', 0)
        has_conf = 'Yes' if result.get('confidence_stats') else 'No'
        resize_max = result.get('resize_max')
        resize_str = f"{resize_max}px" if resize_max else "No"
        timestamp = datetime.fromisoformat(result.get('timestamp', '')).strftime('%Y-%m-%d %H:%M:%S')

        row = f"{model:<25} {prompt:<30} {total_images:<8} {valid_jerseys:<8} {hallucinated:<8} {empty_pct:<8.1f}% {avg_time:<8.2f}s {resize_str:<8} {has_conf:<7} {timestamp:<20}"
        print(row)

    print()


def print_model_performance_chart(results: List[Dict[str, Any]]):
    """Print a performance chart showing key metrics for each model."""
    if not results:
        return

    print("=" * 140)
    print("MODEL PERFORMANCE CHART")
    print("=" * 140)
    print()

    # Group results by model
    models = {}
    for result in results:
        model_name = result.get('model_name', 'unknown')
        if model_name not in models:
            models[model_name] = []
        models[model_name].append(result)

    # Calculate aggregate statistics for each model
    for model_name, model_results in models.items():
        print(f"\n{model_name}")
        print("-" * 100)

        total_runs = len(model_results)
        total_images = sum(r.get('total_images', 0) for r in model_results)
        total_valid = sum(r.get('total_valid_jerseys', 0) for r in model_results)
        total_hallu = sum(r.get('total_hallucinated', 0) for r in model_results)
        avg_empty_pct = sum((r.get('images_without_jerseys', 0) / r.get('total_images', 1) * 100) for r in model_results) / total_runs if total_runs > 0 else 0
        avg_time = sum(r.get('avg_processing_time', 0) for r in model_results) / total_runs if total_runs > 0 else 0

        # Check if any runs have confidence stats
        has_confidence = any(r.get('confidence_stats') for r in model_results)

        # Check resize status
        resize_enabled = any(r.get('resize_enabled', False) for r in model_results)
        resize_max_values = [r.get('resize_max') for r in model_results if r.get('resize_max')]
        resize_info = f"{resize_max_values[0]}px" if resize_max_values else "Disabled"

        print(f"  Total test runs: {total_runs}")
        print(f"  Total images processed: {total_images}")
        print(f"  Total valid detections: {total_valid}")
        print(f"  Total hallucinations: {total_hallu}")
        print(f"  Average empty response rate: {avg_empty_pct:.1f}%")
        print(f"  Average processing time: {avg_time:.2f}s/image")
        print(f"  Resize: {resize_info}")
        print(f"  Confidence support: {'Yes' if has_confidence else 'No'}")

        # Show hallucination rate
        if total_valid + total_hallu > 0:
            hallu_rate = (total_hallu / (total_valid + total_hallu) * 100)
            print(f"  Hallucination rate: {hallu_rate:.1f}%")

            # Visual bar
            bar_length = int(hallu_rate / 2)  # Scale to max 50 chars
            bar = '█' * bar_length
            print(f"  Hallucination chart: {bar} ({hallu_rate:.1f}%)")

        # Ground truth performance
        gt_runs = [r for r in model_results if r.get('ground_truth')]
        if gt_runs:
            avg_precision = sum(r['ground_truth'].get('overall_precision', 0) for r in gt_runs) / len(gt_runs)
            avg_recall = sum(r['ground_truth'].get('overall_recall', 0) for r in gt_runs) / len(gt_runs)
            avg_f1 = sum(r['ground_truth'].get('overall_f1', 0) for r in gt_runs) / len(gt_runs)
            total_expected = sum(r['ground_truth'].get('total_expected', 0) for r in gt_runs)
            total_tp = sum(r['ground_truth'].get('total_true_positives', 0) for r in gt_runs)
            total_fp = sum(r['ground_truth'].get('total_false_positives', 0) for r in gt_runs)
            total_fn = sum(r['ground_truth'].get('total_false_negatives', 0) for r in gt_runs)

            print(f"\n  Ground truth performance:")
            print(f"    Total expected jerseys: {total_expected}")
            print(f"    True positives: {total_tp}")
            print(f"    False positives: {total_fp}")
            print(f"    False negatives: {total_fn}")
            print(f"    Average Precision: {avg_precision:.1%}")
            print(f"    Average Recall: {avg_recall:.1%}")
            print(f"    Average F1 Score: {avg_f1:.1%}")

            # Visual F1 bar
            bar_length = int(avg_f1 * 50)  # Scale to max 50 chars
            bar = '█' * bar_length
            print(f"    F1 Score chart: {bar} ({avg_f1:.1%})")

            # Confidence calibration
            conf_correct_vals = [r['ground_truth'].get('avg_confidence_correct') for r in gt_runs if r['ground_truth'].get('avg_confidence_correct') is not None]
            conf_incorrect_vals = [r['ground_truth'].get('avg_confidence_incorrect') for r in gt_runs if r['ground_truth'].get('avg_confidence_incorrect') is not None]

            if conf_correct_vals or conf_incorrect_vals:
                print(f"\n  Confidence calibration:")
                if conf_correct_vals:
                    avg_conf_correct = sum(conf_correct_vals) / len(conf_correct_vals)
                    print(f"    Avg confidence (correct detections): {avg_conf_correct:.2f}")
                if conf_incorrect_vals:
                    avg_conf_incorrect = sum(conf_incorrect_vals) / len(conf_incorrect_vals)
                    print(f"    Avg confidence (incorrect detections): {avg_conf_incorrect:.2f}")
                if conf_correct_vals and conf_incorrect_vals:
                    diff = sum(conf_correct_vals) / len(conf_correct_vals) - sum(conf_incorrect_vals) / len(conf_incorrect_vals)
                    if diff > 0:
                        print(f"    Confidence difference: +{diff:.2f} (good calibration)")
                    else:
                        print(f"    Confidence difference: {diff:.2f} (⚠ poor calibration)")

        # Confidence distribution if available
        if has_confidence:
            print(f"\n  Confidence distribution (across all runs):")
            all_dist = {'90-100': 0, '70-89': 0, '50-69': 0, '30-49': 0, '0-29': 0}
            total_conf_count = 0
            for result in model_results:
                conf_stats = result.get('confidence_stats')
                if conf_stats and 'distribution' in conf_stats:
                    for bucket, count in conf_stats['distribution'].items():
                        all_dist[bucket] += count
                        total_conf_count += count

            if total_conf_count > 0:
                for bucket, count in all_dist.items():
                    pct = (count / total_conf_count * 100) if total_conf_count > 0 else 0
                    bar_length = int(pct / 2)
                    bar = '█' * bar_length
                    print(f"    {bucket}: {count:4d} ({pct:5.1f}%) {bar}")

    print()


def print_best_performers(results: List[Dict[str, Any]]):
    """Print a summary of the best performing models."""
    if not results:
        return

    print("=" * 140)
    print("BEST PERFORMERS")
    print("=" * 140)
    print()

    # Group by model and calculate averages
    models = {}
    for result in results:
        model_name = result.get('model_name', 'unknown')
        if model_name not in models:
            models[model_name] = {
                'runs': 0,
                'total_hallu': 0,
                'total_detections': 0,
                'avg_time': [],
                'empty_capable': []
            }
        models[model_name]['runs'] += 1
        models[model_name]['total_hallu'] += result.get('total_hallucinated', 0)
        models[model_name]['total_detections'] += result.get('total_valid_jerseys', 0) + result.get('total_hallucinated', 0)
        models[model_name]['avg_time'].append(result.get('avg_processing_time', 0))
        models[model_name]['empty_capable'].append(result.get('empty_response_capable', False))

    # Calculate scores
    model_scores = []
    for model_name, stats in models.items():
        hallu_rate = (stats['total_hallu'] / stats['total_detections'] * 100) if stats['total_detections'] > 0 else 0
        avg_time = sum(stats['avg_time']) / len(stats['avg_time']) if stats['avg_time'] else 0
        empty_capable = any(stats['empty_capable'])

        model_scores.append({
            'model': model_name,
            'hallu_rate': hallu_rate,
            'avg_time': avg_time,
            'empty_capable': empty_capable,
            'runs': stats['runs']
        })

    # Sort by hallucination rate (lower is better)
    model_scores.sort(key=lambda x: x['hallu_rate'])
    print("Lowest hallucination rate:")
    for i, score in enumerate(model_scores[:3], 1):
        capable = "✓" if score['empty_capable'] else "✗"
        print(f"  {i}. {score['model']}: {score['hallu_rate']:.1f}% (empty capable: {capable}, avg time: {score['avg_time']:.2f}s)")
    print()

    # Sort by speed (lower is better)
    model_scores.sort(key=lambda x: x['avg_time'])
    print("Fastest processing:")
    for i, score in enumerate(model_scores[:3], 1):
        capable = "✓" if score['empty_capable'] else "✗"
        print(f"  {i}. {score['model']}: {score['avg_time']:.2f}s/image (hallu rate: {score['hallu_rate']:.1f}%, empty capable: {capable})")
    print()

    # Models with empty response capability
    empty_models = [s for s in model_scores if s['empty_capable']]
    print(f"Models with empty response capability: {len(empty_models)}/{len(model_scores)}")
    for score in empty_models:
        print(f"  - {score['model']}")
    print()

    # Best F1 scores (ground truth accuracy)
    models_with_gt = {}
    for result in results:
        if result.get('ground_truth'):
            model_name = result.get('model_name', 'unknown')
            if model_name not in models_with_gt:
                models_with_gt[model_name] = {
                    'f1_scores': [],
                    'precision_scores': [],
                    'recall_scores': []
                }
            gt = result['ground_truth']
            models_with_gt[model_name]['f1_scores'].append(gt.get('overall_f1', 0))
            models_with_gt[model_name]['precision_scores'].append(gt.get('overall_precision', 0))
            models_with_gt[model_name]['recall_scores'].append(gt.get('overall_recall', 0))

    if models_with_gt:
        gt_scores = []
        for model_name, stats in models_with_gt.items():
            avg_f1 = sum(stats['f1_scores']) / len(stats['f1_scores']) if stats['f1_scores'] else 0
            avg_precision = sum(stats['precision_scores']) / len(stats['precision_scores']) if stats['precision_scores'] else 0
            avg_recall = sum(stats['recall_scores']) / len(stats['recall_scores']) if stats['recall_scores'] else 0

            gt_scores.append({
                'model': model_name,
                'avg_f1': avg_f1,
                'avg_precision': avg_precision,
                'avg_recall': avg_recall
            })

        # Sort by F1 score (higher is better)
        gt_scores.sort(key=lambda x: x['avg_f1'], reverse=True)
        print("Highest ground truth F1 scores:")
        for i, score in enumerate(gt_scores[:3], 1):
            print(f"  {i}. {score['model']}: F1={score['avg_f1']:.1%} (Precision={score['avg_precision']:.1%}, Recall={score['avg_recall']:.1%})")
        print()


def export_to_csv(results: List[Dict[str, Any]], csv_file: str):
    """Export results to a CSV file for spreadsheet import."""
    if not results:
        print("No results to export.")
        return

    try:
        with open(csv_file, 'w', newline='') as f:
            # Define CSV columns
            fieldnames = [
                'timestamp', 'model_name', 'model_tag', 'prompt_file', 'prompt_length',
                'total_images', 'images_with_jerseys', 'images_without_jerseys', 'images_with_errors',
                'total_raw_detections', 'total_valid_jerseys', 'total_hallucinated',
                'hallucination_rate_pct', 'empty_response_rate_pct',
                'avg_processing_time', 'total_processing_time',
                'resize_enabled', 'resize_max', 'images_resized',
                'has_confidence', 'confidence_avg', 'confidence_min', 'confidence_max', 'confidence_count',
                'confidence_stdev', 'confidence_quality',
                'conf_90_100', 'conf_70_89', 'conf_50_69', 'conf_30_49', 'conf_0_29',
                # Ground truth columns
                'gt_total_expected', 'gt_total_true_positives', 'gt_total_false_positives', 'gt_total_false_negatives',
                'gt_overall_precision', 'gt_overall_recall', 'gt_overall_f1',
                'gt_avg_precision', 'gt_avg_recall', 'gt_avg_f1',
                # Confidence calibration
                'gt_avg_confidence_correct', 'gt_avg_confidence_incorrect',
                'gt_confidence_correct_count', 'gt_confidence_incorrect_count'
            ]

            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()

            # Write data rows
            for result in results:
                # Calculate derived values
                total_images = result.get('total_images', 0)
                valid_jerseys = result.get('total_valid_jerseys', 0)
                hallucinated = result.get('total_hallucinated', 0)
                total_detections = valid_jerseys + hallucinated
                hallu_rate = (hallucinated / total_detections * 100) if total_detections > 0 else 0
                empty_rate = (result.get('images_without_jerseys', 0) / total_images * 100) if total_images > 0 else 0

                # Extract confidence stats
                conf_stats = result.get('confidence_stats')
                has_confidence = conf_stats is not None
                conf_avg = conf_stats.get('avg', '') if conf_stats else ''
                conf_min = conf_stats.get('min', '') if conf_stats else ''
                conf_max = conf_stats.get('max', '') if conf_stats else ''
                conf_count = conf_stats.get('count', '') if conf_stats else ''

                # Calculate confidence standard deviation and quality
                conf_stdev, conf_quality = calculate_confidence_stdev(conf_stats)

                # Extract confidence distribution
                conf_dist = conf_stats.get('distribution', {}) if conf_stats else {}
                conf_90_100 = conf_dist.get('90-100', '')
                conf_70_89 = conf_dist.get('70-89', '')
                conf_50_69 = conf_dist.get('50-69', '')
                conf_30_49 = conf_dist.get('30-49', '')
                conf_0_29 = conf_dist.get('0-29', '')

                # Extract ground truth stats
                gt = result.get('ground_truth', {})
                gt_total_expected = gt.get('total_expected', '')
                gt_total_tp = gt.get('total_true_positives', '')
                gt_total_fp = gt.get('total_false_positives', '')
                gt_total_fn = gt.get('total_false_negatives', '')
                gt_overall_precision = gt.get('overall_precision', '')
                gt_overall_recall = gt.get('overall_recall', '')
                gt_overall_f1 = gt.get('overall_f1', '')
                gt_avg_precision = gt.get('avg_precision', '')
                gt_avg_recall = gt.get('avg_recall', '')
                gt_avg_f1 = gt.get('avg_f1', '')
                gt_avg_conf_correct = gt.get('avg_confidence_correct', '')
                gt_avg_conf_incorrect = gt.get('avg_confidence_incorrect', '')
                gt_conf_correct_count = gt.get('confidence_correct_count', '')
                gt_conf_incorrect_count = gt.get('confidence_incorrect_count', '')

                row = {
                    'timestamp': result.get('timestamp', ''),
                    'model_name': result.get('model_name', ''),
                    'model_tag': result.get('model_tag', ''),
                    'prompt_file': result.get('prompt_file', ''),
                    'prompt_length': result.get('prompt_length', ''),
                    'total_images': total_images,
                    'images_with_jerseys': result.get('images_with_jerseys', ''),
                    'images_without_jerseys': result.get('images_without_jerseys', ''),
                    'images_with_errors': result.get('images_with_errors', ''),
                    'total_raw_detections': result.get('total_raw_detections', ''),
                    'total_valid_jerseys': valid_jerseys,
                    'total_hallucinated': hallucinated,
                    'hallucination_rate_pct': f"{hallu_rate:.2f}",
                    'empty_response_rate_pct': f"{empty_rate:.2f}",
                    'avg_processing_time': f"{result.get('avg_processing_time', 0):.4f}",
                    'total_processing_time': f"{result.get('total_processing_time', 0):.2f}",
                    'resize_enabled': result.get('resize_enabled', False),
                    'resize_max': result.get('resize_max', ''),
                    'images_resized': result.get('images_resized', ''),
                    'has_confidence': has_confidence,
                    'confidence_avg': f"{conf_avg:.2f}" if conf_avg != '' else '',
                    'confidence_min': conf_min,
                    'confidence_max': conf_max,
                    'confidence_count': conf_count,
                    'confidence_stdev': f"{conf_stdev:.2f}" if conf_stdev is not None else '',
                    'confidence_quality': conf_quality if conf_quality != 'N/A' else '',
                    'conf_90_100': conf_90_100,
                    'conf_70_89': conf_70_89,
                    'conf_50_69': conf_50_69,
                    'conf_30_49': conf_30_49,
                    'conf_0_29': conf_0_29,
                    # Ground truth data
                    'gt_total_expected': gt_total_expected,
                    'gt_total_true_positives': gt_total_tp,
                    'gt_total_false_positives': gt_total_fp,
                    'gt_total_false_negatives': gt_total_fn,
                    'gt_overall_precision': f"{gt_overall_precision:.4f}" if gt_overall_precision != '' else '',
                    'gt_overall_recall': f"{gt_overall_recall:.4f}" if gt_overall_recall != '' else '',
                    'gt_overall_f1': f"{gt_overall_f1:.4f}" if gt_overall_f1 != '' else '',
                    'gt_avg_precision': f"{gt_avg_precision:.4f}" if gt_avg_precision != '' else '',
                    'gt_avg_recall': f"{gt_avg_recall:.4f}" if gt_avg_recall != '' else '',
                    'gt_avg_f1': f"{gt_avg_f1:.4f}" if gt_avg_f1 != '' else '',
                    'gt_avg_confidence_correct': f"{gt_avg_conf_correct:.2f}" if gt_avg_conf_correct != '' else '',
                    'gt_avg_confidence_incorrect': f"{gt_avg_conf_incorrect:.2f}" if gt_avg_conf_incorrect != '' else '',
                    'gt_confidence_correct_count': gt_conf_correct_count,
                    'gt_confidence_incorrect_count': gt_conf_incorrect_count
                }
                writer.writerow(row)

        print(f"✓ Results exported to CSV: {csv_file}")
        print(f"  Rows: {len(results)}")
        print(f"  Columns: {len(fieldnames)}")

    except Exception as e:
        print(f"❌ Failed to export to CSV: {e}")
        sys.exit(1)


def main():
    """Main entry point for the analysis script."""
    parser = argparse.ArgumentParser(
        description='Analyze jersey detection test results',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Show analysis
  python analyze_jersey_results.py

  # Show analysis and export to CSV
  python analyze_jersey_results.py --csv results.csv

  # Export to CSV only (no analysis display)
  python analyze_jersey_results.py --csv-only results.csv

  # Analyze custom results file
  python analyze_jersey_results.py custom_results.jsonl --csv custom.csv
"""
    )
    parser.add_argument('results_file', nargs='?', default='jersey_detection_results.jsonl',
                        help='Path to results file (default: jersey_detection_results.jsonl)')
    parser.add_argument('--csv', metavar='FILE', dest='csv_file',
                        help='Export results to CSV file (in addition to showing analysis)')
    parser.add_argument('--csv-only', metavar='FILE', dest='csv_only',
                        help='Export to CSV file only, skip analysis display')

    args = parser.parse_args()

    # Check if file exists
    if not Path(args.results_file).exists():
        print(f"Error: Results file not found: {args.results_file}")
        print("Run some tests first with test_jersey_detection.py to generate results.")
        sys.exit(1)

    # Load results
    results = load_results(args.results_file)

    if not results:
        print(f"No results found in {args.results_file}")
        sys.exit(0)

    print(f"Loaded {len(results)} test run(s) from {args.results_file}\n")

    # Handle CSV-only mode
    if args.csv_only:
        export_to_csv(results, args.csv_only)
        return

    # Print analyses (unless CSV-only mode)
    print_ascii_comparison_table(results)
    print_model_performance_chart(results)
    print_best_performers(results)

    # Export to CSV if requested
    if args.csv_file:
        print()
        export_to_csv(results, args.csv_file)


if __name__ == '__main__':
    main()
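
# ---------------------------------------------------------------------------
# Illustrative input record (a hedged sketch, not the authoritative schema):
# the JSONL file is produced by test_jersey_detection.py, which is not shown
# here, so the values below are hypothetical. The keys, however, are the ones
# this script actually reads. One such JSON object is expected per line of
# jersey_detection_results.jsonl, e.g.:
#
#   {"timestamp": "2024-01-15T10:30:00",
#    "model_name": "example-vision-model", "model_tag": "latest",
#    "prompt_file": "prompts/jersey_prompt.txt", "prompt_length": 512,
#    "total_images": 100, "images_with_jerseys": 80,
#    "images_without_jerseys": 20, "images_with_errors": 0,
#    "total_raw_detections": 155, "total_valid_jerseys": 150,
#    "total_hallucinated": 5, "avg_processing_time": 2.5,
#    "total_processing_time": 250.0, "resize_enabled": true,
#    "resize_max": 1024, "images_resized": 100,
#    "empty_response_capable": true,
#    "confidence_stats": {"avg": 82.0, "min": 40, "max": 99, "count": 155,
#                         "distribution": {"90-100": 60, "70-89": 70,
#                                          "50-69": 20, "30-49": 5, "0-29": 0}},
#    "ground_truth": {"total_expected": 170, "total_true_positives": 145,
#                     "total_false_positives": 10, "total_false_negatives": 25,
#                     "overall_precision": 0.94, "overall_recall": 0.85,
#                     "overall_f1": 0.89, "avg_precision": 0.93,
#                     "avg_recall": 0.84, "avg_f1": 0.88,
#                     "avg_confidence_correct": 85.0,
#                     "avg_confidence_incorrect": 60.0,
#                     "confidence_correct_count": 145,
#                     "confidence_incorrect_count": 10}}
# ---------------------------------------------------------------------------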