Initial commit: Jersey detection test suite
Test scripts and utilities for evaluating vision-language models on jersey number detection using llama.cpp server.
663
analyze_jersey_results.py
Executable file
@ -0,0 +1,663 @@
#!/usr/bin/env python3
"""
Analyze jersey detection test results and compare model performance.

Usage:
    python analyze_jersey_results.py [results_file]
    python analyze_jersey_results.py [results_file] --csv output.csv
    python analyze_jersey_results.py [results_file] --csv-only output.csv

Arguments:
    results_file: Path to the results file (default: jersey_detection_results.jsonl)
    --csv: Also export results to CSV file
    --csv-only: Export to CSV only, skip analysis display
"""

import argparse
import csv
import json
import sys
from pathlib import Path
from typing import List, Dict, Any
from datetime import datetime

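
# Illustrative sketch of one record in the JSONL results file (one JSON object per
# line). The real records are written by the test harness (test_jersey_detection.py);
# the keys below are simply the ones this script reads, and the values are made up:
#
#   {"timestamp": "2024-01-01T12:00:00", "model_name": "example-vlm",
#    "prompt_file": "prompts/basic.txt", "total_images": 100,
#    "images_with_jerseys": 60, "images_without_jerseys": 40,
#    "total_valid_jerseys": 85, "total_hallucinated": 5,
#    "avg_processing_time": 2.5, "resize_enabled": true, "resize_max": 1024,
#    "confidence_stats": {"avg": 82.0, "min": 40, "max": 99, "count": 90,
#                         "distribution": {"90-100": 30, "70-89": 40, "50-69": 15,
#                                          "30-49": 5, "0-29": 0}},
#    "ground_truth": {"overall_precision": 0.9, "overall_recall": 0.85,
#                     "overall_f1": 0.87, "avg_confidence_correct": 88.0,
#                     "avg_confidence_incorrect": 65.0}}
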
def load_results(results_file: str) -> List[Dict[str, Any]]:
    """Load test results from a JSON Lines file."""
    results = []
    try:
        with open(results_file, 'r') as f:
            for line in f:
                line = line.strip()
                if line:
                    results.append(json.loads(line))
        return results
    except FileNotFoundError:
        print(f"Error: Results file not found: {results_file}")
        sys.exit(1)
    except json.JSONDecodeError as e:
        print(f"Error: Invalid JSON in results file: {e}")
        sys.exit(1)

def calculate_confidence_stdev(conf_stats: Dict[str, Any]) -> tuple:
    """
    Calculate standard deviation of confidence scores from distribution.

    Returns:
        Tuple of (stdev, quality_rating)
        quality_rating: "Excellent", "Good", "Fair", "Poor", or "N/A"
    """
    if not conf_stats or 'distribution' not in conf_stats:
        return None, "N/A"

    dist = conf_stats['distribution']

    # Reconstruct approximate confidence values from buckets
    # Use midpoint of each bucket
    values = []
    bucket_midpoints = {
        '90-100': 95,
        '70-89': 79.5,
        '50-69': 59.5,
        '30-49': 39.5,
        '0-29': 14.5
    }

    for bucket, count in dist.items():
        midpoint = bucket_midpoints.get(bucket, 50)
        values.extend([midpoint] * count)

    if len(values) < 2:
        return None, "N/A"

    # Calculate standard deviation
    import math
    mean = sum(values) / len(values)
    variance = sum((x - mean) ** 2 for x in values) / len(values)
    stdev = math.sqrt(variance)

    # Assign quality rating based on StDev
    if stdev < 5:
        quality = "Poor"
    elif stdev < 10:
        quality = "Fair"
    elif stdev < 15:
        quality = "Good"
    else:
        quality = "Excel"  # Shortened for table

    return stdev, quality

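
# Worked example (illustrative, made-up counts): for a distribution of
# {'90-100': 2, '0-29': 2} the reconstructed values are [95, 95, 14.5, 14.5],
# the mean is 54.75, and the population standard deviation is 40.25, which maps
# to the "Excel" rating. A distribution concentrated in a single bucket, e.g.
# {'90-100': 4}, yields a stdev of 0 and is rated "Poor", because the model never
# varies its reported confidence.
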
def print_ascii_comparison_table(results: List[Dict[str, Any]]):
    """Print a detailed ASCII comparison table of all test runs."""
    if not results:
        print("No results to display.")
        return

    print("=" * 280)
    print("DETAILED MODEL COMPARISON TABLE")
    print("=" * 280)
    print()
    print("Confidence Quality: Excellent (>=15), Good (10-15), Fair (5-10), Poor (<5)")
    print("Confidence Calibration: Conf✓ = avg confidence on correct detections, Conf✗ = avg confidence on incorrect detections")
    print()

    # Table headers with ground truth and confidence calibration columns
    print("┌" + "─" * 22 + "┬" + "─" * 10 + "┬" + "─" * 8 + "┬" + "─" * 8 + "┬" + "─" * 8 + "┬" + "─" * 10 + "┬" + "─" * 10 + "┬" + "─" * 10 + "┬" + "─" * 8 + "┬" + "─" * 8 + "┬" + "─" * 12 + "┬" + "─" * 10 + "┬" + "─" * 10 + "┬" + "─" * 10 + "┬" + "─" * 10 + "┬" + "─" * 10 + "┬" + "─" * 21 + "┐")
    print("│ {:<20} │ {:^8} │ {:^6} │ {:^6} │ {:^6} │ {:^8} │ {:^8} │ {:^8} │ {:^6} │ {:^6} │ {:^10} │ {:^8} │ {:^8} │ {:^8} │ {:^8} │ {:^8} │ {:^19} │".format(
        "Model", "Prompt", "Images", "Valid", "Hallu", "Empty%", "Hallu%", "AvgTime", "Resize", "Conf?", "Conf Qual", "Prec%", "Recall%", "F1%", "Conf✓", "Conf✗", "Date"
    ))
    print("├" + "─" * 22 + "┼" + "─" * 10 + "┼" + "─" * 8 + "┼" + "─" * 8 + "┼" + "─" * 8 + "┼" + "─" * 10 + "┼" + "─" * 10 + "┼" + "─" * 10 + "┼" + "─" * 8 + "┼" + "─" * 8 + "┼" + "─" * 12 + "┼" + "─" * 10 + "┼" + "─" * 10 + "┼" + "─" * 10 + "┼" + "─" * 10 + "┼" + "─" * 10 + "┼" + "─" * 21 + "┤")

    # Data rows
    for i, result in enumerate(results):
        model = result.get('model_name', 'unknown')[:20]
        prompt = Path(result.get('prompt_file', 'unknown')).stem[:8]
        total_images = result.get('total_images', 0)
        valid_jerseys = result.get('total_valid_jerseys', 0)
        hallucinated = result.get('total_hallucinated', 0)
        total_detections = valid_jerseys + hallucinated
        empty_pct = (result.get('images_without_jerseys', 0) / total_images * 100) if total_images > 0 else 0
        hallu_pct = (hallucinated / total_detections * 100) if total_detections > 0 else 0
        avg_time = result.get('avg_processing_time', 0)

        # Calculate confidence quality
        conf_stats = result.get('confidence_stats')
        has_conf = 'Yes' if conf_stats else 'No'
        stdev, quality = calculate_confidence_stdev(conf_stats)

        # Format confidence quality display
        if stdev is not None:
            conf_qual_str = f"{quality} ({stdev:.1f})"
        else:
            conf_qual_str = "N/A"

        # Ground truth metrics
        gt = result.get('ground_truth', {})
        precision = gt.get('overall_precision', 0) * 100
        recall = gt.get('overall_recall', 0) * 100
        f1 = gt.get('overall_f1', 0) * 100

        # Confidence calibration
        conf_correct = gt.get('avg_confidence_correct')
        conf_incorrect = gt.get('avg_confidence_incorrect')
        conf_correct_str = f"{conf_correct:.1f}" if conf_correct is not None else "N/A"
        conf_incorrect_str = f"{conf_incorrect:.1f}" if conf_incorrect is not None else "N/A"

        resize_max = result.get('resize_max')
        resize_str = f"{resize_max}px" if resize_max else "No"
        timestamp = datetime.fromisoformat(result.get('timestamp', '')).strftime('%Y-%m-%d %H:%M')

        print("│ {:<20} │ {:>8} │ {:>6} │ {:>6} │ {:>6} │ {:>7.1f}% │ {:>7.1f}% │ {:>7.2f}s │ {:>6} │ {:>6} │ {:>10} │ {:>7.1f}% │ {:>7.1f}% │ {:>7.1f}% │ {:>8} │ {:>8} │ {:>19} │".format(
            model, prompt, total_images, valid_jerseys, hallucinated, empty_pct, hallu_pct, avg_time, resize_str, has_conf, conf_qual_str, precision, recall, f1, conf_correct_str, conf_incorrect_str, timestamp
        ))

    # Bottom border
    print("└" + "─" * 22 + "┴" + "─" * 10 + "┴" + "─" * 8 + "┴" + "─" * 8 + "┴" + "─" * 8 + "┴" + "─" * 10 + "┴" + "─" * 10 + "┴" + "─" * 10 + "┴" + "─" * 8 + "┴" + "─" * 8 + "┴" + "─" * 12 + "┴" + "─" * 10 + "┴" + "─" * 10 + "┴" + "─" * 10 + "┴" + "─" * 10 + "┴" + "─" * 10 + "┴" + "─" * 21 + "┘")

    print()

def print_comparison_table(results: List[Dict[str, Any]]):
    """Print a simple comparison table of all test runs."""
    if not results:
        print("No results to display.")
        return

    print("=" * 140)
    print("MODEL COMPARISON TABLE")
    print("=" * 140)
    print()

    # Header
    header = f"{'Model':<25} {'Prompt':<30} {'Images':<8} {'Valid':<8} {'Hallu':<8} {'Empty%':<9} {'AvgTime':<9} {'Resize':<8} {'Conf?':<7} {'Date':<20}"
    print(header)
    print("-" * 150)

    # Data rows
    for result in results:
        model = result.get('model_name', 'unknown')[:24]
        prompt = Path(result.get('prompt_file', 'unknown')).stem[:29]
        total_images = result.get('total_images', 0)
        valid_jerseys = result.get('total_valid_jerseys', 0)
        hallucinated = result.get('total_hallucinated', 0)
        empty_pct = (result.get('images_without_jerseys', 0) / total_images * 100) if total_images > 0 else 0
        avg_time = result.get('avg_processing_time', 0)
        has_conf = 'Yes' if result.get('confidence_stats') else 'No'
        resize_max = result.get('resize_max')
        resize_str = f"{resize_max}px" if resize_max else "No"
        timestamp = datetime.fromisoformat(result.get('timestamp', '')).strftime('%Y-%m-%d %H:%M:%S')

        row = f"{model:<25} {prompt:<30} {total_images:<8} {valid_jerseys:<8} {hallucinated:<8} {empty_pct:<8.1f}% {avg_time:<8.2f}s {resize_str:<8} {has_conf:<7} {timestamp:<20}"
        print(row)

    print()

def print_model_performance_chart(results: List[Dict[str, Any]]):
    """Print a performance chart showing key metrics for each model."""
    if not results:
        return

    print("=" * 140)
    print("MODEL PERFORMANCE CHART")
    print("=" * 140)
    print()

    # Group results by model
    models = {}
    for result in results:
        model_name = result.get('model_name', 'unknown')
        if model_name not in models:
            models[model_name] = []
        models[model_name].append(result)

    # Calculate aggregate statistics for each model
    for model_name, model_results in models.items():
        print(f"\n{model_name}")
        print("-" * 100)

        total_runs = len(model_results)
        total_images = sum(r.get('total_images', 0) for r in model_results)
        total_valid = sum(r.get('total_valid_jerseys', 0) for r in model_results)
        total_hallu = sum(r.get('total_hallucinated', 0) for r in model_results)
        avg_empty_pct = sum((r.get('images_without_jerseys', 0) / r.get('total_images', 1) * 100) for r in model_results) / total_runs if total_runs > 0 else 0
        avg_time = sum(r.get('avg_processing_time', 0) for r in model_results) / total_runs if total_runs > 0 else 0

        # Check if any runs have confidence stats
        has_confidence = any(r.get('confidence_stats') for r in model_results)

        # Check resize status
        resize_enabled = any(r.get('resize_enabled', False) for r in model_results)
        resize_max_values = [r.get('resize_max') for r in model_results if r.get('resize_max')]
        resize_info = f"{resize_max_values[0]}px" if resize_max_values else "Disabled"

        print(f" Total test runs: {total_runs}")
        print(f" Total images processed: {total_images}")
        print(f" Total valid detections: {total_valid}")
        print(f" Total hallucinations: {total_hallu}")
        print(f" Average empty response rate: {avg_empty_pct:.1f}%")
        print(f" Average processing time: {avg_time:.2f}s/image")
        print(f" Resize: {resize_info}")
        print(f" Confidence support: {'Yes' if has_confidence else 'No'}")

        # Show hallucination rate
        if total_valid + total_hallu > 0:
            hallu_rate = (total_hallu / (total_valid + total_hallu) * 100)
            print(f" Hallucination rate: {hallu_rate:.1f}%")

            # Visual bar
            bar_length = int(hallu_rate / 2)  # Scale to max 50 chars
            bar = '█' * bar_length
            print(f" Hallucination chart: {bar} ({hallu_rate:.1f}%)")

        # Ground truth performance
        gt_runs = [r for r in model_results if r.get('ground_truth')]
        if gt_runs:
            avg_precision = sum(r['ground_truth'].get('overall_precision', 0) for r in gt_runs) / len(gt_runs)
            avg_recall = sum(r['ground_truth'].get('overall_recall', 0) for r in gt_runs) / len(gt_runs)
            avg_f1 = sum(r['ground_truth'].get('overall_f1', 0) for r in gt_runs) / len(gt_runs)
            total_expected = sum(r['ground_truth'].get('total_expected', 0) for r in gt_runs)
            total_tp = sum(r['ground_truth'].get('total_true_positives', 0) for r in gt_runs)
            total_fp = sum(r['ground_truth'].get('total_false_positives', 0) for r in gt_runs)
            total_fn = sum(r['ground_truth'].get('total_false_negatives', 0) for r in gt_runs)

            print(f"\n Ground truth performance:")
            print(f" Total expected jerseys: {total_expected}")
            print(f" True positives: {total_tp}")
            print(f" False positives: {total_fp}")
            print(f" False negatives: {total_fn}")
            print(f" Average Precision: {avg_precision:.1%}")
            print(f" Average Recall: {avg_recall:.1%}")
            print(f" Average F1 Score: {avg_f1:.1%}")

            # Visual F1 bar
            bar_length = int(avg_f1 * 50)  # Scale to max 50 chars
            bar = '█' * bar_length
            print(f" F1 Score chart: {bar} ({avg_f1:.1%})")

            # Confidence calibration
            conf_correct_vals = [r['ground_truth'].get('avg_confidence_correct') for r in gt_runs if r['ground_truth'].get('avg_confidence_correct') is not None]
            conf_incorrect_vals = [r['ground_truth'].get('avg_confidence_incorrect') for r in gt_runs if r['ground_truth'].get('avg_confidence_incorrect') is not None]

            if conf_correct_vals or conf_incorrect_vals:
                print(f"\n Confidence calibration:")
                if conf_correct_vals:
                    avg_conf_correct = sum(conf_correct_vals) / len(conf_correct_vals)
                    print(f" Avg confidence (correct detections): {avg_conf_correct:.2f}")
                if conf_incorrect_vals:
                    avg_conf_incorrect = sum(conf_incorrect_vals) / len(conf_incorrect_vals)
                    print(f" Avg confidence (incorrect detections): {avg_conf_incorrect:.2f}")
                if conf_correct_vals and conf_incorrect_vals:
                    diff = sum(conf_correct_vals) / len(conf_correct_vals) - sum(conf_incorrect_vals) / len(conf_incorrect_vals)
                    if diff > 0:
                        print(f" Confidence difference: +{diff:.2f} (good calibration)")
                    else:
                        print(f" Confidence difference: {diff:.2f} (⚠ poor calibration)")

        # Confidence distribution if available
        if has_confidence:
            print(f"\n Confidence distribution (across all runs):")
            all_dist = {'90-100': 0, '70-89': 0, '50-69': 0, '30-49': 0, '0-29': 0}
            total_conf_count = 0

            for result in model_results:
                conf_stats = result.get('confidence_stats')
                if conf_stats and 'distribution' in conf_stats:
                    for bucket, count in conf_stats['distribution'].items():
                        all_dist[bucket] += count
                        total_conf_count += count

            if total_conf_count > 0:
                for bucket, count in all_dist.items():
                    pct = (count / total_conf_count * 100) if total_conf_count > 0 else 0
                    bar_length = int(pct / 2)
                    bar = '█' * bar_length
                    print(f" {bucket}: {count:4d} ({pct:5.1f}%) {bar}")

    print()

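
# Note (for reference only; not computed here): the precision/recall/F1 values read
# above come precomputed from the test harness and are assumed to follow the standard
# definitions: precision = TP / (TP + FP), recall = TP / (TP + FN), and
# F1 = 2 * precision * recall / (precision + recall).
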
def print_best_performers(results: List[Dict[str, Any]]):
    """Print summary of best performing models."""
    if not results:
        return

    print("=" * 140)
    print("BEST PERFORMERS")
    print("=" * 140)
    print()

    # Group by model and calculate averages
    models = {}
    for result in results:
        model_name = result.get('model_name', 'unknown')
        if model_name not in models:
            models[model_name] = {
                'runs': 0,
                'total_hallu': 0,
                'total_detections': 0,
                'avg_time': [],
                'empty_capable': []
            }

        models[model_name]['runs'] += 1
        models[model_name]['total_hallu'] += result.get('total_hallucinated', 0)
        models[model_name]['total_detections'] += result.get('total_valid_jerseys', 0) + result.get('total_hallucinated', 0)
        models[model_name]['avg_time'].append(result.get('avg_processing_time', 0))
        models[model_name]['empty_capable'].append(result.get('empty_response_capable', False))

    # Calculate scores
    model_scores = []
    for model_name, stats in models.items():
        hallu_rate = (stats['total_hallu'] / stats['total_detections'] * 100) if stats['total_detections'] > 0 else 0
        avg_time = sum(stats['avg_time']) / len(stats['avg_time']) if stats['avg_time'] else 0
        empty_capable = any(stats['empty_capable'])

        model_scores.append({
            'model': model_name,
            'hallu_rate': hallu_rate,
            'avg_time': avg_time,
            'empty_capable': empty_capable,
            'runs': stats['runs']
        })

    # Sort by hallucination rate (lower is better)
    model_scores.sort(key=lambda x: x['hallu_rate'])

    print("Lowest hallucination rate:")
    for i, score in enumerate(model_scores[:3], 1):
        capable = "✓" if score['empty_capable'] else "✗"
        print(f" {i}. {score['model']}: {score['hallu_rate']:.1f}% (empty capable: {capable}, avg time: {score['avg_time']:.2f}s)")

    print()

    # Sort by speed (lower is better)
    model_scores.sort(key=lambda x: x['avg_time'])

    print("Fastest processing:")
    for i, score in enumerate(model_scores[:3], 1):
        capable = "✓" if score['empty_capable'] else "✗"
        print(f" {i}. {score['model']}: {score['avg_time']:.2f}s/image (hallu rate: {score['hallu_rate']:.1f}%, empty capable: {capable})")

    print()

    # Models with empty response capability
    empty_models = [s for s in model_scores if s['empty_capable']]
    print(f"Models with empty response capability: {len(empty_models)}/{len(model_scores)}")
    for score in empty_models:
        print(f" - {score['model']}")

    print()

    # Best F1 scores (ground truth accuracy)
    models_with_gt = {}
    for result in results:
        if result.get('ground_truth'):
            model_name = result.get('model_name', 'unknown')
            if model_name not in models_with_gt:
                models_with_gt[model_name] = {
                    'f1_scores': [],
                    'precision_scores': [],
                    'recall_scores': []
                }
            gt = result['ground_truth']
            models_with_gt[model_name]['f1_scores'].append(gt.get('overall_f1', 0))
            models_with_gt[model_name]['precision_scores'].append(gt.get('overall_precision', 0))
            models_with_gt[model_name]['recall_scores'].append(gt.get('overall_recall', 0))

    if models_with_gt:
        gt_scores = []
        for model_name, stats in models_with_gt.items():
            avg_f1 = sum(stats['f1_scores']) / len(stats['f1_scores']) if stats['f1_scores'] else 0
            avg_precision = sum(stats['precision_scores']) / len(stats['precision_scores']) if stats['precision_scores'] else 0
            avg_recall = sum(stats['recall_scores']) / len(stats['recall_scores']) if stats['recall_scores'] else 0
            gt_scores.append({
                'model': model_name,
                'avg_f1': avg_f1,
                'avg_precision': avg_precision,
                'avg_recall': avg_recall
            })

        # Sort by F1 score (higher is better)
        gt_scores.sort(key=lambda x: x['avg_f1'], reverse=True)

        print("Highest ground truth F1 scores:")
        for i, score in enumerate(gt_scores[:3], 1):
            print(f" {i}. {score['model']}: F1={score['avg_f1']:.1%} (Precision={score['avg_precision']:.1%}, Recall={score['avg_recall']:.1%})")

        print()

def export_to_csv(results: List[Dict[str, Any]], csv_file: str):
    """Export results to CSV file for spreadsheet import."""
    if not results:
        print("No results to export.")
        return

    try:
        with open(csv_file, 'w', newline='') as f:
            # Define CSV columns
            fieldnames = [
                'timestamp',
                'model_name',
                'model_tag',
                'prompt_file',
                'prompt_length',
                'total_images',
                'images_with_jerseys',
                'images_without_jerseys',
                'images_with_errors',
                'total_raw_detections',
                'total_valid_jerseys',
                'total_hallucinated',
                'hallucination_rate_pct',
                'empty_response_rate_pct',
                'avg_processing_time',
                'total_processing_time',
                'resize_enabled',
                'resize_max',
                'images_resized',
                'has_confidence',
                'confidence_avg',
                'confidence_min',
                'confidence_max',
                'confidence_count',
                'confidence_stdev',
                'confidence_quality',
                'conf_90_100',
                'conf_70_89',
                'conf_50_69',
                'conf_30_49',
                'conf_0_29',
                # Ground truth columns
                'gt_total_expected',
                'gt_total_true_positives',
                'gt_total_false_positives',
                'gt_total_false_negatives',
                'gt_overall_precision',
                'gt_overall_recall',
                'gt_overall_f1',
                'gt_avg_precision',
                'gt_avg_recall',
                'gt_avg_f1',
                # Confidence calibration
                'gt_avg_confidence_correct',
                'gt_avg_confidence_incorrect',
                'gt_confidence_correct_count',
                'gt_confidence_incorrect_count'
            ]

            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()

            # Write data rows
            for result in results:
                # Calculate derived values
                total_images = result.get('total_images', 0)
                valid_jerseys = result.get('total_valid_jerseys', 0)
                hallucinated = result.get('total_hallucinated', 0)
                total_detections = valid_jerseys + hallucinated
                hallu_rate = (hallucinated / total_detections * 100) if total_detections > 0 else 0
                empty_rate = (result.get('images_without_jerseys', 0) / total_images * 100) if total_images > 0 else 0

                # Extract confidence stats
                conf_stats = result.get('confidence_stats')
                has_confidence = conf_stats is not None
                conf_avg = conf_stats.get('avg', '') if conf_stats else ''
                conf_min = conf_stats.get('min', '') if conf_stats else ''
                conf_max = conf_stats.get('max', '') if conf_stats else ''
                conf_count = conf_stats.get('count', '') if conf_stats else ''

                # Calculate confidence standard deviation and quality
                conf_stdev, conf_quality = calculate_confidence_stdev(conf_stats)

                # Extract confidence distribution
                conf_dist = conf_stats.get('distribution', {}) if conf_stats else {}
                conf_90_100 = conf_dist.get('90-100', '')
                conf_70_89 = conf_dist.get('70-89', '')
                conf_50_69 = conf_dist.get('50-69', '')
                conf_30_49 = conf_dist.get('30-49', '')
                conf_0_29 = conf_dist.get('0-29', '')

                # Extract ground truth stats
                gt = result.get('ground_truth', {})
                gt_total_expected = gt.get('total_expected', '')
                gt_total_tp = gt.get('total_true_positives', '')
                gt_total_fp = gt.get('total_false_positives', '')
                gt_total_fn = gt.get('total_false_negatives', '')
                gt_overall_precision = gt.get('overall_precision', '')
                gt_overall_recall = gt.get('overall_recall', '')
                gt_overall_f1 = gt.get('overall_f1', '')
                gt_avg_precision = gt.get('avg_precision', '')
                gt_avg_recall = gt.get('avg_recall', '')
                gt_avg_f1 = gt.get('avg_f1', '')
                gt_avg_conf_correct = gt.get('avg_confidence_correct', '')
                gt_avg_conf_incorrect = gt.get('avg_confidence_incorrect', '')
                gt_conf_correct_count = gt.get('confidence_correct_count', '')
                gt_conf_incorrect_count = gt.get('confidence_incorrect_count', '')

                row = {
                    'timestamp': result.get('timestamp', ''),
                    'model_name': result.get('model_name', ''),
                    'model_tag': result.get('model_tag', ''),
                    'prompt_file': result.get('prompt_file', ''),
                    'prompt_length': result.get('prompt_length', ''),
                    'total_images': total_images,
                    'images_with_jerseys': result.get('images_with_jerseys', ''),
                    'images_without_jerseys': result.get('images_without_jerseys', ''),
                    'images_with_errors': result.get('images_with_errors', ''),
                    'total_raw_detections': result.get('total_raw_detections', ''),
                    'total_valid_jerseys': valid_jerseys,
                    'total_hallucinated': hallucinated,
                    'hallucination_rate_pct': f"{hallu_rate:.2f}",
                    'empty_response_rate_pct': f"{empty_rate:.2f}",
                    'avg_processing_time': f"{result.get('avg_processing_time', 0):.4f}",
                    'total_processing_time': f"{result.get('total_processing_time', 0):.2f}",
                    'resize_enabled': result.get('resize_enabled', False),
                    'resize_max': result.get('resize_max', ''),
                    'images_resized': result.get('images_resized', ''),
                    'has_confidence': has_confidence,
                    'confidence_avg': f"{conf_avg:.2f}" if conf_avg != '' else '',
                    'confidence_min': conf_min,
                    'confidence_max': conf_max,
                    'confidence_count': conf_count,
                    'confidence_stdev': f"{conf_stdev:.2f}" if conf_stdev is not None else '',
                    'confidence_quality': conf_quality if conf_quality != 'N/A' else '',
                    'conf_90_100': conf_90_100,
                    'conf_70_89': conf_70_89,
                    'conf_50_69': conf_50_69,
                    'conf_30_49': conf_30_49,
                    'conf_0_29': conf_0_29,
                    # Ground truth data
                    'gt_total_expected': gt_total_expected,
                    'gt_total_true_positives': gt_total_tp,
                    'gt_total_false_positives': gt_total_fp,
                    'gt_total_false_negatives': gt_total_fn,
                    'gt_overall_precision': f"{gt_overall_precision:.4f}" if gt_overall_precision != '' else '',
                    'gt_overall_recall': f"{gt_overall_recall:.4f}" if gt_overall_recall != '' else '',
                    'gt_overall_f1': f"{gt_overall_f1:.4f}" if gt_overall_f1 != '' else '',
                    'gt_avg_precision': f"{gt_avg_precision:.4f}" if gt_avg_precision != '' else '',
                    'gt_avg_recall': f"{gt_avg_recall:.4f}" if gt_avg_recall != '' else '',
                    'gt_avg_f1': f"{gt_avg_f1:.4f}" if gt_avg_f1 != '' else '',
                    'gt_avg_confidence_correct': f"{gt_avg_conf_correct:.2f}" if gt_avg_conf_correct != '' else '',
                    'gt_avg_confidence_incorrect': f"{gt_avg_conf_incorrect:.2f}" if gt_avg_conf_incorrect != '' else '',
                    'gt_confidence_correct_count': gt_conf_correct_count,
                    'gt_confidence_incorrect_count': gt_conf_incorrect_count
                }

                writer.writerow(row)

        print(f"✓ Results exported to CSV: {csv_file}")
        print(f" Rows: {len(results)}")
        print(f" Columns: {len(fieldnames)}")

    except Exception as e:
        print(f"❌ Failed to export to CSV: {e}")
        sys.exit(1)

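
# Example (illustrative): the helpers above can also be used from another script,
# assuming this module is importable and a results file exists at the default path.
# The output filename is hypothetical:
#
#   from analyze_jersey_results import load_results, export_to_csv
#   runs = load_results("jersey_detection_results.jsonl")
#   export_to_csv(runs, "jersey_detection_results.csv")
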
def main():
    """Main entry point for the analysis script."""
    parser = argparse.ArgumentParser(
        description='Analyze jersey detection test results',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Show analysis
  python analyze_jersey_results.py

  # Show analysis and export to CSV
  python analyze_jersey_results.py --csv results.csv

  # Export to CSV only (no analysis display)
  python analyze_jersey_results.py --csv-only results.csv

  # Analyze custom results file
  python analyze_jersey_results.py custom_results.jsonl --csv custom.csv
"""
    )
    parser.add_argument('results_file', nargs='?', default='jersey_detection_results.jsonl',
                        help='Path to results file (default: jersey_detection_results.jsonl)')
    parser.add_argument('--csv', metavar='FILE', dest='csv_file',
                        help='Export results to CSV file (in addition to showing analysis)')
    parser.add_argument('--csv-only', metavar='FILE', dest='csv_only',
                        help='Export to CSV file only, skip analysis display')

    args = parser.parse_args()

    # Check if file exists
    if not Path(args.results_file).exists():
        print(f"Error: Results file not found: {args.results_file}")
        print("Run some tests first with test_jersey_detection.py to generate results.")
        sys.exit(1)

    # Load results
    results = load_results(args.results_file)

    if not results:
        print(f"No results found in {args.results_file}")
        sys.exit(0)

    print(f"Loaded {len(results)} test run(s) from {args.results_file}\n")

    # Handle CSV-only mode
    if args.csv_only:
        export_to_csv(results, args.csv_only)
        return

    # Print analyses (unless CSV-only mode)
    print_ascii_comparison_table(results)
    print_model_performance_chart(results)
    print_best_performers(results)

    # Export to CSV if requested
    if args.csv_file:
        print()
        export_to_csv(results, args.csv_file)


if __name__ == '__main__':
    main()