From 8706edcd13e59576f06187e0e80b5380c652e451 Mon Sep 17 00:00:00 2001 From: Rick McEwen Date: Tue, 20 Jan 2026 13:37:01 -0700 Subject: [PATCH] Initial commit: Jersey detection test suite Test scripts and utilities for evaluating vision-language models on jersey number detection using llama.cpp server. --- README.md | 93 +++ analyze_jersey_results.py | 663 ++++++++++++++++ docs/JERSEY_DETECTION_MODEL_ANALYSIS.md | 296 ++++++++ docs/LLAMA_SWAP_SETUP.md | 237 ++++++ jersey_detection_results.jsonl | 6 + jersey_prompt.txt | 43 ++ jersey_prompt_with_confidence.txt | 53 ++ llama-swap-config.yaml | 59 ++ requirements.txt | 9 + scan_utils/__init__.py | 1 + scan_utils/jersey_detection.py | 149 ++++ scan_utils/llama_cpp_client.py | 237 ++++++ test_all_models.sh | 263 +++++++ test_jersey_detection.py | 971 ++++++++++++++++++++++++ 14 files changed, 3080 insertions(+) create mode 100644 README.md create mode 100755 analyze_jersey_results.py create mode 100644 docs/JERSEY_DETECTION_MODEL_ANALYSIS.md create mode 100644 docs/LLAMA_SWAP_SETUP.md create mode 100644 jersey_detection_results.jsonl create mode 100644 jersey_prompt.txt create mode 100644 jersey_prompt_with_confidence.txt create mode 100644 llama-swap-config.yaml create mode 100644 requirements.txt create mode 100644 scan_utils/__init__.py create mode 100644 scan_utils/jersey_detection.py create mode 100644 scan_utils/llama_cpp_client.py create mode 100755 test_all_models.sh create mode 100755 test_jersey_detection.py diff --git a/README.md b/README.md new file mode 100644 index 0000000..9b410e8 --- /dev/null +++ b/README.md @@ -0,0 +1,93 @@ +# Jersey Detection Testing + +This project contains test scripts, results, and utilities for evaluating vision-language models on jersey number detection tasks using llama.cpp. + +## Directory Structure + +``` +jersey_test/ +├── scan_utils/ +│ ├── jersey_detection.py # Core detection class using VLM +│ └── llama_cpp_client.py # Client for llama.cpp server +├── docs/ +│ ├── JERSEY_DETECTION_MODEL_ANALYSIS.md # Model comparison results +│ └── LLAMA_SWAP_SETUP.md # Server setup instructions +├── test_images/ # Place test images here +├── test_images_output/ # Output directory for annotated images +├── test_jersey_detection.py # Main test runner +├── analyze_jersey_results.py # Results analysis script +├── test_all_models.sh # Batch testing shell script +├── jersey_prompt.txt # Basic detection prompt +├── jersey_prompt_with_confidence.txt # Prompt with confidence scoring +└── jersey_detection_results.jsonl # Historical test results +``` + +## Prerequisites + +- Python 3.10+ +- llama.cpp server running with a vision-language model +- Test images with ground truth encoded in filenames + +## Test Image Naming Convention + +Test images should follow this naming pattern to encode ground truth: +``` +prefix-number1-number2-number3.jpg +``` + +Example: `game1-23-45-7.jpg` contains jerseys with numbers 23, 45, and 7. 
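+
+For illustration, ground truth can be recovered from a filename with a few lines of Python (a hypothetical helper sketching the convention; the actual parsing lives in `test_jersey_detection.py` and may differ):
+
+```python
+from pathlib import Path
+
+def ground_truth_from_filename(path: str) -> list[str]:
+    """Return the jersey numbers encoded in a test image filename."""
+    # "game1-23-45-7.jpg" -> stem "game1-23-45-7" -> ["23", "45", "7"]
+    parts = Path(path).stem.split("-")
+    return [p for p in parts[1:] if p.isdigit()]
+```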
+
+## Running Tests
+
+### Single Model Test
+
+```bash
+python test_jersey_detection.py \
+  --images-dir ./test_images \
+  --prompt-file jersey_prompt_with_confidence.txt \
+  --server-url http://localhost:8080 \
+  --resize 1024 \
+  --output jersey_detection_results.jsonl
+```
+
+### Batch Testing All Models
+
+```bash
+./test_all_models.sh
+```
+
+Edit variables at the top of the script to configure:
+- `IMAGES_DIR` - test images directory
+- `PROMPT_FILE` - prompt file to use
+- `SERVER_URL` - llama.cpp/llama-swap server URL
+- `LLAMA_SWAP_CONFIG` - path to llama-swap config for model list
+
+### Analyzing Results
+
+```bash
+python analyze_jersey_results.py jersey_detection_results.jsonl
+```
+
+Options:
+- `--csv output.csv` - Export results to CSV (in addition to the analysis display)
+- `--csv-only output.csv` - Export to CSV only, skipping the analysis display
+
+## Historical Results
+
+The `jersey_detection_results.jsonl` file contains results from 6 test runs. Selected highlights (F1 scores are taken from the fuller evaluation in `docs/JERSEY_DETECTION_MODEL_ANALYSIS.md`):
+
+| Model | F1 Score | Avg Time/Image | Avg Confidence |
+|-------|----------|----------------|----------------|
+| qwen2.5-vl-7b | 72.9% | - | - |
+| gemma-3-27b | 72.1% | 18.1s | 87.1 |
+| Mistral-Small-3.2-24B (Q4) | - | 14.2s | 92.1 |
+| Kimi-VL-A3B-Thinking | - | 29.1s | 88.9 |
+
+See `docs/JERSEY_DETECTION_MODEL_ANALYSIS.md` for detailed analysis.
+
+## Key Findings
+
+1. **Top Recommendation**: qwen2.5-vl-7b (72.9% F1 score)
+2. **Best Confidence Calibration**: gemma-3-27b
+3. **Speed Champion**: gemma-3-4b (7.9s/img, 63.8% F1)
+4. Confidence threshold of 85+ recommended for filtering uncertain detections
diff --git a/analyze_jersey_results.py b/analyze_jersey_results.py
new file mode 100755
index 0000000..868b992
--- /dev/null
+++ b/analyze_jersey_results.py
@@ -0,0 +1,663 @@
+#!/usr/bin/env python3
+"""
+Analyze jersey detection test results and compare model performance.
+
+Usage:
+    python analyze_jersey_results.py [results_file]
+    python analyze_jersey_results.py [results_file] --csv output.csv
+    python analyze_jersey_results.py [results_file] --csv-only output.csv
+
+Arguments:
+    results_file: Path to the results file (default: jersey_detection_results.jsonl)
+    --csv: Also export results to CSV file
+    --csv-only: Export to CSV only, skip analysis display
+"""
+
+import argparse
+import csv
+import json
+import sys
+from pathlib import Path
+from typing import List, Dict, Any
+from datetime import datetime
+
+
+def load_results(results_file: str) -> List[Dict[str, Any]]:
+    """Load test results from a JSON Lines file."""
+    results = []
+    try:
+        with open(results_file, 'r') as f:
+            for line in f:
+                line = line.strip()
+                if line:
+                    results.append(json.loads(line))
+        return results
+    except FileNotFoundError:
+        print(f"Error: Results file not found: {results_file}")
+        sys.exit(1)
+    except json.JSONDecodeError as e:
+        print(f"Error: Invalid JSON in results file: {e}")
+        sys.exit(1)
+
+
+def calculate_confidence_stdev(conf_stats: Dict[str, Any]) -> tuple:
+    """
+    Calculate standard deviation of confidence scores from distribution.
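+
+    Only the bucketed distribution is stored in the results, so individual
+    confidence values are approximated by each bucket's midpoint before the
+    standard deviation is computed.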
+ + Returns: + Tuple of (stdev, quality_rating) + quality_rating: "Excellent", "Good", "Fair", "Poor", or "N/A" + """ + if not conf_stats or 'distribution' not in conf_stats: + return None, "N/A" + + dist = conf_stats['distribution'] + + # Reconstruct approximate confidence values from buckets + # Use midpoint of each bucket + values = [] + bucket_midpoints = { + '90-100': 95, + '70-89': 79.5, + '50-69': 59.5, + '30-49': 39.5, + '0-29': 14.5 + } + + for bucket, count in dist.items(): + midpoint = bucket_midpoints.get(bucket, 50) + values.extend([midpoint] * count) + + if len(values) < 2: + return None, "N/A" + + # Calculate standard deviation + import math + mean = sum(values) / len(values) + variance = sum((x - mean) ** 2 for x in values) / len(values) + stdev = math.sqrt(variance) + + # Assign quality rating based on StDev + if stdev < 5: + quality = "Poor" + elif stdev < 10: + quality = "Fair" + elif stdev < 15: + quality = "Good" + else: + quality = "Excel" # Shortened for table + + return stdev, quality + + +def print_ascii_comparison_table(results: List[Dict[str, Any]]): + """Print a detailed ASCII comparison table of all test runs.""" + if not results: + print("No results to display.") + return + + print("=" * 280) + print("DETAILED MODEL COMPARISON TABLE") + print("=" * 280) + print() + print("Confidence Quality: Excellent (>15), Good (10-15), Fair (5-10), Poor (<5)") + print("Confidence Calibration: Conf✓ = avg confidence on correct detections, Conf✗ = avg confidence on incorrect detections") + print() + + # Table headers with ground truth and confidence calibration columns + print("┌" + "─" * 22 + "┬" + "─" * 10 + "┬" + "─" * 8 + "┬" + "─" * 8 + "┬" + "─" * 8 + "┬" + "─" * 10 + "┬" + "─" * 10 + "┬" + "─" * 10 + "┬" + "─" * 8 + "┬" + "─" * 8 + "┬" + "─" * 12 + "┬" + "─" * 10 + "┬" + "─" * 10 + "┬" + "─" * 10 + "┬" + "─" * 10 + "┬" + "─" * 10 + "┬" + "─" * 21 + "┐") + print("│ {:<20} │ {:^8} │ {:^6} │ {:^6} │ {:^6} │ {:^8} │ {:^8} │ {:^8} │ {:^6} │ {:^6} │ {:^10} │ {:^8} │ {:^8} │ {:^8} │ {:^8} │ {:^8} │ {:^19} │".format( + "Model", "Prompt", "Images", "Valid", "Hallu", "Empty%", "Hallu%", "AvgTime", "Resize", "Conf?", "Conf Qual", "Prec%", "Recall%", "F1%", "Conf✓", "Conf✗", "Date" + )) + print("├" + "─" * 22 + "┼" + "─" * 10 + "┼" + "─" * 8 + "┼" + "─" * 8 + "┼" + "─" * 8 + "┼" + "─" * 10 + "┼" + "─" * 10 + "┼" + "─" * 10 + "┼" + "─" * 8 + "┼" + "─" * 8 + "┼" + "─" * 12 + "┼" + "─" * 10 + "┼" + "─" * 10 + "┼" + "─" * 10 + "┼" + "─" * 10 + "┼" + "─" * 10 + "┼" + "─" * 21 + "┤") + + # Data rows + for i, result in enumerate(results): + model = result.get('model_name', 'unknown')[:20] + prompt = Path(result.get('prompt_file', 'unknown')).stem[:8] + total_images = result.get('total_images', 0) + valid_jerseys = result.get('total_valid_jerseys', 0) + hallucinated = result.get('total_hallucinated', 0) + total_detections = valid_jerseys + hallucinated + empty_pct = (result.get('images_without_jerseys', 0) / total_images * 100) if total_images > 0 else 0 + hallu_pct = (hallucinated / total_detections * 100) if total_detections > 0 else 0 + avg_time = result.get('avg_processing_time', 0) + + # Calculate confidence quality + conf_stats = result.get('confidence_stats') + has_conf = 'Yes' if conf_stats else 'No' + stdev, quality = calculate_confidence_stdev(conf_stats) + + # Format confidence quality display + if stdev is not None: + conf_qual_str = f"{quality} ({stdev:.1f})" + else: + conf_qual_str = "N/A" + + # Ground truth metrics + gt = result.get('ground_truth', {}) + precision = 
gt.get('overall_precision', 0) * 100 + recall = gt.get('overall_recall', 0) * 100 + f1 = gt.get('overall_f1', 0) * 100 + + # Confidence calibration + conf_correct = gt.get('avg_confidence_correct') + conf_incorrect = gt.get('avg_confidence_incorrect') + conf_correct_str = f"{conf_correct:.1f}" if conf_correct is not None else "N/A" + conf_incorrect_str = f"{conf_incorrect:.1f}" if conf_incorrect is not None else "N/A" + + resize_max = result.get('resize_max') + resize_str = f"{resize_max}px" if resize_max else "No" + timestamp = datetime.fromisoformat(result.get('timestamp', '')).strftime('%Y-%m-%d %H:%M') + + print("│ {:<20} │ {:>8} │ {:>6} │ {:>6} │ {:>6} │ {:>7.1f}% │ {:>7.1f}% │ {:>7.2f}s │ {:>6} │ {:>6} │ {:>10} │ {:>7.1f}% │ {:>7.1f}% │ {:>7.1f}% │ {:>8} │ {:>8} │ {:>19} │".format( + model, prompt, total_images, valid_jerseys, hallucinated, empty_pct, hallu_pct, avg_time, resize_str, has_conf, conf_qual_str, precision, recall, f1, conf_correct_str, conf_incorrect_str, timestamp + )) + + # Bottom border + print("└" + "─" * 22 + "┴" + "─" * 10 + "┴" + "─" * 8 + "┴" + "─" * 8 + "┴" + "─" * 8 + "┴" + "─" * 10 + "┴" + "─" * 10 + "┴" + "─" * 10 + "┴" + "─" * 8 + "┴" + "─" * 8 + "┴" + "─" * 12 + "┴" + "─" * 10 + "┴" + "─" * 10 + "┴" + "─" * 10 + "┴" + "─" * 10 + "┴" + "─" * 10 + "┴" + "─" * 21 + "┘") + + print() + + +def print_comparison_table(results: List[Dict[str, Any]]): + """Print a simple comparison table of all test runs.""" + if not results: + print("No results to display.") + return + + print("=" * 140) + print("MODEL COMPARISON TABLE") + print("=" * 140) + print() + + # Header + header = f"{'Model':<25} {'Prompt':<30} {'Images':<8} {'Valid':<8} {'Hallu':<8} {'Empty%':<9} {'AvgTime':<9} {'Resize':<8} {'Conf?':<7} {'Date':<20}" + print(header) + print("-" * 150) + + # Data rows + for result in results: + model = result.get('model_name', 'unknown')[:24] + prompt = Path(result.get('prompt_file', 'unknown')).stem[:29] + total_images = result.get('total_images', 0) + valid_jerseys = result.get('total_valid_jerseys', 0) + hallucinated = result.get('total_hallucinated', 0) + empty_pct = (result.get('images_without_jerseys', 0) / total_images * 100) if total_images > 0 else 0 + avg_time = result.get('avg_processing_time', 0) + has_conf = 'Yes' if result.get('confidence_stats') else 'No' + resize_max = result.get('resize_max') + resize_str = f"{resize_max}px" if resize_max else "No" + timestamp = datetime.fromisoformat(result.get('timestamp', '')).strftime('%Y-%m-%d %H:%M:%S') + + row = f"{model:<25} {prompt:<30} {total_images:<8} {valid_jerseys:<8} {hallucinated:<8} {empty_pct:<8.1f}% {avg_time:<8.2f}s {resize_str:<8} {has_conf:<7} {timestamp:<20}" + print(row) + + print() + + +def print_model_performance_chart(results: List[Dict[str, Any]]): + """Print a performance chart showing key metrics for each model.""" + if not results: + return + + print("=" * 140) + print("MODEL PERFORMANCE CHART") + print("=" * 140) + print() + + # Group results by model + models = {} + for result in results: + model_name = result.get('model_name', 'unknown') + if model_name not in models: + models[model_name] = [] + models[model_name].append(result) + + # Calculate aggregate statistics for each model + for model_name, model_results in models.items(): + print(f"\n{model_name}") + print("-" * 100) + + total_runs = len(model_results) + total_images = sum(r.get('total_images', 0) for r in model_results) + total_valid = sum(r.get('total_valid_jerseys', 0) for r in model_results) + total_hallu = 
sum(r.get('total_hallucinated', 0) for r in model_results) + avg_empty_pct = sum((r.get('images_without_jerseys', 0) / r.get('total_images', 1) * 100) for r in model_results) / total_runs if total_runs > 0 else 0 + avg_time = sum(r.get('avg_processing_time', 0) for r in model_results) / total_runs if total_runs > 0 else 0 + + # Check if any runs have confidence stats + has_confidence = any(r.get('confidence_stats') for r in model_results) + + # Check resize status + resize_enabled = any(r.get('resize_enabled', False) for r in model_results) + resize_max_values = [r.get('resize_max') for r in model_results if r.get('resize_max')] + resize_info = f"{resize_max_values[0]}px" if resize_max_values else "Disabled" + + print(f" Total test runs: {total_runs}") + print(f" Total images processed: {total_images}") + print(f" Total valid detections: {total_valid}") + print(f" Total hallucinations: {total_hallu}") + print(f" Average empty response rate: {avg_empty_pct:.1f}%") + print(f" Average processing time: {avg_time:.2f}s/image") + print(f" Resize: {resize_info}") + print(f" Confidence support: {'Yes' if has_confidence else 'No'}") + + # Show hallucination rate + if total_valid + total_hallu > 0: + hallu_rate = (total_hallu / (total_valid + total_hallu) * 100) + print(f" Hallucination rate: {hallu_rate:.1f}%") + + # Visual bar + bar_length = int(hallu_rate / 2) # Scale to max 50 chars + bar = '█' * bar_length + print(f" Hallucination chart: {bar} ({hallu_rate:.1f}%)") + + # Ground truth performance + gt_runs = [r for r in model_results if r.get('ground_truth')] + if gt_runs: + avg_precision = sum(r['ground_truth'].get('overall_precision', 0) for r in gt_runs) / len(gt_runs) + avg_recall = sum(r['ground_truth'].get('overall_recall', 0) for r in gt_runs) / len(gt_runs) + avg_f1 = sum(r['ground_truth'].get('overall_f1', 0) for r in gt_runs) / len(gt_runs) + total_expected = sum(r['ground_truth'].get('total_expected', 0) for r in gt_runs) + total_tp = sum(r['ground_truth'].get('total_true_positives', 0) for r in gt_runs) + total_fp = sum(r['ground_truth'].get('total_false_positives', 0) for r in gt_runs) + total_fn = sum(r['ground_truth'].get('total_false_negatives', 0) for r in gt_runs) + + print(f"\n Ground truth performance:") + print(f" Total expected jerseys: {total_expected}") + print(f" True positives: {total_tp}") + print(f" False positives: {total_fp}") + print(f" False negatives: {total_fn}") + print(f" Average Precision: {avg_precision:.1%}") + print(f" Average Recall: {avg_recall:.1%}") + print(f" Average F1 Score: {avg_f1:.1%}") + + # Visual F1 bar + bar_length = int(avg_f1 * 50) # Scale to max 50 chars + bar = '█' * bar_length + print(f" F1 Score chart: {bar} ({avg_f1:.1%})") + + # Confidence calibration + conf_correct_vals = [r['ground_truth'].get('avg_confidence_correct') for r in gt_runs if r['ground_truth'].get('avg_confidence_correct') is not None] + conf_incorrect_vals = [r['ground_truth'].get('avg_confidence_incorrect') for r in gt_runs if r['ground_truth'].get('avg_confidence_incorrect') is not None] + + if conf_correct_vals or conf_incorrect_vals: + print(f"\n Confidence calibration:") + if conf_correct_vals: + avg_conf_correct = sum(conf_correct_vals) / len(conf_correct_vals) + print(f" Avg confidence (correct detections): {avg_conf_correct:.2f}") + if conf_incorrect_vals: + avg_conf_incorrect = sum(conf_incorrect_vals) / len(conf_incorrect_vals) + print(f" Avg confidence (incorrect detections): {avg_conf_incorrect:.2f}") + if conf_correct_vals and conf_incorrect_vals: + diff = 
sum(conf_correct_vals) / len(conf_correct_vals) - sum(conf_incorrect_vals) / len(conf_incorrect_vals) + if diff > 0: + print(f" Confidence difference: +{diff:.2f} (good calibration)") + else: + print(f" Confidence difference: {diff:.2f} (⚠ poor calibration)") + + # Confidence distribution if available + if has_confidence: + print(f"\n Confidence distribution (across all runs):") + all_dist = {'90-100': 0, '70-89': 0, '50-69': 0, '30-49': 0, '0-29': 0} + total_conf_count = 0 + + for result in model_results: + conf_stats = result.get('confidence_stats') + if conf_stats and 'distribution' in conf_stats: + for bucket, count in conf_stats['distribution'].items(): + all_dist[bucket] += count + total_conf_count += count + + if total_conf_count > 0: + for bucket, count in all_dist.items(): + pct = (count / total_conf_count * 100) if total_conf_count > 0 else 0 + bar_length = int(pct / 2) + bar = '█' * bar_length + print(f" {bucket}: {count:4d} ({pct:5.1f}%) {bar}") + + print() + + +def print_best_performers(results: List[Dict[str, Any]]): + """Print summary of best performing models.""" + if not results: + return + + print("=" * 140) + print("BEST PERFORMERS") + print("=" * 140) + print() + + # Group by model and calculate averages + models = {} + for result in results: + model_name = result.get('model_name', 'unknown') + if model_name not in models: + models[model_name] = { + 'runs': 0, + 'total_hallu': 0, + 'total_detections': 0, + 'avg_time': [], + 'empty_capable': [] + } + + models[model_name]['runs'] += 1 + models[model_name]['total_hallu'] += result.get('total_hallucinated', 0) + models[model_name]['total_detections'] += result.get('total_valid_jerseys', 0) + result.get('total_hallucinated', 0) + models[model_name]['avg_time'].append(result.get('avg_processing_time', 0)) + models[model_name]['empty_capable'].append(result.get('empty_response_capable', False)) + + # Calculate scores + model_scores = [] + for model_name, stats in models.items(): + hallu_rate = (stats['total_hallu'] / stats['total_detections'] * 100) if stats['total_detections'] > 0 else 0 + avg_time = sum(stats['avg_time']) / len(stats['avg_time']) if stats['avg_time'] else 0 + empty_capable = any(stats['empty_capable']) + + model_scores.append({ + 'model': model_name, + 'hallu_rate': hallu_rate, + 'avg_time': avg_time, + 'empty_capable': empty_capable, + 'runs': stats['runs'] + }) + + # Sort by hallucination rate (lower is better) + model_scores.sort(key=lambda x: x['hallu_rate']) + + print("Lowest hallucination rate:") + for i, score in enumerate(model_scores[:3], 1): + capable = "✓" if score['empty_capable'] else "✗" + print(f" {i}. {score['model']}: {score['hallu_rate']:.1f}% (empty capable: {capable}, avg time: {score['avg_time']:.2f}s)") + + print() + + # Sort by speed (lower is better) + model_scores.sort(key=lambda x: x['avg_time']) + + print("Fastest processing:") + for i, score in enumerate(model_scores[:3], 1): + capable = "✓" if score['empty_capable'] else "✗" + print(f" {i}. 
{score['model']}: {score['avg_time']:.2f}s/image (hallu rate: {score['hallu_rate']:.1f}%, empty capable: {capable})") + + print() + + # Models with empty response capability + empty_models = [s for s in model_scores if s['empty_capable']] + print(f"Models with empty response capability: {len(empty_models)}/{len(model_scores)}") + for score in empty_models: + print(f" - {score['model']}") + + print() + + # Best F1 scores (ground truth accuracy) + models_with_gt = {} + for result in results: + if result.get('ground_truth'): + model_name = result.get('model_name', 'unknown') + if model_name not in models_with_gt: + models_with_gt[model_name] = { + 'f1_scores': [], + 'precision_scores': [], + 'recall_scores': [] + } + gt = result['ground_truth'] + models_with_gt[model_name]['f1_scores'].append(gt.get('overall_f1', 0)) + models_with_gt[model_name]['precision_scores'].append(gt.get('overall_precision', 0)) + models_with_gt[model_name]['recall_scores'].append(gt.get('overall_recall', 0)) + + if models_with_gt: + gt_scores = [] + for model_name, stats in models_with_gt.items(): + avg_f1 = sum(stats['f1_scores']) / len(stats['f1_scores']) if stats['f1_scores'] else 0 + avg_precision = sum(stats['precision_scores']) / len(stats['precision_scores']) if stats['precision_scores'] else 0 + avg_recall = sum(stats['recall_scores']) / len(stats['recall_scores']) if stats['recall_scores'] else 0 + gt_scores.append({ + 'model': model_name, + 'avg_f1': avg_f1, + 'avg_precision': avg_precision, + 'avg_recall': avg_recall + }) + + # Sort by F1 score (higher is better) + gt_scores.sort(key=lambda x: x['avg_f1'], reverse=True) + + print("Highest ground truth F1 scores:") + for i, score in enumerate(gt_scores[:3], 1): + print(f" {i}. {score['model']}: F1={score['avg_f1']:.1%} (Precision={score['avg_precision']:.1%}, Recall={score['avg_recall']:.1%})") + + print() + + +def export_to_csv(results: List[Dict[str, Any]], csv_file: str): + """Export results to CSV file for spreadsheet import.""" + if not results: + print("No results to export.") + return + + try: + with open(csv_file, 'w', newline='') as f: + # Define CSV columns + fieldnames = [ + 'timestamp', + 'model_name', + 'model_tag', + 'prompt_file', + 'prompt_length', + 'total_images', + 'images_with_jerseys', + 'images_without_jerseys', + 'images_with_errors', + 'total_raw_detections', + 'total_valid_jerseys', + 'total_hallucinated', + 'hallucination_rate_pct', + 'empty_response_rate_pct', + 'avg_processing_time', + 'total_processing_time', + 'resize_enabled', + 'resize_max', + 'images_resized', + 'has_confidence', + 'confidence_avg', + 'confidence_min', + 'confidence_max', + 'confidence_count', + 'confidence_stdev', + 'confidence_quality', + 'conf_90_100', + 'conf_70_89', + 'conf_50_69', + 'conf_30_49', + 'conf_0_29', + # Ground truth columns + 'gt_total_expected', + 'gt_total_true_positives', + 'gt_total_false_positives', + 'gt_total_false_negatives', + 'gt_overall_precision', + 'gt_overall_recall', + 'gt_overall_f1', + 'gt_avg_precision', + 'gt_avg_recall', + 'gt_avg_f1', + # Confidence calibration + 'gt_avg_confidence_correct', + 'gt_avg_confidence_incorrect', + 'gt_confidence_correct_count', + 'gt_confidence_incorrect_count' + ] + + writer = csv.DictWriter(f, fieldnames=fieldnames) + writer.writeheader() + + # Write data rows + for result in results: + # Calculate derived values + total_images = result.get('total_images', 0) + valid_jerseys = result.get('total_valid_jerseys', 0) + hallucinated = result.get('total_hallucinated', 0) + total_detections = 
valid_jerseys + hallucinated + hallu_rate = (hallucinated / total_detections * 100) if total_detections > 0 else 0 + empty_rate = (result.get('images_without_jerseys', 0) / total_images * 100) if total_images > 0 else 0 + + # Extract confidence stats + conf_stats = result.get('confidence_stats') + has_confidence = conf_stats is not None + conf_avg = conf_stats.get('avg', '') if conf_stats else '' + conf_min = conf_stats.get('min', '') if conf_stats else '' + conf_max = conf_stats.get('max', '') if conf_stats else '' + conf_count = conf_stats.get('count', '') if conf_stats else '' + + # Calculate confidence standard deviation and quality + conf_stdev, conf_quality = calculate_confidence_stdev(conf_stats) + + # Extract confidence distribution + conf_dist = conf_stats.get('distribution', {}) if conf_stats else {} + conf_90_100 = conf_dist.get('90-100', '') + conf_70_89 = conf_dist.get('70-89', '') + conf_50_69 = conf_dist.get('50-69', '') + conf_30_49 = conf_dist.get('30-49', '') + conf_0_29 = conf_dist.get('0-29', '') + + # Extract ground truth stats + gt = result.get('ground_truth', {}) + gt_total_expected = gt.get('total_expected', '') + gt_total_tp = gt.get('total_true_positives', '') + gt_total_fp = gt.get('total_false_positives', '') + gt_total_fn = gt.get('total_false_negatives', '') + gt_overall_precision = gt.get('overall_precision', '') + gt_overall_recall = gt.get('overall_recall', '') + gt_overall_f1 = gt.get('overall_f1', '') + gt_avg_precision = gt.get('avg_precision', '') + gt_avg_recall = gt.get('avg_recall', '') + gt_avg_f1 = gt.get('avg_f1', '') + gt_avg_conf_correct = gt.get('avg_confidence_correct', '') + gt_avg_conf_incorrect = gt.get('avg_confidence_incorrect', '') + gt_conf_correct_count = gt.get('confidence_correct_count', '') + gt_conf_incorrect_count = gt.get('confidence_incorrect_count', '') + + row = { + 'timestamp': result.get('timestamp', ''), + 'model_name': result.get('model_name', ''), + 'model_tag': result.get('model_tag', ''), + 'prompt_file': result.get('prompt_file', ''), + 'prompt_length': result.get('prompt_length', ''), + 'total_images': total_images, + 'images_with_jerseys': result.get('images_with_jerseys', ''), + 'images_without_jerseys': result.get('images_without_jerseys', ''), + 'images_with_errors': result.get('images_with_errors', ''), + 'total_raw_detections': result.get('total_raw_detections', ''), + 'total_valid_jerseys': valid_jerseys, + 'total_hallucinated': hallucinated, + 'hallucination_rate_pct': f"{hallu_rate:.2f}", + 'empty_response_rate_pct': f"{empty_rate:.2f}", + 'avg_processing_time': f"{result.get('avg_processing_time', 0):.4f}", + 'total_processing_time': f"{result.get('total_processing_time', 0):.2f}", + 'resize_enabled': result.get('resize_enabled', False), + 'resize_max': result.get('resize_max', ''), + 'images_resized': result.get('images_resized', ''), + 'has_confidence': has_confidence, + 'confidence_avg': f"{conf_avg:.2f}" if conf_avg != '' else '', + 'confidence_min': conf_min, + 'confidence_max': conf_max, + 'confidence_count': conf_count, + 'confidence_stdev': f"{conf_stdev:.2f}" if conf_stdev is not None else '', + 'confidence_quality': conf_quality if conf_quality != 'N/A' else '', + 'conf_90_100': conf_90_100, + 'conf_70_89': conf_70_89, + 'conf_50_69': conf_50_69, + 'conf_30_49': conf_30_49, + 'conf_0_29': conf_0_29, + # Ground truth data + 'gt_total_expected': gt_total_expected, + 'gt_total_true_positives': gt_total_tp, + 'gt_total_false_positives': gt_total_fp, + 'gt_total_false_negatives': gt_total_fn, + 
'gt_overall_precision': f"{gt_overall_precision:.4f}" if gt_overall_precision != '' else '', + 'gt_overall_recall': f"{gt_overall_recall:.4f}" if gt_overall_recall != '' else '', + 'gt_overall_f1': f"{gt_overall_f1:.4f}" if gt_overall_f1 != '' else '', + 'gt_avg_precision': f"{gt_avg_precision:.4f}" if gt_avg_precision != '' else '', + 'gt_avg_recall': f"{gt_avg_recall:.4f}" if gt_avg_recall != '' else '', + 'gt_avg_f1': f"{gt_avg_f1:.4f}" if gt_avg_f1 != '' else '', + 'gt_avg_confidence_correct': f"{gt_avg_conf_correct:.2f}" if gt_avg_conf_correct != '' else '', + 'gt_avg_confidence_incorrect': f"{gt_avg_conf_incorrect:.2f}" if gt_avg_conf_incorrect != '' else '', + 'gt_confidence_correct_count': gt_conf_correct_count, + 'gt_confidence_incorrect_count': gt_conf_incorrect_count + } + + writer.writerow(row) + + print(f"✓ Results exported to CSV: {csv_file}") + print(f" Rows: {len(results)}") + print(f" Columns: {len(fieldnames)}") + + except Exception as e: + print(f"❌ Failed to export to CSV: {e}") + sys.exit(1) + + +def main(): + """Main entry point for the analysis script.""" + parser = argparse.ArgumentParser( + description='Analyze jersey detection test results', + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Show analysis + python analyze_jersey_results.py + + # Show analysis and export to CSV + python analyze_jersey_results.py --csv results.csv + + # Export to CSV only (no analysis display) + python analyze_jersey_results.py --csv-only results.csv + + # Analyze custom results file + python analyze_jersey_results.py custom_results.jsonl --csv custom.csv +""" + ) + parser.add_argument('results_file', nargs='?', default='jersey_detection_results.jsonl', + help='Path to results file (default: jersey_detection_results.jsonl)') + parser.add_argument('--csv', metavar='FILE', dest='csv_file', + help='Export results to CSV file (in addition to showing analysis)') + parser.add_argument('--csv-only', metavar='FILE', dest='csv_only', + help='Export to CSV file only, skip analysis display') + + args = parser.parse_args() + + # Check if file exists + if not Path(args.results_file).exists(): + print(f"Error: Results file not found: {args.results_file}") + print(f"Run some tests first with test_jersey_detection.py to generate results.") + sys.exit(1) + + # Load results + results = load_results(args.results_file) + + if not results: + print(f"No results found in {args.results_file}") + sys.exit(0) + + print(f"Loaded {len(results)} test run(s) from {args.results_file}\n") + + # Handle CSV-only mode + if args.csv_only: + export_to_csv(results, args.csv_only) + return + + # Print analyses (unless CSV-only mode) + print_ascii_comparison_table(results) + print_model_performance_chart(results) + print_best_performers(results) + + # Export to CSV if requested + if args.csv_file: + print() + export_to_csv(results, args.csv_file) + + +if __name__ == '__main__': + main() diff --git a/docs/JERSEY_DETECTION_MODEL_ANALYSIS.md b/docs/JERSEY_DETECTION_MODEL_ANALYSIS.md new file mode 100644 index 0000000..cd03467 --- /dev/null +++ b/docs/JERSEY_DETECTION_MODEL_ANALYSIS.md @@ -0,0 +1,296 @@ +# Jersey Detection Model Analysis Report + +**Date:** October 22, 2025 +**Models Tested:** 8 vision-language models +**Test Images:** 194 images with known jersey numbers +**Purpose:** Determine the best model for automated jersey number detection in sports photography + +--- + +## Executive Summary + +After comprehensive testing of 8 different AI models on 194 sports images with known jersey 
numbers, we recommend **qwen2.5-vl-7b** as the best overall model for jersey detection, with **gemma-3-27b** as a close second choice depending on specific needs. + +### Key Findings: + +1. **Best Overall Performance**: qwen2.5-vl-7b achieves the highest accuracy (72.9% F1 score) +2. **Confidence Scores Are Useful**: 7 out of 8 models show reliable confidence calibration, meaning higher confidence scores correlate with correct detections +3. **Speed vs Accuracy Trade-off**: The most accurate models take 13-21 seconds per image; faster models sacrifice significant accuracy + +--- + +## Model Performance Comparison + +### Top 3 Recommended Models + +| Rank | Model | Accuracy (F1) | Speed | Correct Detections | False Alarms | Confidence Reliability | +|------|-------|---------------|-------|--------------------|--------------|-----------------------| +| 🥇 1 | qwen2.5-vl-7b | 72.9% | 13.4s | 328 / 436 (75%) | 136 | Good | +| 🥈 2 | gemma-3-27b | 72.1% | 20.9s | 343 / 462 (74%) | 147 | Very Good (+6.0) | +| 🥉 3 | gemma-3-12b | 69.8% | 18.9s | 322 / 462 (70%) | 139 | Good (+3.1) | + +### Complete Results Table + +| Model | Accuracy (F1 Score) | Correct Detections | False Alarms | Missed Jerseys | Speed (sec/image) | Confidence Calibration | +|-------|--------------------|--------------------|--------------|----------------|-------------------|------------------------| +| **qwen2.5-vl-7b** | **72.9%** ⭐ | 328 / 436 | 136 | 108 | 13.4 | +0.5 (Good) | +| **gemma-3-27b** | **72.1%** | 343 / 462 | 147 | 119 | 20.9 | +6.0 (Very Good) | +| **gemma-3-12b** | 69.8% | 322 / 462 | 139 | 140 | 18.9 | +3.1 (Good) | +| mistral-small-24b-q4 | 67.6% | 328 / 462 | 180 | 134 | 15.1 | +2.4 (Good) | +| mistral-small-24b-q8 | 67.2% | 330 / 462 | 190 | 132 | 22.6 | +3.1 (Good) | +| gemma-3-4b | 63.8% | 277 / 462 | 130 | 185 | 7.9 ⚡ | +6.2 (Very Good) | +| lfm2-vl-1.6b | 50.5% | 171 / 448 | 58 | 277 | 4.6 ⚡⚡ | +11.9 (Excellent) | +| kimi-vl-3b | 2.0% ❌ | 5 / 416 | 67 | 411 | 40.0 🐌 | -1.3 (Poor) | + +--- + +## Understanding the Metrics + +### What the Numbers Mean: + +- **Accuracy (F1 Score)**: Overall effectiveness balancing correct detections and false alarms + - 70%+ = Excellent for production use + - 60-70% = Good for assisted workflows + - Below 60% = Not recommended + +- **Correct Detections**: Out of all jerseys that should have been found, how many were actually detected + - Example: "328 / 436" means the model found 328 jerseys out of 436 that were actually in the images + +- **False Alarms**: Jersey numbers detected that weren't actually in the image + - Lower is better - these are incorrect detections + - Can be filtered using confidence scores + +- **Missed Jerseys**: Jersey numbers that were in the image but not detected + - Lower is better - these are opportunities lost + +- **Speed**: Average seconds to process one image + - ⚡⚡ = Very fast (< 8s) + - ⚡ = Fast (8-15s) + - Standard = 15-25s + - 🐌 = Slow (> 30s) + +- **Confidence Calibration**: The difference between average confidence on correct vs incorrect detections + - Positive number (e.g., +6.0) = Good calibration - correct detections have higher confidence + - Negative number = Poor calibration - can't trust confidence scores + - Higher positive values = Better for filtering with confidence thresholds + +--- + +## Detailed Analysis + +### 1. 
Best Model: qwen2.5-vl-7b + +**Why It's the Best:** +- ✅ Highest overall accuracy (72.9%) +- ✅ Best recall - finds 75% of all jerseys +- ✅ Reasonable speed (13.4 seconds per image) +- ✅ Very low hallucination rate (only 1%) +- ✅ Confidence scores are reliable for filtering + +**Strengths:** +- Finds the most jerseys (highest recall at 75.2%) +- Rarely makes up fake jersey numbers (hallucination rate: 1%) +- Almost always returns results (empty response rate: 2.6%) + +**Weaknesses:** +- Generates 136 false positives (30% of detections are incorrect) +- Confidence calibration is minimal (+0.5), making threshold filtering less effective +- All confidence scores are 90-95, showing limited variation + +**Best For:** +- Applications where finding all jerseys is critical +- Batch processing where moderate false positives are acceptable +- When combined with manual review of results + +### 2. Runner-Up: gemma-3-27b + +**Why It's Excellent:** +- ✅ Nearly identical accuracy to the winner (72.1% vs 72.9%) +- ✅ Finds the most total jerseys (343 correct detections) +- ✅ Excellent confidence calibration (+6.0 difference) +- ✅ No hallucinations +- ⚠️ Slower processing (20.9s per image) + +**Strengths:** +- Best for confidence-based filtering (6-point difference between correct/incorrect) +- Highest absolute number of correct detections (343) +- More varied confidence scores (54% in 90-100 range, 42% in 70-89 range) + +**Weaknesses:** +- 56% slower than qwen2.5-vl-7b +- Similar false positive rate + +**Best For:** +- Applications requiring confidence-based filtering +- When processing time is not critical +- Maximizing total correct detections + +### 3. Alternative: gemma-3-4b (Speed Champion) + +**Why Consider It:** +- ⚡ Fast processing (7.9 seconds per image) +- ✅ Very good confidence calibration (+6.2) +- ✅ Zero hallucinations +- ⚠️ Lower accuracy (63.8%) + +**Trade-offs:** +- 41% faster than qwen2.5-vl-7b +- But 12% lower accuracy +- Misses 40% of jerseys (185 false negatives) + +**Best For:** +- Real-time or high-volume processing +- Applications where speed is more important than completeness +- Initial rough filtering before manual review + +--- + +## Should You Use Confidence Scores for Filtering? + +### Answer: **YES** - Confidence scores are useful for most models + +### Evidence from Testing: + +**7 out of 8 models show good confidence calibration:** + +| Model | Avg Confidence (Correct) | Avg Confidence (Incorrect) | Difference | Reliability | +|-------|--------------------------|---------------------------|------------|-------------| +| lfm2-vl-1.6b | 91.8 | 80.0 | **+11.9** | ⭐⭐⭐ Excellent | +| gemma-3-4b | 85.2 | 79.0 | **+6.2** | ⭐⭐ Very Good | +| gemma-3-27b | 88.2 | 82.2 | **+6.0** | ⭐⭐ Very Good | +| gemma-3-12b | 91.8 | 88.7 | **+3.1** | ⭐ Good | +| mistral-small-24b-q8 | 92.3 | 89.1 | **+3.1** | ⭐ Good | +| mistral-small-24b-q4 | 93.0 | 90.7 | **+2.4** | ⭐ Good | +| qwen2.5-vl-7b | 94.6 | 94.1 | +0.5 | Limited utility | +| kimi-vl-3b | 88.4 | 89.7 | **-1.3** | ❌ Not reliable | + +### What This Means: + +**For most models**, setting a confidence threshold can significantly reduce false positives: +- A threshold of 85 on gemma-3-27b would keep most correct detections (88.2 avg) while filtering many incorrect ones (82.2 avg) +- A threshold of 85 on gemma-3-4b would be even more effective + +**Exception: qwen2.5-vl-7b** has minimal difference (94.6 vs 94.1), making threshold filtering less useful despite being the most accurate model. + +### Recommended Filtering Strategy: + +1. 
**Use gemma-3-27b with confidence threshold of 85+** for best balance of accuracy and filtering +2. **Use gemma-3-4b with confidence threshold of 85+** for faster processing with good filtering +3. **Use qwen2.5-vl-7b without filtering** when you need maximum recall and will manually review results + +--- + +## Model-Specific Recommendations + +### For Different Use Cases: + +#### 🎯 **Highest Accuracy Required** +- **Model:** qwen2.5-vl-7b +- **Expected Results:** Find 75% of jerseys, 30% false positive rate +- **Processing:** 13.4 seconds per image +- **Setup:** Use raw results, manually review all detections + +#### 🎯 **Best Balance of Speed and Accuracy** +- **Model:** gemma-3-12b +- **Expected Results:** Find 70% of jerseys, reasonable false positive rate +- **Processing:** 18.9 seconds per image +- **Setup:** Apply confidence threshold of 90+ to reduce false positives + +#### 🎯 **Maximum Quality with Confidence Filtering** +- **Model:** gemma-3-27b +- **Expected Results:** Find 74% of jerseys, filter false positives effectively +- **Processing:** 20.9 seconds per image +- **Setup:** Apply confidence threshold of 85+ to reduce false positives by ~50% + +#### ⚡ **Speed is Critical** +- **Model:** gemma-3-4b +- **Expected Results:** Find 60% of jerseys quickly +- **Processing:** 7.9 seconds per image +- **Setup:** Apply confidence threshold of 85+ for quality filtering + +#### ❌ **Do Not Use** +- **kimi-vl-3b**: Only 2% accuracy, extremely slow, poor confidence calibration + +--- + +## Implementation Recommendations + +### 1. Production Deployment Strategy + +**Recommended:** Two-tier approach +- **Tier 1 (Automatic):** gemma-3-27b with confidence threshold 85+ + - Automatically tag high-confidence detections + - Expected: ~200 correct detections per 194 images with minimal false positives + +- **Tier 2 (Review Queue):** qwen2.5-vl-7b on remaining images + - Human review of all detections below confidence threshold + - Catches jerseys missed by Tier 1 + +### 2. Confidence Threshold Guidelines + +Based on testing data: + +| Model | Recommended Threshold | Expected Precision | Expected Recall | +|-------|----------------------|-------------------|-----------------| +| gemma-3-27b | 85+ | ~85-90% | ~60-65% | +| gemma-3-4b | 85+ | ~80-85% | ~50-55% | +| gemma-3-12b | 90+ | ~80-85% | ~60-65% | +| qwen2.5-vl-7b | Don't filter | 70.7% | 75.2% | + +### 3. Performance Optimization + +**Processing 1000 images:** +- qwen2.5-vl-7b: ~3.7 hours +- gemma-3-27b: ~5.8 hours +- gemma-3-4b: ~2.2 hours + +**Recommendation:** Use gemma-3-4b for initial pass, qwen2.5-vl-7b for second pass on low-confidence results. + +--- + +## Conclusions + +### Main Findings: + +1. **qwen2.5-vl-7b is the most accurate model** but has limited confidence score utility +2. **gemma-3-27b offers the best combination** of accuracy and confidence-based filtering +3. **Confidence scores are highly valuable** for reducing false positives in most models +4. **Speed vs accuracy trade-offs are significant** - fastest model is 9% less accurate than best +5. 
**One model (kimi-vl-3b) is completely unsuitable** for this task + +### Strategic Recommendations: + +**For most users:** Deploy gemma-3-27b with confidence threshold of 85+ +- Balances accuracy, speed, and filtering capability +- Reduces manual review burden significantly +- Good confidence calibration enables automated decision-making + +**For maximum accuracy:** Deploy qwen2.5-vl-7b without filtering +- Best for finding all possible jerseys +- Requires manual review of results +- Accept higher false positive rate + +**For high-volume processing:** Deploy gemma-3-4b with confidence threshold of 85+ +- Fast enough for real-time applications +- Good accuracy for the speed +- Effective filtering capability + +### Final Verdict: + +**Winner: qwen2.5-vl-7b** for pure accuracy +**Best Overall: gemma-3-27b** for practical deployment with confidence filtering +**Best Value: gemma-3-4b** for speed-sensitive applications + +--- + +## Technical Notes + +- **Test Dataset:** 194 images with ground truth jersey numbers encoded in filenames +- **Total Expected Jerseys:** 416-462 depending on which images each model processed successfully +- **Evaluation Metrics:** Precision, Recall, F1 Score, Confidence Calibration +- **Hardware:** Testing performed on comparable hardware configurations +- **Prompt:** All models used identical jersey detection prompt with confidence scores + +--- + +*Report generated from comprehensive testing of 8 vision-language models for jersey number detection in sports photography.* diff --git a/docs/LLAMA_SWAP_SETUP.md b/docs/LLAMA_SWAP_SETUP.md new file mode 100644 index 0000000..fc5b787 --- /dev/null +++ b/docs/LLAMA_SWAP_SETUP.md @@ -0,0 +1,237 @@ +# llama-swap Setup Guide for Jersey Detection Testing + +This guide explains how to use [llama-swap](https://github.com/mostlygeek/llama-swap) to automatically switch between different vision language models when testing jersey detection. + +## What is llama-swap? + +llama-swap is a model-swapping proxy that sits between your application and llama.cpp servers. It automatically loads and unloads models based on the `model` parameter in API requests, allowing you to test multiple models without manually restarting servers. + +## Installation + +### Docker (Recommended) + +```bash +# Pull the CUDA image (or cpu, vulkan, intel depending on your hardware) +docker pull ghcr.io/mostlygeek/llama-swap:cuda +``` + +### Homebrew (macOS/Linux) + +```bash +brew tap mostlygeek/llama-swap +brew install llama-swap +``` + +### Pre-built Binaries + +Download from the [releases page](https://github.com/mostlygeek/llama-swap/releases). 
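+
+Once llama-swap is running, model selection is driven entirely by the `model` field of each request. As a quick sanity check, you can send a plain OpenAI-style chat completion through the proxy (a minimal sketch, assuming llama-swap is listening on `localhost:8080` and using one of the model tags from the configuration below):
+
+```bash
+curl http://localhost:8080/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "gemma-3-4b",
+    "messages": [{"role": "user", "content": "Reply with OK"}]
+  }'
+```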
+ +## Configuration + +A configuration file `llama-swap-config.yaml` is provided with 8 pre-configured vision models: + +### Small Models (1-4B parameters) +- `lfm2-vl-1.6b` - LiquidAI LFM2-VL 1.6B (F16) +- `gemma-3-4b` - Gemma 3 4B Instruct (F16) +- `kimi-vl-3b` - Kimi VL A3B Thinking (F16) + +### Medium Models (7-12B parameters) +- `qwen2.5-vl-7b` - Qwen2.5-VL 7B Instruct (F16) +- `gemma-3-12b` - Gemma 3 12B Instruct (F16) + +### Large Models (24-27B parameters) +- `mistral-small-24b-q8` - Mistral Small 3.2 24B (Q8_K_XL) +- `mistral-small-24b-q4` - Mistral Small 3.2 24B (Q4_K_XL) +- `gemma-3-27b` - Gemma 3 27B Instruct (Q8_0) + +## Starting llama-swap + +### Using Docker + +```bash +docker run -it --rm --runtime nvidia -p 8080:8080 \ + -v $(pwd)/llama-swap-config.yaml:/app/config.yaml \ + -v /path/to/hf/cache:/root/.cache/huggingface \ + ghcr.io/mostlygeek/llama-swap:cuda +``` + +### Using Binary + +```bash +llama-swap --config llama-swap-config.yaml --listen localhost:8080 +``` + +## Testing with Jersey Detection Script + +Once llama-swap is running, you can test different models by specifying the `--model-tag` parameter: + +### Test a Single Model + +```bash +# Test Qwen2.5-VL 7B with resizing +python test_jersey_detection.py ./images jersey_prompt.txt \ + --model-tag "qwen2.5-vl-7b" \ + --resize 1024 +``` + +### Test Multiple Models Sequentially + +```bash +# Test small models +python test_jersey_detection.py ./images jersey_prompt.txt --model-tag "lfm2-vl-1.6b" --resize 1024 +python test_jersey_detection.py ./images jersey_prompt.txt --model-tag "gemma-3-4b" --resize 1024 +python test_jersey_detection.py ./images jersey_prompt.txt --model-tag "kimi-vl-3b" --resize 1024 + +# Test medium models +python test_jersey_detection.py ./images jersey_prompt.txt --model-tag "qwen2.5-vl-7b" --resize 1024 +python test_jersey_detection.py ./images jersey_prompt.txt --model-tag "gemma-3-12b" --resize 1024 + +# Test large models +python test_jersey_detection.py ./images jersey_prompt.txt --model-tag "mistral-small-24b-q4" --resize 1024 +python test_jersey_detection.py ./images jersey_prompt.txt --model-tag "gemma-3-27b" --resize 1024 +``` + +### Automated Testing Scripts + +Two bash scripts are provided for automated testing: + +#### 1. Full Test Suite (`test_all_models.sh`) + +Tests **all models** defined in `llama-swap-config.yaml`: + +```bash +# Basic usage (uses defaults) +./test_all_models.sh ./test_images + +# Customize configuration with environment variables +RESIZE=2048 ./test_all_models.sh ./test_images +OUTPUT_FILE=custom_results.jsonl ./test_all_models.sh ./test_images +PROMPT_FILE=custom_prompt.txt ./test_all_models.sh ./test_images + +# Disable resize +RESIZE= ./test_all_models.sh ./test_images +``` + +**Features:** +- Automatically extracts all model tags from YAML config +- Color-coded output with progress tracking +- Confirms before starting tests +- Shows summary with success/failure counts +- Asks to continue if a model fails + +**Default Configuration:** +- Images: `./test_images` +- Prompt: `jersey_prompt_with_confidence.txt` +- Resize: `1024px` +- Output: `jersey_detection_results.jsonl` + +#### 2. 
Quick Test (`test_quick.sh`) + +Tests a **small subset** of models for rapid iteration: + +```bash +# Test default selection (small, medium, large) +./test_quick.sh ./test_images + +# Test custom models +MODELS="lfm2-vl-1.6b qwen2.5-vl-7b" ./test_quick.sh ./test_images + +# Customize settings +RESIZE=512 MODELS="gemma-3-4b" ./test_quick.sh ./test_images +``` + +**Default Models:** +- `lfm2-vl-1.6b` (Small - 1.6B) +- `qwen2.5-vl-7b` (Medium - 7B) +- `mistral-small-24b-q4` (Large - 24B Q4) + +**Use Cases:** +- Quick validation after prompt changes +- Testing configuration adjustments +- Rapid prototyping before full test run + +## Analyzing Results + +After testing multiple models, use the analysis script to compare performance: + +```bash +python analyze_jersey_results.py +``` + +This will show: +- Comparison table of all models tested +- Performance charts with hallucination rates +- Best performers by speed and accuracy +- Confidence distribution (if applicable) + +## Model Swapping Behavior + +llama-swap will: +1. **Automatically load** the requested model when you specify `--model-tag` +2. **Automatically unload** the previous model (if different from current request) +3. **Keep running** if you test the same model multiple times +4. **Monitor** model loading/unloading in the web UI at `http://localhost:8080/ui` + +## Optional: Model Auto-Unloading + +To automatically unload models after 5 minutes of inactivity, uncomment this line in `llama-swap-config.yaml`: + +```yaml +ttl: 300 +``` + +## Optional: Preload Model on Startup + +To preload a specific model when llama-swap starts, uncomment and modify this section: + +```yaml +hooks: + onStartup: + - loadModel: qwen2.5-vl-7b +``` + +## Customizing Models + +To add or modify models, edit `llama-swap-config.yaml`: + +```yaml +models: + my-custom-model: + name: "My Custom Model Description" + cmd: llama-server --no-mmap -ngl 999 -fa on --host 0.0.0.0 --port ${PORT} -hf user/model-name:quantization +``` + +Then test with: + +```bash +python test_jersey_detection.py ./images jersey_prompt.txt --model-tag "my-custom-model" +``` + +## Troubleshooting + +### Model not loading +- Check llama-swap logs at `http://localhost:8080/log` or via `curl http://localhost:8080/log/stream` +- Verify the model name in the config matches the `--model-tag` parameter +- Ensure sufficient GPU memory for the model + +### Connection refused +- Verify llama-swap is running: `curl http://localhost:8080/health` +- Check the server URL matches: default is `http://192.168.1.126:8080` (from scan.ini) + +### Slow model switching +- First load downloads models from HuggingFace (can be slow) +- Subsequent loads are faster (cached locally) +- Use quantized models (Q4, Q8) for faster loading and lower memory usage + +## Web UI + +llama-swap includes a web interface for monitoring: +- **Dashboard**: `http://localhost:8080/ui` - View loaded models and logs +- **Activity**: See recent API requests +- **Logs**: Real-time log monitoring + +## References + +- [llama-swap GitHub](https://github.com/mostlygeek/llama-swap) +- [llama-swap Documentation](https://github.com/mostlygeek/llama-swap/tree/main/docs) +- [llama.cpp Documentation](https://github.com/ggerganov/llama.cpp) diff --git a/jersey_detection_results.jsonl b/jersey_detection_results.jsonl new file mode 100644 index 0000000..c822806 --- /dev/null +++ b/jersey_detection_results.jsonl @@ -0,0 +1,6 @@ +{"timestamp": "2025-10-19T19:30:44.272849", "model_name": "LFM2-VL", "prompt_file": "jersey_prompt_with_confidence.txt", 
"prompt_length": 2134, "total_images": 198, "images_with_jerseys": 88, "images_without_jerseys": 110, "images_with_errors": 0, "total_raw_detections": 470, "total_valid_jerseys": 235, "total_hallucinated": 235, "avg_processing_time": 4.607636096501591, "total_processing_time": 912.3119471073151, "confidence_stats": {"avg": 84.14893617021276, "min": 0, "max": 100, "count": 235, "distribution": {"90-100": 138, "70-89": 70, "50-69": 8, "30-49": 8, "0-29": 11}}, "empty_response_capable": true} +{"timestamp": "2025-10-19T22:10:05.135029", "model_name": "ggml-org_Kimi-VL-A3B-Thinking-2506-GGUF_Kimi-VL-A3B-Thinking-2506-bf16", "prompt_file": "jersey_prompt_with_confidence.txt", "prompt_length": 2134, "total_images": 198, "images_with_jerseys": 28, "images_without_jerseys": 163, "images_with_errors": 7, "total_raw_detections": 49, "total_valid_jerseys": 49, "total_hallucinated": 0, "avg_processing_time": 29.11009831259949, "total_processing_time": 5763.799465894699, "confidence_stats": {"avg": 88.85714285714286, "min": 60, "max": 95, "count": 49, "distribution": {"90-100": 37, "70-89": 9, "50-69": 3, "30-49": 0, "0-29": 0}}, "empty_response_capable": true} +{"timestamp": "2025-10-20T01:20:31.076468", "model_name": "unsloth_Mistral-Small-3.2-24B-Instruct-2506-GGUF_Mistral-Small-3.2-24B-Instruct-2506-BF16", "prompt_file": "jersey_prompt_with_confidence.txt", "prompt_length": 2134, "total_images": 198, "images_with_jerseys": 197, "images_without_jerseys": 1, "images_with_errors": 0, "total_raw_detections": 494, "total_valid_jerseys": 494, "total_hallucinated": 0, "avg_processing_time": 37.221905313356956, "total_processing_time": 7369.937252044678, "confidence_stats": {"avg": 90.81983805668017, "min": 70, "max": 95, "count": 494, "distribution": {"90-100": 362, "70-89": 132, "50-69": 0, "30-49": 0, "0-29": 0}}, "empty_response_capable": true} +{"timestamp": "2025-10-20T12:04:37.833650", "model_name": "unsloth_Mistral-Small-3.2-24B-Instruct-2506-GGUF_Mistral-Small-3.2-24B-Instruct-2506-UD-Q8_K_XL", "prompt_file": "jersey_prompt_with_confidence.txt", "prompt_length": 2134, "total_images": 198, "images_with_jerseys": 197, "images_without_jerseys": 1, "images_with_errors": 0, "total_raw_detections": 496, "total_valid_jerseys": 496, "total_hallucinated": 0, "avg_processing_time": 20.684308366342023, "total_processing_time": 4095.493056535721, "confidence_stats": {"avg": 90.76612903225806, "min": 70, "max": 95, "count": 496, "distribution": {"90-100": 363, "70-89": 133, "50-69": 0, "30-49": 0, "0-29": 0}}, "empty_response_capable": true} +{"timestamp": "2025-10-20T13:01:42.747694", "model_name": "unsloth_Mistral-Small-3.2-24B-Instruct-2506-GGUF_Mistral-Small-3.2-24B-Instruct-2506-UD-Q4_K_XL", "prompt_file": "jersey_prompt_with_confidence.txt", "prompt_length": 2134, "total_images": 198, "images_with_jerseys": 197, "images_without_jerseys": 1, "images_with_errors": 0, "total_raw_detections": 494, "total_valid_jerseys": 494, "total_hallucinated": 0, "avg_processing_time": 14.196594772916852, "total_processing_time": 2810.9257650375366, "confidence_stats": {"avg": 92.09514170040485, "min": 80, "max": 95, "count": 494, "distribution": {"90-100": 415, "70-89": 79, "50-69": 0, "30-49": 0, "0-29": 0}}, "empty_response_capable": true} +{"timestamp": "2025-10-20T15:01:25.669340", "model_name": "unsloth_gemma-3-27b-it-GGUF_gemma-3-27b-it-Q8_0", "prompt_file": "jersey_prompt_with_confidence.txt", "prompt_length": 2134, "total_images": 198, "images_with_jerseys": 185, "images_without_jerseys": 13, 
"images_with_errors": 0, "total_raw_detections": 428, "total_valid_jerseys": 428, "total_hallucinated": 0, "avg_processing_time": 18.127051142731098, "total_processing_time": 3589.1561262607574, "confidence_stats": {"avg": 87.14953271028037, "min": 55, "max": 100, "count": 428, "distribution": {"90-100": 250, "70-89": 166, "50-69": 12, "30-49": 0, "0-29": 0}}, "empty_response_capable": true} diff --git a/jersey_prompt.txt b/jersey_prompt.txt new file mode 100644 index 0000000..27f5f3f --- /dev/null +++ b/jersey_prompt.txt @@ -0,0 +1,43 @@ +You are an expert at detecting sports jerseys in images. Carefully examine the provided image and identify all visible sports jerseys. + +CRITICAL INSTRUCTIONS: +1. ONLY detect jerseys that are CLEARLY VISIBLE in the image +2. ONLY include jersey numbers that you can ACTUALLY READ in the image +3. If you CANNOT see any jerseys, you MUST return {"jerseys": []} +4. DO NOT make up, imagine, or guess jersey numbers that aren't visible +5. DO NOT include jerseys if you cannot clearly see the number + +RESPONSE FORMAT: +Respond ONLY with a valid JSON object. No explanations, no markdown, no extra text. + +Use DOUBLE QUOTES (") for all JSON keys and string values. + +The JSON must have a single key "jerseys" with an array of dictionaries. + +Each dictionary must have exactly these three keys: +- "jersey_number": The number on the jersey (as a string, only if clearly visible) +- "jersey_color": The primary color of the jersey +- "number_color": The color of the number on the jersey + +Example response for an image WITH visible jerseys: +{ + "jerseys": [ + { + "jersey_number": "101", + "jersey_color": "red", + "number_color": "white" + }, + { + "jersey_number": "142", + "jersey_color": "blue", + "number_color": "yellow" + } + ] +} + +Example response for an image WITHOUT jerseys or with unclear numbers: +{"jerseys": []} + +REMEMBER: Only include jerseys with numbers you can ACTUALLY SEE in the image. When in doubt, return empty array. + +Now analyze the image and return the JSON object. \ No newline at end of file diff --git a/jersey_prompt_with_confidence.txt b/jersey_prompt_with_confidence.txt new file mode 100644 index 0000000..0f080f9 --- /dev/null +++ b/jersey_prompt_with_confidence.txt @@ -0,0 +1,53 @@ +You are an expert at detecting sports jerseys in images. Carefully examine the provided image and identify all visible sports jerseys. + +CRITICAL INSTRUCTIONS: +1. ONLY detect jerseys that are CLEARLY VISIBLE in the image +2. ONLY include jersey numbers that you can ACTUALLY READ in the image +3. If you CANNOT see any jerseys, you MUST return {"jerseys": []} +4. DO NOT make up, imagine, or guess jersey numbers that aren't visible +5. DO NOT include jerseys if you cannot clearly see the number + +RESPONSE FORMAT: +Respond ONLY with a valid JSON object. No explanations, no markdown, no extra text. + +Use DOUBLE QUOTES (") for all JSON keys and string values. + +The JSON must have a single key "jerseys" with an array of dictionaries. 
+ +Each dictionary must have exactly these four keys: +- "jersey_number": The number on the jersey (as a string, only if clearly visible) +- "jersey_color": The primary color of the jersey +- "number_color": The color of the number on the jersey +- "confidence": A number from 0 to 100 representing your confidence in this detection (0 = no confidence, 100 = absolutely certain) + +CONFIDENCE SCORING GUIDELINES: +- 90-100: Jersey number is extremely clear and unambiguous +- 70-89: Jersey number is clear but might have minor occlusion or angle issues +- 50-69: Jersey number is partially visible or somewhat unclear +- 30-49: Jersey number is difficult to read but you can make it out +- 0-29: Very uncertain, number is barely visible + +Example response for an image WITH visible jerseys: +{ + "jerseys": [ + { + "jersey_number": "101", + "jersey_color": "red", + "number_color": "white", + "confidence": 95 + }, + { + "jersey_number": "142", + "jersey_color": "blue", + "number_color": "yellow", + "confidence": 78 + } + ] +} + +Example response for an image WITHOUT jerseys or with unclear numbers: +{"jerseys": []} + +REMEMBER: Only include jerseys with numbers you can ACTUALLY SEE in the image. When in doubt, return empty array. Always provide a confidence score that honestly reflects how certain you are about each detection. + +Now analyze the image and return the JSON object. diff --git a/llama-swap-config.yaml b/llama-swap-config.yaml new file mode 100644 index 0000000..956ad59 --- /dev/null +++ b/llama-swap-config.yaml @@ -0,0 +1,59 @@ +# llama-swap configuration for jersey detection testing +# ================================================== +# This configuration allows automatic model switching for testing +# different vision language models with the jersey detection test script. +# +# Usage: +# llama-swap --config llama-swap-config.yaml --listen localhost:8080 +# +# Then use the test script with --model-tag: +# python test_jersey_detection.py ./images jersey_prompt.txt --model-tag "lfm2-vl-1.6b" +# +# llama-swap will automatically load the requested model and swap models +# as needed when you run tests with different --model-tag values. 
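+#
+# Note: ${PORT} in the commands below is a llama-swap macro; it is replaced
+# at launch time with the port llama-swap assigns to the spawned
+# llama-server instance.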
+
+models:
+  # Small vision models (1-4B parameters)
+  lfm2-vl-1.6b:
+    name: "LiquidAI LFM2-VL 1.6B (F16)"
+    cmd: llama-server --no-mmap -ngl 999 -fa on --host 0.0.0.0 --port ${PORT} -hf LiquidAI/LFM2-VL-1.6B-GGUF:F16
+
+  gemma-3-4b:
+    name: "Gemma 3 4B Instruct (F16)"
+    cmd: llama-server --no-mmap -ngl 999 -fa on --host 0.0.0.0 --port ${PORT} -hf unsloth/gemma-3-4b-it-GGUF:F16
+
+  kimi-vl-3b:
+    name: "Kimi VL A3B Thinking (F16)"
+    cmd: llama-server --no-mmap -ngl 999 -fa on --host 0.0.0.0 --port ${PORT} -hf ggml-org/Kimi-VL-A3B-Thinking-2506-GGUF:F16
+
+  # Medium vision models (7-12B parameters)
+  qwen2.5-vl-7b:
+    name: "Qwen2.5-VL 7B Instruct (F16)"
+    cmd: llama-server --no-mmap -ngl 999 -fa on --host 0.0.0.0 --port ${PORT} -hf unsloth/Qwen2.5-VL-7B-Instruct-GGUF:F16
+
+  gemma-3-12b:
+    name: "Gemma 3 12B Instruct (F16)"
+    cmd: llama-server --no-mmap -ngl 999 -fa on --host 0.0.0.0 --port ${PORT} -hf unsloth/gemma-3-12b-it-GGUF:F16
+
+  # Large models (24-27B parameters)
+  mistral-small-24b-q8:
+    name: "Mistral Small 3.2 24B Instruct (Q8_K_XL)"
+    cmd: llama-server --no-mmap -ngl 999 -fa on --host 0.0.0.0 --port ${PORT} -hf unsloth/Mistral-Small-3.2-24B-Instruct-2506-GGUF:Q8_K_XL
+
+  mistral-small-24b-q4:
+    name: "Mistral Small 3.2 24B Instruct (Q4_K_XL)"
+    cmd: llama-server --no-mmap -ngl 999 -fa on --host 0.0.0.0 --port ${PORT} -hf unsloth/Mistral-Small-3.2-24B-Instruct-2506-GGUF:Q4_K_XL
+
+  gemma-3-27b:
+    name: "Gemma 3 27B Instruct (Q8_0)"
+    cmd: llama-server --no-mmap -ngl 999 -fa on --host 0.0.0.0 --port ${PORT} -hf unsloth/gemma-3-27b-it-GGUF:Q8_0
+
+# Optional: Automatically unload models after 5 minutes of inactivity
+# Uncomment to enable
+# ttl: 300
+
+# Optional: Preload a specific model on startup
+# Uncomment to enable
+# hooks:
+#   onStartup:
+#     - loadModel: qwen2.5-vl-7b
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..31d2c0f
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,9 @@
+# Jersey Detection Test Dependencies
+# Install with: pip install -r requirements.txt
+
+# HTTP client for llama.cpp server communication
+requests>=2.28.0
+
+# Image processing
+opencv-python>=4.8.0
+numpy>=1.24.0
diff --git a/scan_utils/__init__.py b/scan_utils/__init__.py
new file mode 100644
index 0000000..79f9d91
--- /dev/null
+++ b/scan_utils/__init__.py
@@ -0,0 +1 @@
+# Jersey detection scan utilities
diff --git a/scan_utils/jersey_detection.py b/scan_utils/jersey_detection.py
new file mode 100644
index 0000000..713bcb0
--- /dev/null
+++ b/scan_utils/jersey_detection.py
@@ -0,0 +1,149 @@
+import json
+import cv2
+import numpy as np
+from typing import Dict, Any, Optional
+import logging
+
+# Read the default jersey detection prompt
+try:
+    with open('jersey_prompt.txt', 'r') as f:
+        DEFAULT_JERSEY_PROMPT = f.read()
+except FileNotFoundError:
+    # Fallback prompt if file is not found
+    DEFAULT_JERSEY_PROMPT = """You are an expert at detecting sports jerseys in images. Carefully examine the provided image and identify all visible sports jerseys.
+
+CRITICAL INSTRUCTIONS:
+1. ONLY detect jerseys that are CLEARLY VISIBLE in the image
+2. ONLY include jersey numbers that you can ACTUALLY READ in the image
+3. If you CANNOT see any jerseys, you MUST return {"jerseys": []}
+4. DO NOT make up, imagine, or guess jersey numbers that aren't visible
+5. DO NOT include jerseys if you cannot clearly see the number
+
+RESPONSE FORMAT:
+Respond ONLY with a valid JSON object. No explanations, no markdown, no extra text.
+ +Use DOUBLE QUOTES (") for all JSON keys and string values. + +The JSON must have a single key "jerseys" with an array of dictionaries. + +Each dictionary must have exactly these three keys: +- "jersey_number": The number on the jersey (as a string, only if clearly visible) +- "jersey_color": The primary color of the jersey +- "number_color": The color of the number on the jersey + +Example response for an image WITH visible jerseys: +{ + "jerseys": [ + { + "jersey_number": "101", + "jersey_color": "red", + "number_color": "white" + } + ] +} + +Example response for an image WITHOUT jerseys or with unclear numbers: +{"jerseys": []} + +REMEMBER: Only include jerseys with numbers you can ACTUALLY SEE in the image. When in doubt, return empty array. + +Now analyze the image and return the JSON object.""" + + +class DetectJerseys: + """A class for detecting sports jerseys using a vision language model.""" + + def __init__(self, llama_cpp_base_url: str = "http://192.168.1.34:8080", logger: Optional[logging.Logger] = None, prompt: Optional[str] = None): + """ + Initialize the jersey detection class. + + Args: + llama_cpp_base_url: Base URL for the llama.cpp server + logger: Logger instance for logging messages + prompt: Custom prompt to use for jersey detection (optional) + """ + self.logger = logger or logging.getLogger(__name__) + self.prompt = prompt or DEFAULT_JERSEY_PROMPT + + # Import here to avoid circular dependencies + try: + from scan_utils.llama_cpp_client import LlamaCppClient + self.client = LlamaCppClient(base_url=llama_cpp_base_url) + self.logger.info(f"Jersey detection initialized with llama.cpp server at {llama_cpp_base_url}") + except ImportError as e: + self.logger.error(f"Failed to import LlamaCppClient: {e}") + raise + + def detect(self, image: np.ndarray, temperature: float = 0.1) -> Dict[str, Any]: + """ + Detect jerseys in an image using the vision language model. + + Args: + image: OpenCV image (numpy array) to analyze + temperature: Temperature value for the model (default: 0.1) + + Returns: + Dictionary containing detected jerseys or empty dict if invalid + """ + try: + # Create multimodal message with image and prompt + message = self.client.create_multimodal_message( + role="user", + content=self.prompt, + images=[image] + ) + + # Send chat completion request + response = self.client.chat_completion( + messages=[message], + temperature=temperature, + max_tokens=1000 + ) + + # Extract the response text + if 'choices' in response and len(response['choices']) > 0: + response_text = response['choices'][0]['message']['content'] + + # Log the raw response for debugging + self.logger.debug(f"Raw VLM response: {response_text}") + + # Parse JSON response + try: + result = json.loads(response_text) + + # Process jerseys to ensure they have all required fields + jerseys = result.get('jerseys', []) + + # Hallucination detection: filter out example numbers from the prompt + # Using numbers > 100 as examples to avoid filtering valid jersey numbers + HALLUCINATION_NUMBERS = {'101', '102', '103', '142', '199'} + + processed_jerseys = [] + for jersey in jerseys: + jersey_number = jersey.get('jersey_number', '') + + # Check for hallucination (model returning example numbers) + if jersey_number in HALLUCINATION_NUMBERS: + self.logger.warning(f"Possible hallucination detected - jersey number {jersey_number} matches example pattern. 
Filtering out.") + continue + + # Ensure all required fields are present + processed_jersey = { + 'jersey_number': jersey_number, + 'jersey_color': jersey.get('jersey_color', ''), + 'number_color': jersey.get('number_color', 'unknown') # Default to 'unknown' if missing + } + processed_jerseys.append(processed_jersey) + + return {"jerseys": processed_jerseys} + except json.JSONDecodeError as e: + self.logger.error(f"Failed to parse JSON response: {e}") + self.logger.debug(f"Response text was: {response_text}") + return {"jerseys": []} + else: + self.logger.warning("Empty response from VLM") + return {"jerseys": []} + + except Exception as e: + self.logger.error(f"Error during jersey detection: {e}") + return {"jerseys": []} \ No newline at end of file diff --git a/scan_utils/llama_cpp_client.py b/scan_utils/llama_cpp_client.py new file mode 100644 index 0000000..da5d9f5 --- /dev/null +++ b/scan_utils/llama_cpp_client.py @@ -0,0 +1,237 @@ +import base64 +import json +import cv2 +import numpy as np +import requests +from typing import List, Dict, Any, Optional, Union + + +class LlamaCppClient: + """A Python client for interacting with a llama.cpp server.""" + + def __init__(self, base_url: str = "http://192.168.1.34:8080"): + """ + Initialize the client with the base URL of the llama.cpp server. + + Args: + base_url: The base URL of the llama.cpp server (default: http://192.168.1.34:8080) + """ + self.base_url = base_url.rstrip('/') + + def health_check(self) -> Dict[str, Any]: + """ + Check the health status of the server. + + Returns: + Health status response from the server + """ + response = requests.get(f"{self.base_url}/health") + response.raise_for_status() + return response.json() + + def get_models(self) -> Dict[str, Any]: + """ + Get information about loaded models. + + Returns: + Model information from the server + """ + response = requests.get(f"{self.base_url}/v1/models") + response.raise_for_status() + return response.json() + + def chat_completion( + self, + messages: List[Dict[str, Any]], + temperature: float = 0.1, + min_p: float = 0.15, + repetition_penalty: float = 1.05, + min_image_tokens: int = 64, + max_image_tokens: int = 256, + do_image_splitting: bool = True, + max_tokens: int = -1, + stream: bool = False, + **kwargs + ) -> Union[Dict[str, Any], requests.Response]: + """ + Generate a chat completion using the OpenAI-compatible API. 
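+
+        Any extra keyword arguments are merged directly into the request
+        payload, so the llama.cpp-specific options here (min_p,
+        repetition_penalty, the image token limits) ride along with the
+        standard OpenAI fields; a server that does not recognize a field
+        will generally just ignore it.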
+ + Args: + messages: List of message dictionaries with role and content + temperature: Sampling temperature (default: 0.1) + min_p: Minimum probability for sampling (default: 0.15) + repetition_penalty: Repetition penalty factor (default: 1.05) + min_image_tokens: Minimum image tokens (default: 64) + max_image_tokens: Maximum image tokens (default: 256) + do_image_splitting: Whether to split images (default: True) + max_tokens: Maximum tokens to generate (default: -1 for infinity) + stream: Whether to stream the response (default: False) + **kwargs: Additional parameters for the completion + + Returns: + Completion response or streaming response + """ + payload = { + "messages": messages, + "temperature": temperature, + "min_p": min_p, + "repetition_penalty": repetition_penalty, + "min_image_tokens": min_image_tokens, + "max_image_tokens": max_image_tokens, + "do_image_splitting": do_image_splitting, + "max_tokens": max_tokens, + "cache_prompt": True, + "stream": stream, + **kwargs + } + + # Debug: Show model parameter if present (for llama-swap debugging) + if 'model' in payload and payload['model']: + import os + if os.environ.get('DEBUG_LLAMA_SWAP'): + print(f"[DEBUG] Requesting model: {payload['model']}") + + response = requests.post( + f"{self.base_url}/v1/chat/completions", + headers={"Content-Type": "application/json"}, + json=payload, + stream=stream + ) + response.raise_for_status() + + if stream: + return response + + return response.json() + + def completion( + self, + prompt: Union[str, List[Union[str, int]]], + temperature: float = 0.1, + min_p: float = 0.15, + repetition_penalty: float = 1.05, + min_image_tokens: int = 64, + max_image_tokens: int = 256, + do_image_splitting: bool = True, + max_tokens: int = -1, + stream: bool = False, + **kwargs + ) -> Union[Dict[str, Any], requests.Response]: + """ + Generate a completion using the non-OAI compatible API. + + Args: + prompt: The prompt string or list of tokens + temperature: Sampling temperature (default: 0.1) + min_p: Minimum probability for sampling (default: 0.15) + repetition_penalty: Repetition penalty factor (default: 1.05) + min_image_tokens: Minimum image tokens (default: 64) + max_image_tokens: Maximum image tokens (default: 256) + do_image_splitting: Whether to split images (default: True) + max_tokens: Maximum tokens to generate (default: -1 for infinity) + stream: Whether to stream the response (default: False) + **kwargs: Additional parameters for the completion + + Returns: + Completion response or streaming response + """ + payload = { + "prompt": prompt, + "temperature": temperature, + "min_p": min_p, + "repeat_penalty": repetition_penalty, + "min_image_tokens": min_image_tokens, + "max_image_tokens": max_image_tokens, + "do_image_splitting": do_image_splitting, + "cache_prompt": True, + "n_predict": max_tokens, + "stream": stream, + **kwargs + } + + response = requests.post( + f"{self.base_url}/completion", + headers={"Content-Type": "application/json"}, + json=payload, + stream=stream + ) + response.raise_for_status() + + if stream: + return response + + return response.json() + + @staticmethod + def _encode_image_to_base64(image_path: str) -> str: + """ + Encode an image file to base64 string. 
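+
+        Returns the raw base64 payload only (no data: URL prefix); the
+        create_multimodal_message helper below wraps it into a
+        data:image/jpeg;base64,... URL for the chat API.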
+ + Args: + image_path: Path to the image file + + Returns: + Base64 encoded image string + """ + with open(image_path, "rb") as image_file: + return base64.b64encode(image_file.read()).decode('utf-8') + + @staticmethod + def _encode_cv2_image_to_base64(image: np.ndarray) -> str: + """ + Encode an OpenCV image to base64 string. + + Args: + image: OpenCV image (numpy array) + + Returns: + Base64 encoded image string + """ + _, buffer = cv2.imencode('.jpg', image) + return base64.b64encode(buffer).decode('utf-8') + + def create_multimodal_message( + self, + role: str, + content: str, + images: Optional[List[Union[str, np.ndarray]]] = None + ) -> Dict[str, Any]: + """ + Create a multimodal message with text and images. + + Args: + role: Role of the message (system, user, assistant) + content: Text content of the message + images: List of image paths or OpenCV images (numpy arrays) + + Returns: + Formatted message dictionary + """ + if not images: + return {"role": role, "content": content} + + # Process images + image_data = [] + for img in images: + if isinstance(img, str): + # Image path + encoded_image = self._encode_image_to_base64(img) + else: + # OpenCV image + encoded_image = self._encode_cv2_image_to_base64(img) + image_data.append(encoded_image) + + # Create multimodal content + multimodal_content = [ + {"type": "text", "text": content} + ] + + for img_data in image_data: + multimodal_content.append({ + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{img_data}" + } + }) + + return {"role": role, "content": multimodal_content} \ No newline at end of file diff --git a/test_all_models.sh b/test_all_models.sh new file mode 100755 index 0000000..7836177 --- /dev/null +++ b/test_all_models.sh @@ -0,0 +1,263 @@ +#!/bin/bash +# ============================================================================== +# Test All Models Script for Jersey Detection +# ============================================================================== +# This script automatically tests all models defined in llama-swap-config.yaml +# with the jersey detection test suite. 
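+#
+# For every model tag found in the config, the test loop below effectively
+# runs the following (illustrative expansion using the default settings):
+#
+#   python test_jersey_detection.py ./test_images jersey_prompt_with_confidence.txt \
+#     --model-tag "<tag>" --output-file jersey_detection_results.jsonl \
+#     --server-url http://localhost:8080 --resize 1024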
+#
+# Usage:
+#   ./test_all_models.sh
+#   ./test_all_models.sh /path/to/images
+#   RESIZE=2048 ./test_all_models.sh
+#   OUTPUT_FILE=custom_results.jsonl ./test_all_models.sh
+# ==============================================================================
+
+# Note: We don't use 'set -e' here because we have explicit error handling
+# in the test loop and want to give the user the option to continue on failures
+
+# ==============================================================================
+# Configuration Variables
+# ==============================================================================
+
+# Image directory containing test images
+IMAGES_DIR="${1:-./test_images}"
+
+# Prompt file to use for testing
+PROMPT_FILE="${PROMPT_FILE:-jersey_prompt_with_confidence.txt}"
+
+# Resize images to this max dimension (set to empty string to disable)
+RESIZE="${RESIZE:-1024}"
+
+# Output file for results
+OUTPUT_FILE="${OUTPUT_FILE:-jersey_detection_results.jsonl}"
+
+# llama-swap configuration file
+LLAMA_SWAP_CONFIG="${LLAMA_SWAP_CONFIG:-llama-swap-config.yaml}"
+
+# Server URL
+SERVER_URL="${SERVER_URL:-http://localhost:8080}"
+
+# ==============================================================================
+# Color codes for output
+# ==============================================================================
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+CYAN='\033[0;36m'
+NC='\033[0m' # No Color
+
+# ==============================================================================
+# Helper Functions
+# ==============================================================================
+
+print_header() {
+    echo -e "${CYAN}============================================================================${NC}"
+    echo -e "${CYAN}$1${NC}"
+    echo -e "${CYAN}============================================================================${NC}"
+}
+
+print_info() {
+    echo -e "${BLUE}[INFO]${NC} $1"
+}
+
+print_success() {
+    echo -e "${GREEN}[SUCCESS]${NC} $1"
+}
+
+print_error() {
+    echo -e "${RED}[ERROR]${NC} $1"
+}
+
+print_warning() {
+    echo -e "${YELLOW}[WARNING]${NC} $1"
+}
+
+# ==============================================================================
+# Validation
+# ==============================================================================
+
+print_header "Jersey Detection - Test All Models"
+
+# Check if images directory exists
+if [ ! -d "$IMAGES_DIR" ]; then
+    print_error "Image directory not found: $IMAGES_DIR"
+    echo "Usage: $0 <images_dir>"
+    exit 1
+fi
+
+# Check if prompt file exists
+if [ ! -f "$PROMPT_FILE" ]; then
+    print_error "Prompt file not found: $PROMPT_FILE"
+    exit 1
+fi
+
+# Check if llama-swap config exists
+if [ ! -f "$LLAMA_SWAP_CONFIG" ]; then
+    print_error "llama-swap config not found: $LLAMA_SWAP_CONFIG"
+    exit 1
+fi
+
+# Check if test script exists
+if [ ! -f "test_jersey_detection.py" ]; then
+    print_error "test_jersey_detection.py not found in current directory"
+    exit 1
+fi
+
+# Check if server is running
+print_info "Checking if llama-swap server is running at $SERVER_URL..."
+if ! curl -s "$SERVER_URL/health" > /dev/null 2>&1; then
+    print_error "Cannot connect to llama-swap at $SERVER_URL"
+    echo ""
+    echo "Please start llama-swap first:"
+    echo "  llama-swap --config $LLAMA_SWAP_CONFIG --listen localhost:8080"
+    echo ""
+    exit 1
+fi
+print_success "Server is running"
+
+# ==============================================================================
+# Extract model tags from YAML
+# ==============================================================================
+
+print_info "Extracting model tags from $LLAMA_SWAP_CONFIG..."
+
+# Extract model IDs (keys under 'models:')
+# This uses grep and sed to parse the YAML (simple parser, works for our format)
+MODEL_TAGS=$(grep "^  [a-z]" "$LLAMA_SWAP_CONFIG" | \
+    grep -v "    " | \
+    sed 's/:.*//' | \
+    sed 's/^  //')
+
+if [ -z "$MODEL_TAGS" ]; then
+    print_error "No model tags found in $LLAMA_SWAP_CONFIG"
+    exit 1
+fi
+
+# Convert to array
+readarray -t MODELS <<< "$MODEL_TAGS"
+
+MODEL_COUNT=${#MODELS[@]}
+print_success "Found $MODEL_COUNT models to test"
+
+# ==============================================================================
+# Display Configuration
+# ==============================================================================
+
+echo ""
+print_info "Test Configuration:"
+echo "  Images directory: $IMAGES_DIR"
+echo "  Prompt file:      $PROMPT_FILE"
+echo "  Resize:           ${RESIZE:-Disabled}"
+echo "  Output file:      $OUTPUT_FILE"
+echo "  Server URL:       $SERVER_URL"
+echo "  Models to test:   $MODEL_COUNT"
+echo ""
+
+# List all models
+print_info "Models:"
+for i in "${!MODELS[@]}"; do
+    echo "  $((i+1)). ${MODELS[$i]}"
+done
+echo ""
+
+# ==============================================================================
+# Confirmation
+# ==============================================================================
+
+read -p "Continue with testing? (y/N) " -n 1 -r
+echo
+if [[ ! $REPLY =~ ^[Yy]$ ]]; then
+    print_warning "Testing cancelled"
+    exit 0
+fi
+
+# ==============================================================================
+# Run Tests
+# ==============================================================================
+
+print_header "Starting Tests"
+
+START_TIME=$(date +%s)
+SUCCESSFUL=0
+FAILED=0
+
+for i in "${!MODELS[@]}"; do
+    MODEL="${MODELS[$i]}"
+    MODEL_NUM=$((i+1))
+
+    echo ""
+    print_header "Testing Model $MODEL_NUM/$MODEL_COUNT: $MODEL"
+
+    # Build command
+    CMD="python test_jersey_detection.py \"$IMAGES_DIR\" \"$PROMPT_FILE\""
+    CMD="$CMD --model-tag \"$MODEL\""
+    CMD="$CMD --output-file \"$OUTPUT_FILE\""
+    CMD="$CMD --server-url \"$SERVER_URL\""
+
+    # Add resize if configured
+    if [ -n "$RESIZE" ]; then
+        CMD="$CMD --resize $RESIZE"
+    fi
+
+    print_info "Running: $CMD"
+    echo ""
+
+    # Run the test
+    if eval "$CMD"; then
+        print_success "Model $MODEL completed successfully"
+        SUCCESSFUL=$((SUCCESSFUL + 1))
+    else
+        print_error "Model $MODEL failed"
+        FAILED=$((FAILED + 1))
+
+        # Ask if user wants to continue
+        echo ""
+        read -p "Continue with remaining models? (Y/n) " -n 1 -r
+        echo
+        if [[ $REPLY =~ ^[Nn]$ ]]; then
+            print_warning "Testing stopped by user"
+            break
+        fi
+    fi
+
+    # Show progress
+    if [ $MODEL_NUM -lt $MODEL_COUNT ]; then
+        print_info "Progress: $MODEL_NUM/$MODEL_COUNT models completed"
+    fi
+done
+
+# ==============================================================================
+# Summary
+# ==============================================================================
+
+END_TIME=$(date +%s)
+DURATION=$((END_TIME - START_TIME))
+MINUTES=$((DURATION / 60))
+SECONDS=$((DURATION % 60))
+
+echo ""
+print_header "Testing Complete"
+echo ""
+print_info "Summary:"
+echo "  Total models: $MODEL_COUNT"
+echo "  Successful:   $SUCCESSFUL"
+echo "  Failed:       $FAILED"
+echo "  Total time:   ${MINUTES}m ${SECONDS}s"
+echo ""
+
+if [ $SUCCESSFUL -gt 0 ]; then
+    print_success "Results saved to: $OUTPUT_FILE"
+    echo ""
+    print_info "Analyze results with:"
+    echo "  python analyze_jersey_results.py $OUTPUT_FILE"
+fi
+
+echo ""
+
+# Exit with error code if any tests failed
+if [ $FAILED -gt 0 ]; then
+    exit 1
+fi
+
+exit 0
diff --git a/test_jersey_detection.py b/test_jersey_detection.py
new file mode 100755
index 0000000..36d9894
--- /dev/null
+++ b/test_jersey_detection.py
@@ -0,0 +1,971 @@
+#!/usr/bin/env python3
+"""
+Test script for evaluating jersey detection performance with different models and prompts.
+
+Usage:
+    python test_jersey_detection.py <image_directory> <prompt_file> [options]
+
+Arguments:
+    image_directory: Path to directory containing test images
+    prompt_file: Path to text file containing the prompt to use
+    --model-name: Name of the model being tested (optional, auto-detected from server if not provided)
+    --model-tag: Model tag for llama-swap integration (optional)
+    --server-url: Optional llama.cpp server URL (default: read from scan.ini)
+    --output-file: Output file for results (default: jersey_detection_results.jsonl)
+    --resize: Maximum image dimension for resizing before processing
+
+Ground Truth:
+    Expected jersey numbers are parsed from filenames using dash-separated format:
+    Example: 1122-8-10-29.jpg expects jerseys 8, 10, and 29
+
+    The script calculates precision, recall, F1 score, and confidence calibration metrics
+    to evaluate model accuracy against known correct results.
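+
+Metrics:
+    For each image, detections are scored against the filename ground truth:
+    precision = TP / (TP + FP), recall = TP / (TP + FN), and
+    F1 = 2 * precision * recall / (precision + recall).
+    The same formulas are also applied to the pooled counts across the run.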
+
+Output Files:
+    <output_file>: Summary statistics with ground truth metrics (default: jersey_detection_results.jsonl)
+
+Example:
+    # Auto-detect model name from server
+    python test_jersey_detection.py ./images jersey_prompt.txt
+
+    # Resize images to 1024px max dimension before processing
+    python test_jersey_detection.py ./images jersey_prompt.txt --resize 1024
+
+    # Use llama-swap to automatically load a specific model
+    python test_jersey_detection.py ./images jersey_prompt.txt --model-tag "qwen2.5-vl-7b" --resize 1024
+
+    # Specify custom model name (for tracking in results)
+    python test_jersey_detection.py ./images jersey_prompt.txt --model-name "llama-3.2-vision"
+    python test_jersey_detection.py ./images jersey_prompt_with_confidence.txt --model-name "qwen2-vl" --resize 1024
+
+After running tests, analyze results with:
+    python analyze_jersey_results.py  # Performance and accuracy analysis
+"""
+
+import argparse
+import configparser
+import json
+import os
+import re
+import requests
+import sys
+import time
+from datetime import datetime
+from pathlib import Path
+from typing import Dict, List, Any, Optional
+import cv2
+
+# Add parent directory to path for imports
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+
+from scan_utils.llama_cpp_client import LlamaCppClient
+
+
+# Hallucination detection: filter out example numbers from prompts
+# Using numbers > 100 as examples to avoid filtering valid jersey numbers
+HALLUCINATION_NUMBERS = {'101', '102', '103', '142', '199'}
+
+
+def parse_expected_jerseys(filename: str) -> List[str]:
+    """
+    Parse expected jersey numbers from filename.
+
+    Format: prefix-number1-number2-number3.ext
+    Example: 1122-8-10-29.jpg -> ['8', '10', '29']
+
+    Args:
+        filename: Image filename
+
+    Returns:
+        List of expected jersey numbers as strings
+    """
+    # Remove extension
+    name_without_ext = Path(filename).stem
+
+    # Split by dash
+    parts = name_without_ext.split('-')
+
+    # First part is typically a prefix/identifier, rest are jersey numbers
+    # Skip the first part and collect numeric parts
+    expected = []
+    for i, part in enumerate(parts[1:], 1):  # Skip first part
+        # Check if part is numeric (jersey number)
+        if part.isdigit():
+            expected.append(part)
+
+    return expected
+
+
+def clean_response(text: str) -> str:
+    """
+    Clean the response by removing think tags and markdown code blocks.
+    Some models use <think> tags for chain-of-thought reasoning and wrap JSON in markdown.
+
+    Args:
+        text: Raw response text
+
+    Returns:
+        Cleaned text ready for JSON parsing
+    """
+    # Remove <think>...</think> tags and their content (standard angle brackets)
+    cleaned = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL | re.IGNORECASE)
+    # Remove ◁think▷...◁/think▷ tags (unicode triangle brackets)
+    cleaned = re.sub(r'◁think▷.*?◁/think▷', '', cleaned, flags=re.DOTALL | re.IGNORECASE)
+    # Also remove any standalone think tags (both formats)
+    cleaned = re.sub(r'</?think>', '', cleaned, flags=re.IGNORECASE)
+    cleaned = re.sub(r'◁/?think▷', '', cleaned, flags=re.IGNORECASE)
+
+    # Remove markdown code blocks (```json ... ``` or ``` ... ```)
+    # First try to extract content from ```json blocks
+    json_block_match = re.search(r'```(?:json)?\s*\n?(.*?)\n?```', cleaned, flags=re.DOTALL | re.IGNORECASE)
+    if json_block_match:
+        # Extract just the content inside the code block
+        cleaned = json_block_match.group(1)
+    else:
+        # If no code block, just remove any stray ``` markers
+        cleaned = re.sub(r'```(?:json)?', '', cleaned, flags=re.IGNORECASE)
+
+    return cleaned.strip()
+
+
+def get_llama_server_url_from_config() -> Optional[str]:
+    """
+    Read the LLAMA_CPP_SERVER_URL from scan.ini.
+
+    Returns:
+        Server URL from config or None if not found
+    """
+    config_path = os.path.join(os.path.dirname(__file__), 'scan.ini')
+
+    if not os.path.exists(config_path):
+        return None
+
+    try:
+        config = configparser.ConfigParser()
+        config.read(config_path)
+
+        if 'DEFAULT' in config and 'LLAMA_CPP_SERVER_URL' in config['DEFAULT']:
+            return config['DEFAULT']['LLAMA_CPP_SERVER_URL']
+    except Exception as e:
+        print(f"Warning: Failed to read scan.ini: {e}")
+
+    return None
+
+
+class JerseyDetectionTester:
+    """Test runner for jersey detection evaluation."""
+
+    def __init__(self, server_url: str, prompt: str, model_name: Optional[str] = None, resize_max: Optional[int] = None, model_tag: Optional[str] = None):
+        """
+        Initialize the tester.
+
+        Args:
+            server_url: Base URL for the llama.cpp server
+            prompt: Prompt text to use for detection
+            model_name: Name of the model being tested (optional)
+            resize_max: Maximum image dimension (resize if larger, None = no resize)
+            model_tag: Model tag for llama-swap integration (optional)
+        """
+        self.client = LlamaCppClient(base_url=server_url)
+        self.prompt = prompt
+        self.model_name = model_name or "unknown"
+        self.resize_max = resize_max
+        self.model_tag = model_tag
+        self.results = []
+
+    def test_image(self, image_path: str) -> Dict[str, Any]:
+        """
+        Test jersey detection on a single image.
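+
+        The per-image flow: load with cv2, optionally downscale to
+        resize_max (preserving aspect ratio), send the prompt and image to
+        the server, strip think tags and markdown from the reply, parse the
+        JSON, drop hallucinated example numbers, and score the detections
+        against the filename ground truth.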
+ + Args: + image_path: Path to the image file + + Returns: + Dictionary containing test results for this image + """ + start_time = time.time() + + # Load image + image = cv2.imread(image_path) + if image is None: + filename = Path(image_path).name + expected_jerseys = parse_expected_jerseys(filename) + return { + 'image_path': image_path, + 'error': 'Failed to load image', + 'jerseys': [], + 'processing_time': 0, + 'resized': False, + 'original_size': None, + 'final_size': None, + 'expected_jerseys': expected_jerseys, + 'detected_jerseys': [], + 'true_positives': [], + 'false_positives': [], + 'false_negatives': expected_jerseys, + 'precision': 0.0, + 'recall': 0.0, + 'f1_score': 0.0, + 'avg_confidence_correct': None, + 'avg_confidence_incorrect': None, + 'confidence_correct_count': 0, + 'confidence_incorrect_count': 0 + } + + # Track original size + original_height, original_width = image.shape[:2] + original_size = (original_width, original_height) + resized = False + + # Resize if needed + if self.resize_max and (original_width > self.resize_max or original_height > self.resize_max): + # Calculate new dimensions maintaining aspect ratio + if original_width > original_height: + new_width = self.resize_max + new_height = int(original_height * (self.resize_max / original_width)) + else: + new_height = self.resize_max + new_width = int(original_width * (self.resize_max / original_height)) + + # Resize image + image = cv2.resize(image, (new_width, new_height), interpolation=cv2.INTER_AREA) + resized = True + + final_height, final_width = image.shape[:2] + final_size = (final_width, final_height) + + # Create multimodal message + message = self.client.create_multimodal_message( + role="user", + content=self.prompt, + images=[image] + ) + + # Send to LLM + try: + # Prepare kwargs for chat completion + completion_kwargs = { + 'messages': [message], + 'temperature': 0.1, + 'max_tokens': 1000 + } + + # Add model parameter if model_tag is specified (for llama-swap) + if self.model_tag: + completion_kwargs['model'] = self.model_tag + # Note: We don't print this for every image to avoid spam, but it's being sent + + response = self.client.chat_completion(**completion_kwargs) + + processing_time = time.time() - start_time + + # Extract response text + if 'choices' in response and len(response['choices']) > 0: + response_text = response['choices'][0]['message']['content'] + + # Clean response (remove think tags and markdown code blocks) + cleaned_text = clean_response(response_text) + + # Parse JSON response + try: + result = json.loads(cleaned_text) + jerseys = result.get('jerseys', []) + + # Apply hallucination detection + filtered_jerseys = [] + hallucinated_count = 0 + + for jersey in jerseys: + jersey_number = jersey.get('jersey_number', '') + + # Check for hallucination (model returning example numbers) + if jersey_number in HALLUCINATION_NUMBERS: + hallucinated_count += 1 + continue + + filtered_jerseys.append(jersey) + + # Ground truth comparison + filename = Path(image_path).name + expected_jerseys = set(parse_expected_jerseys(filename)) + detected_jerseys = set(jersey.get('jersey_number', '') for jersey in filtered_jerseys if jersey.get('jersey_number', '')) + + # Calculate ground truth metrics + true_positives = expected_jerseys & detected_jerseys # Correctly detected + false_positives = detected_jerseys - expected_jerseys # Detected but not expected + false_negatives = expected_jerseys - detected_jerseys # Expected but not detected + + # Calculate precision, recall, F1 + tp_count = 
len(true_positives) + fp_count = len(false_positives) + fn_count = len(false_negatives) + + precision = tp_count / (tp_count + fp_count) if (tp_count + fp_count) > 0 else 0.0 + recall = tp_count / (tp_count + fn_count) if (tp_count + fn_count) > 0 else 0.0 + f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0 + + # Handle edge case: if no expected jerseys, precision is 1.0 if no detections, else 0.0 + if len(expected_jerseys) == 0: + precision = 1.0 if len(detected_jerseys) == 0 else 0.0 + recall = 1.0 # No jerseys to detect + f1_score = 1.0 if len(detected_jerseys) == 0 else 0.0 + + # Calculate confidence scores for correct vs incorrect detections + confidence_correct = [] # Confidence for true positives + confidence_incorrect = [] # Confidence for false positives + + for jersey in filtered_jerseys: + jersey_number = jersey.get('jersey_number', '') + confidence = jersey.get('confidence') + + if confidence is not None: + if jersey_number in true_positives: + confidence_correct.append(confidence) + elif jersey_number in false_positives: + confidence_incorrect.append(confidence) + + avg_confidence_correct = sum(confidence_correct) / len(confidence_correct) if confidence_correct else None + avg_confidence_incorrect = sum(confidence_incorrect) / len(confidence_incorrect) if confidence_incorrect else None + + return { + 'image_path': image_path, + 'jerseys': filtered_jerseys, + 'hallucinated_count': hallucinated_count, + 'raw_response': cleaned_text, + 'processing_time': processing_time, + 'error': None, + 'resized': resized, + 'original_size': original_size, + 'final_size': final_size, + # Ground truth metrics + 'expected_jerseys': sorted(expected_jerseys), + 'detected_jerseys': sorted(detected_jerseys), + 'true_positives': sorted(true_positives), + 'false_positives': sorted(false_positives), + 'false_negatives': sorted(false_negatives), + 'precision': precision, + 'recall': recall, + 'f1_score': f1_score, + # Confidence calibration metrics + 'avg_confidence_correct': avg_confidence_correct, + 'avg_confidence_incorrect': avg_confidence_incorrect, + 'confidence_correct_count': len(confidence_correct), + 'confidence_incorrect_count': len(confidence_incorrect) + } + except json.JSONDecodeError as e: + filename = Path(image_path).name + expected_jerseys = parse_expected_jerseys(filename) + return { + 'image_path': image_path, + 'error': f'JSON parse error: {e}', + 'raw_response': cleaned_text, + 'original_response': response_text if cleaned_text != response_text else None, + 'jerseys': [], + 'processing_time': processing_time, + 'resized': resized, + 'original_size': original_size, + 'final_size': final_size, + 'expected_jerseys': expected_jerseys, + 'detected_jerseys': [], + 'true_positives': [], + 'false_positives': [], + 'false_negatives': expected_jerseys, + 'precision': 0.0, + 'recall': 0.0, + 'f1_score': 0.0 + } + else: + filename = Path(image_path).name + expected_jerseys = parse_expected_jerseys(filename) + return { + 'image_path': image_path, + 'error': 'Empty response from model', + 'jerseys': [], + 'processing_time': processing_time, + 'resized': resized, + 'original_size': original_size, + 'final_size': final_size, + 'expected_jerseys': expected_jerseys, + 'detected_jerseys': [], + 'true_positives': [], + 'false_positives': [], + 'false_negatives': expected_jerseys, + 'precision': 0.0, + 'recall': 0.0, + 'f1_score': 0.0 + } + + except Exception as e: + processing_time = time.time() - start_time + filename = Path(image_path).name + 
expected_jerseys = parse_expected_jerseys(filename) + return { + 'image_path': image_path, + 'error': f'Request error: {e}', + 'jerseys': [], + 'processing_time': processing_time, + 'resized': resized, + 'original_size': original_size, + 'final_size': final_size, + 'expected_jerseys': expected_jerseys, + 'detected_jerseys': [], + 'true_positives': [], + 'false_positives': [], + 'false_negatives': expected_jerseys, + 'precision': 0.0, + 'recall': 0.0, + 'f1_score': 0.0, + 'avg_confidence_correct': None, + 'avg_confidence_incorrect': None, + 'confidence_correct_count': 0, + 'confidence_incorrect_count': 0 + } + + def test_directory(self, directory_path: str) -> List[Dict[str, Any]]: + """ + Test all images in a directory. + + Args: + directory_path: Path to directory containing images + + Returns: + List of results for all images + """ + # Get all image files + image_extensions = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp'} + image_files = [] + + for ext in image_extensions: + image_files.extend(Path(directory_path).glob(f'*{ext}')) + image_files.extend(Path(directory_path).glob(f'*{ext.upper()}')) + + image_files = sorted(image_files) + + if not image_files: + print(f"No image files found in {directory_path}") + return [] + + print(f"Found {len(image_files)} images to process\n") + + # Process each image + results = [] + for i, image_path in enumerate(image_files, 1): + # Show model tag in progress if using llama-swap + model_info = f" [{self.model_tag}]" if self.model_tag else "" + print(f"[{i}/{len(image_files)}]{model_info} Processing {image_path.name}...") + result = self.test_image(str(image_path)) + results.append(result) + + # Display result + self._display_result(result) + print() + + return results + + def _display_result(self, result: Dict[str, Any]): + """Display the result for a single image.""" + if result.get('error'): + print(f" ❌ Error: {result['error']}") + if 'raw_response' in result: + print(f" Cleaned response: {result['raw_response']}...") + if result.get('original_response'): + print(f" (Think tags and/or markdown were filtered from response)") + else: + jerseys = result.get('jerseys', []) + hallucinated_count = result.get('hallucinated_count', 0) + + if jerseys: + print(f" ✓ Found {len(jerseys)} jersey(s):") + for jersey in jerseys: + number = jersey.get('jersey_number', 'N/A') + jersey_color = jersey.get('jersey_color', 'N/A') + number_color = jersey.get('number_color', 'N/A') + confidence = jersey.get('confidence', None) + + conf_str = f" (confidence: {confidence})" if confidence is not None else "" + print(f" - #{number}: {jersey_color} jersey, {number_color} number{conf_str}") + else: + print(f" ○ No jerseys detected") + + if hallucinated_count > 0: + print(f" ⚠ Filtered {hallucinated_count} hallucinated detection(s)") + + # Display ground truth comparison + expected = result.get('expected_jerseys', []) + detected = result.get('detected_jerseys', []) + true_positives = result.get('true_positives', []) + false_positives = result.get('false_positives', []) + false_negatives = result.get('false_negatives', []) + + if expected: + print(f" Ground truth: Expected {expected}, Detected {detected}") + if true_positives: + print(f" ✓ Correct: {true_positives}") + if false_positives: + print(f" ✗ False positives: {false_positives}") + if false_negatives: + print(f" ✗ Missed: {false_negatives}") + precision = result.get('precision', 0.0) + recall = result.get('recall', 0.0) + f1 = result.get('f1_score', 0.0) + print(f" Precision: {precision:.2%}, Recall: {recall:.2%}, 
F1: {f1:.2%}") + + print(f" Processing time: {result['processing_time']:.2f}s") + + + def save_results_to_file(self, results: List[Dict[str, Any]], prompt_file: str, output_file: str = "jersey_detection_results.jsonl"): + """ + Save test results to a JSON Lines file for later analysis. + + Args: + results: List of all test results + prompt_file: Path to the prompt file used + output_file: Path to output file (default: jersey_detection_results.jsonl) + """ + # Calculate summary statistics + total_images = len(results) + images_with_errors = sum(1 for r in results if r.get('error')) + images_with_jerseys = sum(1 for r in results if not r.get('error') and len(r.get('jerseys', [])) > 0) + images_without_jerseys = sum(1 for r in results if not r.get('error') and len(r.get('jerseys', [])) == 0) + total_jerseys = sum(len(r.get('jerseys', [])) for r in results if not r.get('error')) + total_hallucinated = sum(r.get('hallucinated_count', 0) for r in results if not r.get('error')) + total_raw_detections = total_jerseys + total_hallucinated + total_processing_time = sum(r.get('processing_time', 0) for r in results) + avg_processing_time = total_processing_time / total_images if total_images > 0 else 0 + + # Collect confidence statistics if available + confidences = [ + jersey.get('confidence') + for r in results if not r.get('error') + for jersey in r.get('jerseys', []) + if 'confidence' in jersey and jersey.get('confidence') is not None + ] + + confidence_stats = None + if confidences: + buckets = { + '90-100': sum(1 for c in confidences if 90 <= c <= 100), + '70-89': sum(1 for c in confidences if 70 <= c <= 89), + '50-69': sum(1 for c in confidences if 50 <= c <= 69), + '30-49': sum(1 for c in confidences if 30 <= c <= 49), + '0-29': sum(1 for c in confidences if 0 <= c <= 29) + } + confidence_stats = { + 'avg': sum(confidences) / len(confidences), + 'min': min(confidences), + 'max': max(confidences), + 'count': len(confidences), + 'distribution': buckets + } + + # Calculate resize statistics + images_resized = sum(1 for r in results if r.get('resized', False)) + + # Calculate ground truth statistics + results_without_errors = [r for r in results if not r.get('error')] + total_expected_jerseys = sum(len(r.get('expected_jerseys', [])) for r in results_without_errors) + total_true_positives = sum(len(r.get('true_positives', [])) for r in results_without_errors) + total_false_positives = sum(len(r.get('false_positives', [])) for r in results_without_errors) + total_false_negatives = sum(len(r.get('false_negatives', [])) for r in results_without_errors) + + # Calculate overall precision, recall, F1 + overall_precision = total_true_positives / (total_true_positives + total_false_positives) if (total_true_positives + total_false_positives) > 0 else 0.0 + overall_recall = total_true_positives / (total_true_positives + total_false_negatives) if (total_true_positives + total_false_negatives) > 0 else 0.0 + overall_f1 = 2 * (overall_precision * overall_recall) / (overall_precision + overall_recall) if (overall_precision + overall_recall) > 0 else 0.0 + + # Average per-image metrics + avg_precision = sum(r.get('precision', 0.0) for r in results_without_errors) / len(results_without_errors) if results_without_errors else 0.0 + avg_recall = sum(r.get('recall', 0.0) for r in results_without_errors) / len(results_without_errors) if results_without_errors else 0.0 + avg_f1 = sum(r.get('f1_score', 0.0) for r in results_without_errors) / len(results_without_errors) if results_without_errors else 0.0 + + # Calculate 
confidence calibration metrics (correct vs incorrect detections) + all_confidence_correct = [] + all_confidence_incorrect = [] + for r in results_without_errors: + if r.get('avg_confidence_correct') is not None: + # Weight by the count of correct detections in this image + count = r.get('confidence_correct_count', 0) + avg_conf = r.get('avg_confidence_correct') + all_confidence_correct.extend([avg_conf] * count) + if r.get('avg_confidence_incorrect') is not None: + # Weight by the count of incorrect detections in this image + count = r.get('confidence_incorrect_count', 0) + avg_conf = r.get('avg_confidence_incorrect') + all_confidence_incorrect.extend([avg_conf] * count) + + overall_avg_confidence_correct = sum(all_confidence_correct) / len(all_confidence_correct) if all_confidence_correct else None + overall_avg_confidence_incorrect = sum(all_confidence_incorrect) / len(all_confidence_incorrect) if all_confidence_incorrect else None + + # Create summary record + summary_record = { + 'timestamp': datetime.now().isoformat(), + 'model_name': self.model_name, + 'model_tag': self.model_tag, + 'prompt_file': prompt_file, + 'prompt_length': len(self.prompt), + 'total_images': total_images, + 'images_with_jerseys': images_with_jerseys, + 'images_without_jerseys': images_without_jerseys, + 'images_with_errors': images_with_errors, + 'total_raw_detections': total_raw_detections, + 'total_valid_jerseys': total_jerseys, + 'total_hallucinated': total_hallucinated, + 'avg_processing_time': avg_processing_time, + 'total_processing_time': total_processing_time, + 'confidence_stats': confidence_stats, + 'empty_response_capable': images_without_jerseys > 0, + 'resize_enabled': self.resize_max is not None, + 'resize_max': self.resize_max, + 'images_resized': images_resized, + # Ground truth statistics + 'ground_truth': { + 'total_expected': total_expected_jerseys, + 'total_true_positives': total_true_positives, + 'total_false_positives': total_false_positives, + 'total_false_negatives': total_false_negatives, + 'overall_precision': overall_precision, + 'overall_recall': overall_recall, + 'overall_f1': overall_f1, + 'avg_precision': avg_precision, + 'avg_recall': avg_recall, + 'avg_f1': avg_f1, + # Confidence calibration + 'avg_confidence_correct': overall_avg_confidence_correct, + 'avg_confidence_incorrect': overall_avg_confidence_incorrect, + 'confidence_correct_count': len(all_confidence_correct), + 'confidence_incorrect_count': len(all_confidence_incorrect) + } + } + + # Append to file + try: + with open(output_file, 'a') as f: + f.write(json.dumps(summary_record) + '\n') + print(f"\n✓ Results saved to {output_file}") + except Exception as e: + print(f"\n❌ Failed to save results: {e}") + + def print_summary(self, results: List[Dict[str, Any]]): + """ + Print summary statistics for all results. 
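+
+        Covers detection counts, hallucination filtering, empty-response
+        capability, ground-truth precision/recall/F1, confidence
+        calibration, and processing-time statistics.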
+ + Args: + results: List of all test results + """ + print("=" * 70) + print("SUMMARY") + print("=" * 70) + print(f"\nModel: {self.model_name}") + if self.model_tag: + print(f"Model tag: {self.model_tag}") + + # Display resize info + if self.resize_max: + images_resized = sum(1 for r in results if r.get('resized', False)) + print(f"Resize: Enabled (max: {self.resize_max}px, {images_resized} images resized)") + else: + print(f"Resize: Disabled") + + total_images = len(results) + images_with_errors = sum(1 for r in results if r.get('error')) + images_with_jerseys = sum(1 for r in results if not r.get('error') and len(r.get('jerseys', [])) > 0) + images_without_jerseys = sum(1 for r in results if not r.get('error') and len(r.get('jerseys', [])) == 0) + total_jerseys = sum(len(r.get('jerseys', [])) for r in results if not r.get('error')) + total_hallucinated = sum(r.get('hallucinated_count', 0) for r in results if not r.get('error')) + total_raw_detections = total_jerseys + total_hallucinated + total_processing_time = sum(r.get('processing_time', 0) for r in results) + avg_processing_time = total_processing_time / total_images if total_images > 0 else 0 + + print(f"\nTotal images processed: {total_images}") + print(f" - Images with jerseys: {images_with_jerseys} ({images_with_jerseys/total_images*100:.1f}%)") + print(f" - Images without jerseys: {images_without_jerseys} ({images_without_jerseys/total_images*100:.1f}%)") + print(f" - Images with errors: {images_with_errors} ({images_with_errors/total_images*100:.1f}%)") + print(f"\nJersey detections:") + print(f" - Total raw detections: {total_raw_detections}") + print(f" - Valid jerseys (after filtering): {total_jerseys}") + print(f" - Hallucinations filtered out: {total_hallucinated}") + if images_with_jerseys > 0: + print(f" - Average valid jerseys per image (when detected): {total_jerseys/images_with_jerseys:.2f}") + + # Empty response capability (important for evaluating model's ability to return empty results) + print(f"\nEmpty response capability:") + print(f" - Empty responses returned: {images_without_jerseys}") + print(f" - Percentage of images: {images_without_jerseys/total_images*100:.1f}%") + print(f" - Model can return empty results: {'✓ Yes' if images_without_jerseys > 0 else '✗ No (potential issue)'}") + + if total_hallucinated > 0: + print(f"\nHallucination detection:") + print(f" - Total hallucinated detections filtered: {total_hallucinated}") + images_with_hallucinations = sum(1 for r in results if not r.get('error') and r.get('hallucinated_count', 0) > 0) + print(f" - Images with hallucinations: {images_with_hallucinations} ({images_with_hallucinations/total_images*100:.1f}%)") + + # Ground truth statistics + results_without_errors = [r for r in results if not r.get('error')] + total_expected_jerseys = sum(len(r.get('expected_jerseys', [])) for r in results_without_errors) + + if total_expected_jerseys > 0: + total_true_positives = sum(len(r.get('true_positives', [])) for r in results_without_errors) + total_false_positives = sum(len(r.get('false_positives', [])) for r in results_without_errors) + total_false_negatives = sum(len(r.get('false_negatives', [])) for r in results_without_errors) + + # Calculate overall metrics + overall_precision = total_true_positives / (total_true_positives + total_false_positives) if (total_true_positives + total_false_positives) > 0 else 0.0 + overall_recall = total_true_positives / (total_true_positives + total_false_negatives) if (total_true_positives + total_false_negatives) > 0 else 0.0 
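+            # F1 below is the harmonic mean of precision and recall
+            # (it is 0 whenever either component is 0)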
+ overall_f1 = 2 * (overall_precision * overall_recall) / (overall_precision + overall_recall) if (overall_precision + overall_recall) > 0 else 0.0 + + # Calculate average per-image metrics + avg_precision = sum(r.get('precision', 0.0) for r in results_without_errors) / len(results_without_errors) if results_without_errors else 0.0 + avg_recall = sum(r.get('recall', 0.0) for r in results_without_errors) / len(results_without_errors) if results_without_errors else 0.0 + avg_f1 = sum(r.get('f1_score', 0.0) for r in results_without_errors) / len(results_without_errors) if results_without_errors else 0.0 + + print(f"\nGround truth performance:") + print(f" - Total expected jerseys: {total_expected_jerseys}") + print(f" - True positives: {total_true_positives}") + print(f" - False positives: {total_false_positives}") + print(f" - False negatives: {total_false_negatives}") + print(f"\n Overall metrics (across all jerseys):") + print(f" - Precision: {overall_precision:.2%}") + print(f" - Recall: {overall_recall:.2%}") + print(f" - F1 Score: {overall_f1:.2%}") + print(f"\n Average per-image metrics:") + print(f" - Avg Precision: {avg_precision:.2%}") + print(f" - Avg Recall: {avg_recall:.2%}") + print(f" - Avg F1 Score: {avg_f1:.2%}") + + # Confidence calibration metrics + all_confidence_correct = [] + all_confidence_incorrect = [] + for r in results_without_errors: + if r.get('avg_confidence_correct') is not None: + count = r.get('confidence_correct_count', 0) + avg_conf = r.get('avg_confidence_correct') + all_confidence_correct.extend([avg_conf] * count) + if r.get('avg_confidence_incorrect') is not None: + count = r.get('confidence_incorrect_count', 0) + avg_conf = r.get('avg_confidence_incorrect') + all_confidence_incorrect.extend([avg_conf] * count) + + if all_confidence_correct or all_confidence_incorrect: + print(f"\n Confidence calibration:") + if all_confidence_correct: + avg_conf_correct = sum(all_confidence_correct) / len(all_confidence_correct) + print(f" - Avg confidence (correct detections): {avg_conf_correct:.2f} ({len(all_confidence_correct)} detections)") + else: + print(f" - Avg confidence (correct detections): N/A (no correct detections with confidence)") + + if all_confidence_incorrect: + avg_conf_incorrect = sum(all_confidence_incorrect) / len(all_confidence_incorrect) + print(f" - Avg confidence (incorrect detections): {avg_conf_incorrect:.2f} ({len(all_confidence_incorrect)} detections)") + + # Show confidence difference + if all_confidence_correct: + avg_conf_correct = sum(all_confidence_correct) / len(all_confidence_correct) + diff = avg_conf_correct - avg_conf_incorrect + if diff > 0: + print(f" - Confidence difference: +{diff:.2f} (correct > incorrect, good calibration)") + else: + print(f" - Confidence difference: {diff:.2f} (⚠ incorrect ≥ correct, poor calibration)") + else: + print(f" - Avg confidence (incorrect detections): N/A (no incorrect detections with confidence)") + + print(f"\nProcessing time:") + print(f" - Total: {total_processing_time:.2f}s") + print(f" - Average per image: {avg_processing_time:.2f}s") + + # Check for confidence values + has_confidence = any( + any('confidence' in jersey for jersey in r.get('jerseys', [])) + for r in results if not r.get('error') + ) + + if has_confidence: + print(f"\nConfidence statistics:") + confidences = [ + jersey.get('confidence') + for r in results if not r.get('error') + for jersey in r.get('jerseys', []) + if 'confidence' in jersey and jersey.get('confidence') is not None + ] + if confidences: + avg_confidence = 
sum(confidences) / len(confidences) + min_confidence = min(confidences) + max_confidence = max(confidences) + print(f" - Total detections with confidence: {len(confidences)}") + print(f" - Average confidence: {avg_confidence:.2f}") + print(f" - Min confidence: {min_confidence:.2f}") + print(f" - Max confidence: {max_confidence:.2f}") + + # Confidence distribution by bucket + print(f"\n Confidence distribution:") + buckets = { + '90-100 (Extremely clear)': (90, 100), + '70-89 (Clear, minor issues)': (70, 89), + '50-69 (Partially visible)': (50, 69), + '30-49 (Difficult to read)': (30, 49), + '0-29 (Very uncertain)': (0, 29) + } + + for bucket_name, (min_val, max_val) in buckets.items(): + count = sum(1 for c in confidences if min_val <= c <= max_val) + percentage = (count / len(confidences) * 100) if len(confidences) > 0 else 0 + bar_length = int(percentage / 2) # Scale to max 50 chars + bar = '█' * bar_length + print(f" {bucket_name}: {count:3d} ({percentage:5.1f}%) {bar}") + + # List errors if any + if images_with_errors > 0: + print(f"\nErrors encountered:") + for r in results: + if r.get('error'): + print(f" - {Path(r['image_path']).name}: {r['error']}") + + print() + + +def main(): + """Main entry point for the test script.""" + # Get default server URL from config + default_server_url = get_llama_server_url_from_config() or 'http://192.168.1.34:8080' + + parser = argparse.ArgumentParser( + description='Test jersey detection with different models and prompts', + formatter_class=argparse.RawDescriptionHelpFormatter + ) + parser.add_argument('image_directory', help='Path to directory containing test images') + parser.add_argument('prompt_file', help='Path to text file containing the prompt') + parser.add_argument('--model-name', default=None, + help='Name of the model being tested (auto-detected from server if not provided)') + parser.add_argument('--server-url', default=default_server_url, + help=f'llama.cpp server URL (default: {default_server_url})') + parser.add_argument('--output-file', default='jersey_detection_results.jsonl', + help='Output file for results (default: jersey_detection_results.jsonl)') + parser.add_argument('--resize', type=int, default=None, metavar='MAX_SIZE', + help='Resize images to maximum dimension (e.g., 1024) before processing') + parser.add_argument('--model-tag', default=None, + help='Model tag for llama-swap (e.g., "qwen2.5-vl-7b"). 
If not specified, uses whatever model is loaded.') + + args = parser.parse_args() + + # Validate inputs + if not os.path.isdir(args.image_directory): + print(f"Error: Directory not found: {args.image_directory}") + sys.exit(1) + + if not os.path.isfile(args.prompt_file): + print(f"Error: Prompt file not found: {args.prompt_file}") + sys.exit(1) + + # Load prompt + try: + with open(args.prompt_file, 'r') as f: + prompt = f.read() + except Exception as e: + print(f"Error reading prompt file: {e}") + sys.exit(1) + + # Print test configuration + print("=" * 70) + print("JERSEY DETECTION TEST") + print("=" * 70) + print(f"Model name: {args.model_name if args.model_name else '(auto-detect)'}") + print(f"Model tag: {args.model_tag if args.model_tag else 'None (use loaded model)'}") + print(f"Server URL: {args.server_url}") + print(f"Image directory: {args.image_directory}") + print(f"Prompt file: {args.prompt_file}") + print(f"Prompt length: {len(prompt)} characters") + print(f"Output file: {args.output_file}") + print(f"Resize images: {f'Yes (max: {args.resize}px)' if args.resize else 'No'}") + print("=" * 70) + print() + + # Check server health + print("Checking server health...") + try: + client = LlamaCppClient(base_url=args.server_url) + + # Try health check (handle both JSON and plain text responses) + try: + health = client.health_check() + print(f"✓ Server is healthy: {health}") + except json.JSONDecodeError: + # llama-swap returns plain text "OK" instead of JSON + response = requests.get(f"{args.server_url}/health") + response.raise_for_status() + print(f"✓ Server is healthy: {response.text}") + + # Determine model name to use + model_name = args.model_name + + # If model_tag is provided, use it as the model name (unless user explicitly provided a model_name) + if args.model_tag and not args.model_name: + model_name = args.model_tag + print(f"✓ Using model tag as model name: {model_name}") + elif not model_name: + # Only auto-detect if neither model_tag nor model_name was provided + detected_model_name = None + try: + models = client.get_models() + if 'data' in models and len(models['data']) > 0: + model_id = models['data'][0].get('id', 'unknown') + print(f"✓ Active model: {model_id}") + + # Extract just the model filename (without path) + if model_id and model_id != 'unknown': + # Remove path and get base filename + model_filename = os.path.basename(model_id) + # Remove common extensions (.gguf, .bin, etc.) 
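+                    # e.g. a hypothetical "model-name-F16.gguf" becomes "model-name-F16"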
+ model_name_no_ext = os.path.splitext(model_filename)[0] + detected_model_name = model_name_no_ext + except: + pass + + if detected_model_name: + model_name = detected_model_name + print(f"✓ Using auto-detected model name: {model_name}") + else: + model_name = "unknown" + print(f"⚠ Could not detect model name, using 'unknown'") + else: + # User explicitly provided model_name + print(f"✓ Using provided model name: {model_name}") + + except Exception as e: + print(f"❌ Failed to connect to server: {e}") + print(f"Make sure llama.cpp server is running at {args.server_url}") + sys.exit(1) + + print() + + # Show model tag info if using llama-swap + if args.model_tag: + print(f"Requesting model from llama-swap: {args.model_tag}") + + # Check currently running models on llama-swap + try: + running_response = requests.get(f"{args.server_url}/running") + if running_response.status_code == 200: + try: + running_models = running_response.json() + if running_models: + print(f"Currently running models: {running_models}") + except: + pass + except: + pass + + print() + + # Run tests + tester = JerseyDetectionTester(args.server_url, prompt, model_name, args.resize, args.model_tag) + results = tester.test_directory(args.image_directory) + + # Print summary + if results: + tester.print_summary(results) + + # Save results to file + tester.save_results_to_file(results, args.prompt_file, args.output_file) + + +if __name__ == '__main__': + main()