Initial commit: Jersey detection test suite
Test scripts and utilities for evaluating vision-language models on jersey number detection using llama.cpp server.
README.md (new file, 93 lines)
@@ -0,0 +1,93 @@
# Jersey Detection Testing

This project contains test scripts, results, and utilities for evaluating vision-language models on jersey number detection tasks using llama.cpp.

## Directory Structure

```
jersey_test/
├── scan_utils/
│   ├── jersey_detection.py                 # Core detection class using VLM
│   └── llama_cpp_client.py                 # Client for llama.cpp server
├── docs/
│   ├── JERSEY_DETECTION_MODEL_ANALYSIS.md  # Model comparison results
│   └── LLAMA_SWAP_SETUP.md                 # Server setup instructions
├── test_images/                            # Place test images here
├── test_images_output/                     # Output directory for annotated images
├── test_jersey_detection.py                # Main test runner
├── analyze_jersey_results.py               # Results analysis script
├── test_all_models.sh                      # Batch testing shell script
├── jersey_prompt.txt                       # Basic detection prompt
├── jersey_prompt_with_confidence.txt       # Prompt with confidence scoring
└── jersey_detection_results.jsonl          # Historical test results
```

## Prerequisites

- Python 3.10+
- llama.cpp server running with a vision-language model
- Test images with ground truth encoded in filenames

## Test Image Naming Convention

Test images should follow this naming pattern to encode ground truth:

```
prefix-number1-number2-number3.jpg
```

Example: `game1-23-45-7.jpg` contains jerseys with numbers 23, 45, and 7.
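
As a minimal illustration of how the test runner can recover ground truth from such a name (the `parse_ground_truth` helper below is a hypothetical sketch, not part of the suite):

```python
from pathlib import Path

def parse_ground_truth(image_path: str) -> list[int]:
    """Extract expected jersey numbers from a name like game1-23-45-7.jpg."""
    stem = Path(image_path).stem        # "game1-23-45-7"
    parts = stem.split("-")[1:]         # drop the prefix, keep the numbers
    return [int(p) for p in parts if p.isdigit()]

print(parse_ground_truth("game1-23-45-7.jpg"))  # [23, 45, 7]
```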

## Running Tests

### Single Model Test

```bash
python test_jersey_detection.py \
  --images-dir ./test_images \
  --prompt-file jersey_prompt_with_confidence.txt \
  --server-url http://localhost:8080 \
  --resize 1024 \
  --output jersey_detection_results.jsonl
```

### Batch Testing All Models

```bash
./test_all_models.sh
```

Edit the variables at the top of the script to configure:
- `IMAGES_DIR` - test images directory
- `PROMPT_FILE` - prompt file to use
- `SERVER_URL` - llama.cpp/llama-swap server URL
- `LLAMA_SWAP_CONFIG` - path to the llama-swap config for the model list

### Analyzing Results

```bash
python analyze_jersey_results.py jersey_detection_results.jsonl
```

Options:
- `--csv output.csv` - Export results to CSV in addition to the analysis display
- `--csv-only output.csv` - Export to CSV only, skipping the analysis display

## Historical Results

The `jersey_detection_results.jsonl` file contains results from 6 test runs:

| Model | F1 Score | Avg Time/Image | Avg Confidence |
|-------|----------|----------------|----------------|
| qwen2.5-vl-7b | 72.9% | - | - |
| gemma-3-27b | 72.1% | 18.1s | 87.1 |
| Mistral-Small-3.2-24B (Q4) | - | 14.2s | 92.1 |
| Kimi-VL-A3B-Thinking | - | 29.1s | 88.9 |

See `docs/JERSEY_DETECTION_MODEL_ANALYSIS.md` for detailed analysis.

## Key Findings

1. **Top Recommendation**: qwen2.5-vl-7b (72.9% F1 score)
2. **Best Confidence Calibration**: gemma-3-27b
3. **Speed Champion**: gemma-3-4b (7.9s/img, 63.8% F1)
4. A confidence threshold of 85+ is recommended for filtering uncertain detections
analyze_jersey_results.py (new executable file, 663 lines)
@@ -0,0 +1,663 @@
#!/usr/bin/env python3
"""
Analyze jersey detection test results and compare model performance.

Usage:
    python analyze_jersey_results.py [results_file]
    python analyze_jersey_results.py [results_file] --csv output.csv
    python analyze_jersey_results.py [results_file] --csv-only output.csv

Arguments:
    results_file: Path to the results file (default: jersey_detection_results.jsonl)
    --csv: Also export results to CSV file
    --csv-only: Export to CSV only, skip analysis display
"""

import argparse
import csv
import json
import math
import sys
from pathlib import Path
from typing import List, Dict, Any
from datetime import datetime
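

# Each line of the results file is one test-run summary. Judging from the
# sample records in jersey_detection_results.jsonl, a record looks like:
#   {"timestamp": "...", "model_name": "...", "prompt_file": "...",
#    "prompt_length": 2134, "total_images": 198, "images_with_jerseys": 88,
#    "images_without_jerseys": 110, "images_with_errors": 0,
#    "total_raw_detections": 470, "total_valid_jerseys": 235,
#    "total_hallucinated": 235, "avg_processing_time": 4.6,
#    "confidence_stats": {"avg": 84.1, "min": 0, "max": 100, "count": 235,
#                         "distribution": {"90-100": 138, "70-89": 70, ...}},
#    "empty_response_capable": true}
# plus an optional "ground_truth" object with precision/recall/F1 fields.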
def load_results(results_file: str) -> List[Dict[str, Any]]:
    """Load test results from a JSON Lines file."""
    results = []
    try:
        with open(results_file, 'r') as f:
            for line in f:
                line = line.strip()
                if line:
                    results.append(json.loads(line))
        return results
    except FileNotFoundError:
        print(f"Error: Results file not found: {results_file}")
        sys.exit(1)
    except json.JSONDecodeError as e:
        print(f"Error: Invalid JSON in results file: {e}")
        sys.exit(1)


def calculate_confidence_stdev(conf_stats: Dict[str, Any]) -> tuple:
    """
    Calculate standard deviation of confidence scores from distribution.

    Returns:
        Tuple of (stdev, quality_rating)
        quality_rating: "Excellent", "Good", "Fair", "Poor", or "N/A"
    """
    if not conf_stats or 'distribution' not in conf_stats:
        return None, "N/A"

    dist = conf_stats['distribution']

    # Reconstruct approximate confidence values from buckets,
    # using the midpoint of each bucket.
    values = []
    bucket_midpoints = {
        '90-100': 95,
        '70-89': 79.5,
        '50-69': 59.5,
        '30-49': 39.5,
        '0-29': 14.5
    }
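    # For example, {'90-100': 2, '70-89': 1} expands to values [95, 95, 79.5].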

    for bucket, count in dist.items():
        midpoint = bucket_midpoints.get(bucket, 50)
        values.extend([midpoint] * count)

    if len(values) < 2:
        return None, "N/A"

    # Calculate standard deviation
    mean = sum(values) / len(values)
    variance = sum((x - mean) ** 2 for x in values) / len(values)
    stdev = math.sqrt(variance)

    # Assign quality rating based on stdev
    if stdev < 5:
        quality = "Poor"
    elif stdev < 10:
        quality = "Fair"
    elif stdev < 15:
        quality = "Good"
    else:
        quality = "Excel"  # Shortened for table

    return stdev, quality


def print_ascii_comparison_table(results: List[Dict[str, Any]]):
    """Print a detailed ASCII comparison table of all test runs."""
    if not results:
        print("No results to display.")
        return

    print("=" * 280)
    print("DETAILED MODEL COMPARISON TABLE")
    print("=" * 280)
    print()
    print("Confidence Quality: Excellent (>15), Good (10-15), Fair (5-10), Poor (<5)")
    print("Confidence Calibration: Conf✓ = avg confidence on correct detections, Conf✗ = avg confidence on incorrect detections")
    print()

    # Table headers with ground truth and confidence calibration columns
    print("┌" + "─" * 22 + "┬" + "─" * 10 + "┬" + "─" * 8 + "┬" + "─" * 8 + "┬" + "─" * 8 + "┬" + "─" * 10 + "┬" + "─" * 10 + "┬" + "─" * 10 + "┬" + "─" * 8 + "┬" + "─" * 8 + "┬" + "─" * 12 + "┬" + "─" * 10 + "┬" + "─" * 10 + "┬" + "─" * 10 + "┬" + "─" * 10 + "┬" + "─" * 10 + "┬" + "─" * 21 + "┐")
    print("│ {:<20} │ {:^8} │ {:^6} │ {:^6} │ {:^6} │ {:^8} │ {:^8} │ {:^8} │ {:^6} │ {:^6} │ {:^10} │ {:^8} │ {:^8} │ {:^8} │ {:^8} │ {:^8} │ {:^19} │".format(
        "Model", "Prompt", "Images", "Valid", "Hallu", "Empty%", "Hallu%", "AvgTime", "Resize", "Conf?", "Conf Qual", "Prec%", "Recall%", "F1%", "Conf✓", "Conf✗", "Date"
    ))
    print("├" + "─" * 22 + "┼" + "─" * 10 + "┼" + "─" * 8 + "┼" + "─" * 8 + "┼" + "─" * 8 + "┼" + "─" * 10 + "┼" + "─" * 10 + "┼" + "─" * 10 + "┼" + "─" * 8 + "┼" + "─" * 8 + "┼" + "─" * 12 + "┼" + "─" * 10 + "┼" + "─" * 10 + "┼" + "─" * 10 + "┼" + "─" * 10 + "┼" + "─" * 10 + "┼" + "─" * 21 + "┤")

    # Data rows
    for result in results:
        model = result.get('model_name', 'unknown')[:20]
        prompt = Path(result.get('prompt_file', 'unknown')).stem[:8]
        total_images = result.get('total_images', 0)
        valid_jerseys = result.get('total_valid_jerseys', 0)
        hallucinated = result.get('total_hallucinated', 0)
        total_detections = valid_jerseys + hallucinated
        empty_pct = (result.get('images_without_jerseys', 0) / total_images * 100) if total_images > 0 else 0
        hallu_pct = (hallucinated / total_detections * 100) if total_detections > 0 else 0
        avg_time = result.get('avg_processing_time', 0)

        # Calculate confidence quality
        conf_stats = result.get('confidence_stats')
        has_conf = 'Yes' if conf_stats else 'No'
        stdev, quality = calculate_confidence_stdev(conf_stats)

        # Format confidence quality display
        if stdev is not None:
            conf_qual_str = f"{quality} ({stdev:.1f})"
        else:
            conf_qual_str = "N/A"

        # Ground truth metrics
        gt = result.get('ground_truth', {})
        precision = gt.get('overall_precision', 0) * 100
        recall = gt.get('overall_recall', 0) * 100
        f1 = gt.get('overall_f1', 0) * 100

        # Confidence calibration
        conf_correct = gt.get('avg_confidence_correct')
        conf_incorrect = gt.get('avg_confidence_incorrect')
        conf_correct_str = f"{conf_correct:.1f}" if conf_correct is not None else "N/A"
        conf_incorrect_str = f"{conf_incorrect:.1f}" if conf_incorrect is not None else "N/A"

        resize_max = result.get('resize_max')
        resize_str = f"{resize_max}px" if resize_max else "No"
        timestamp = datetime.fromisoformat(result.get('timestamp', '')).strftime('%Y-%m-%d %H:%M')

        print("│ {:<20} │ {:>8} │ {:>6} │ {:>6} │ {:>6} │ {:>7.1f}% │ {:>7.1f}% │ {:>7.2f}s │ {:>6} │ {:>6} │ {:>10} │ {:>7.1f}% │ {:>7.1f}% │ {:>7.1f}% │ {:>8} │ {:>8} │ {:>19} │".format(
            model, prompt, total_images, valid_jerseys, hallucinated, empty_pct, hallu_pct, avg_time, resize_str, has_conf, conf_qual_str, precision, recall, f1, conf_correct_str, conf_incorrect_str, timestamp
        ))

    # Bottom border
    print("└" + "─" * 22 + "┴" + "─" * 10 + "┴" + "─" * 8 + "┴" + "─" * 8 + "┴" + "─" * 8 + "┴" + "─" * 10 + "┴" + "─" * 10 + "┴" + "─" * 10 + "┴" + "─" * 8 + "┴" + "─" * 8 + "┴" + "─" * 12 + "┴" + "─" * 10 + "┴" + "─" * 10 + "┴" + "─" * 10 + "┴" + "─" * 10 + "┴" + "─" * 10 + "┴" + "─" * 21 + "┘")

    print()


def print_comparison_table(results: List[Dict[str, Any]]):
    """Print a simple comparison table of all test runs."""
    if not results:
        print("No results to display.")
        return

    print("=" * 140)
    print("MODEL COMPARISON TABLE")
    print("=" * 140)
    print()

    # Header
    header = f"{'Model':<25} {'Prompt':<30} {'Images':<8} {'Valid':<8} {'Hallu':<8} {'Empty%':<9} {'AvgTime':<9} {'Resize':<8} {'Conf?':<7} {'Date':<20}"
    print(header)
    print("-" * 150)

    # Data rows
    for result in results:
        model = result.get('model_name', 'unknown')[:24]
        prompt = Path(result.get('prompt_file', 'unknown')).stem[:29]
        total_images = result.get('total_images', 0)
        valid_jerseys = result.get('total_valid_jerseys', 0)
        hallucinated = result.get('total_hallucinated', 0)
        empty_pct = (result.get('images_without_jerseys', 0) / total_images * 100) if total_images > 0 else 0
        avg_time = result.get('avg_processing_time', 0)
        has_conf = 'Yes' if result.get('confidence_stats') else 'No'
        resize_max = result.get('resize_max')
        resize_str = f"{resize_max}px" if resize_max else "No"
        timestamp = datetime.fromisoformat(result.get('timestamp', '')).strftime('%Y-%m-%d %H:%M:%S')

        row = f"{model:<25} {prompt:<30} {total_images:<8} {valid_jerseys:<8} {hallucinated:<8} {empty_pct:<8.1f}% {avg_time:<8.2f}s {resize_str:<8} {has_conf:<7} {timestamp:<20}"
        print(row)

    print()


def print_model_performance_chart(results: List[Dict[str, Any]]):
    """Print a performance chart showing key metrics for each model."""
    if not results:
        return

    print("=" * 140)
    print("MODEL PERFORMANCE CHART")
    print("=" * 140)
    print()

    # Group results by model
    models = {}
    for result in results:
        model_name = result.get('model_name', 'unknown')
        if model_name not in models:
            models[model_name] = []
        models[model_name].append(result)

    # Calculate aggregate statistics for each model
    for model_name, model_results in models.items():
        print(f"\n{model_name}")
        print("-" * 100)

        total_runs = len(model_results)
        total_images = sum(r.get('total_images', 0) for r in model_results)
        total_valid = sum(r.get('total_valid_jerseys', 0) for r in model_results)
        total_hallu = sum(r.get('total_hallucinated', 0) for r in model_results)
        avg_empty_pct = sum((r.get('images_without_jerseys', 0) / r.get('total_images', 1) * 100) for r in model_results) / total_runs if total_runs > 0 else 0
        avg_time = sum(r.get('avg_processing_time', 0) for r in model_results) / total_runs if total_runs > 0 else 0

        # Check if any runs have confidence stats
        has_confidence = any(r.get('confidence_stats') for r in model_results)

        # Check resize status
        resize_enabled = any(r.get('resize_enabled', False) for r in model_results)
        resize_max_values = [r.get('resize_max') for r in model_results if r.get('resize_max')]
        resize_info = f"{resize_max_values[0]}px" if resize_max_values else "Disabled"

        print(f"  Total test runs: {total_runs}")
        print(f"  Total images processed: {total_images}")
        print(f"  Total valid detections: {total_valid}")
        print(f"  Total hallucinations: {total_hallu}")
        print(f"  Average empty response rate: {avg_empty_pct:.1f}%")
        print(f"  Average processing time: {avg_time:.2f}s/image")
        print(f"  Resize: {resize_info}")
        print(f"  Confidence support: {'Yes' if has_confidence else 'No'}")

        # Show hallucination rate
        if total_valid + total_hallu > 0:
            hallu_rate = (total_hallu / (total_valid + total_hallu) * 100)
            print(f"  Hallucination rate: {hallu_rate:.1f}%")

            # Visual bar
            bar_length = int(hallu_rate / 2)  # Scale to max 50 chars
            bar = '█' * bar_length
            print(f"  Hallucination chart: {bar} ({hallu_rate:.1f}%)")

        # Ground truth performance
        gt_runs = [r for r in model_results if r.get('ground_truth')]
        if gt_runs:
            avg_precision = sum(r['ground_truth'].get('overall_precision', 0) for r in gt_runs) / len(gt_runs)
            avg_recall = sum(r['ground_truth'].get('overall_recall', 0) for r in gt_runs) / len(gt_runs)
            avg_f1 = sum(r['ground_truth'].get('overall_f1', 0) for r in gt_runs) / len(gt_runs)
            total_expected = sum(r['ground_truth'].get('total_expected', 0) for r in gt_runs)
            total_tp = sum(r['ground_truth'].get('total_true_positives', 0) for r in gt_runs)
            total_fp = sum(r['ground_truth'].get('total_false_positives', 0) for r in gt_runs)
            total_fn = sum(r['ground_truth'].get('total_false_negatives', 0) for r in gt_runs)

            print(f"\n  Ground truth performance:")
            print(f"    Total expected jerseys: {total_expected}")
            print(f"    True positives: {total_tp}")
            print(f"    False positives: {total_fp}")
            print(f"    False negatives: {total_fn}")
            print(f"    Average Precision: {avg_precision:.1%}")
            print(f"    Average Recall: {avg_recall:.1%}")
            print(f"    Average F1 Score: {avg_f1:.1%}")

            # Visual F1 bar
            bar_length = int(avg_f1 * 50)  # Scale to max 50 chars
            bar = '█' * bar_length
            print(f"    F1 Score chart: {bar} ({avg_f1:.1%})")

            # Confidence calibration
            conf_correct_vals = [r['ground_truth'].get('avg_confidence_correct') for r in gt_runs if r['ground_truth'].get('avg_confidence_correct') is not None]
            conf_incorrect_vals = [r['ground_truth'].get('avg_confidence_incorrect') for r in gt_runs if r['ground_truth'].get('avg_confidence_incorrect') is not None]

            if conf_correct_vals or conf_incorrect_vals:
                print(f"\n  Confidence calibration:")
                if conf_correct_vals:
                    avg_conf_correct = sum(conf_correct_vals) / len(conf_correct_vals)
                    print(f"    Avg confidence (correct detections): {avg_conf_correct:.2f}")
                if conf_incorrect_vals:
                    avg_conf_incorrect = sum(conf_incorrect_vals) / len(conf_incorrect_vals)
                    print(f"    Avg confidence (incorrect detections): {avg_conf_incorrect:.2f}")
                if conf_correct_vals and conf_incorrect_vals:
                    diff = sum(conf_correct_vals) / len(conf_correct_vals) - sum(conf_incorrect_vals) / len(conf_incorrect_vals)
                    if diff > 0:
                        print(f"    Confidence difference: +{diff:.2f} (good calibration)")
                    else:
                        print(f"    Confidence difference: {diff:.2f} (⚠ poor calibration)")

        # Confidence distribution if available
        if has_confidence:
            print(f"\n  Confidence distribution (across all runs):")
            all_dist = {'90-100': 0, '70-89': 0, '50-69': 0, '30-49': 0, '0-29': 0}
            total_conf_count = 0

            for result in model_results:
                conf_stats = result.get('confidence_stats')
                if conf_stats and 'distribution' in conf_stats:
                    for bucket, count in conf_stats['distribution'].items():
                        all_dist[bucket] += count
                        total_conf_count += count

            if total_conf_count > 0:
                for bucket, count in all_dist.items():
                    pct = (count / total_conf_count * 100) if total_conf_count > 0 else 0
                    bar_length = int(pct / 2)
                    bar = '█' * bar_length
                    print(f"    {bucket}: {count:4d} ({pct:5.1f}%) {bar}")

    print()


def print_best_performers(results: List[Dict[str, Any]]):
    """Print summary of best performing models."""
    if not results:
        return

    print("=" * 140)
    print("BEST PERFORMERS")
    print("=" * 140)
    print()

    # Group by model and calculate averages
    models = {}
    for result in results:
        model_name = result.get('model_name', 'unknown')
        if model_name not in models:
            models[model_name] = {
                'runs': 0,
                'total_hallu': 0,
                'total_detections': 0,
                'avg_time': [],
                'empty_capable': []
            }

        models[model_name]['runs'] += 1
        models[model_name]['total_hallu'] += result.get('total_hallucinated', 0)
        models[model_name]['total_detections'] += result.get('total_valid_jerseys', 0) + result.get('total_hallucinated', 0)
        models[model_name]['avg_time'].append(result.get('avg_processing_time', 0))
        models[model_name]['empty_capable'].append(result.get('empty_response_capable', False))

    # Calculate scores
    model_scores = []
    for model_name, stats in models.items():
        hallu_rate = (stats['total_hallu'] / stats['total_detections'] * 100) if stats['total_detections'] > 0 else 0
        avg_time = sum(stats['avg_time']) / len(stats['avg_time']) if stats['avg_time'] else 0
        empty_capable = any(stats['empty_capable'])

        model_scores.append({
            'model': model_name,
            'hallu_rate': hallu_rate,
            'avg_time': avg_time,
            'empty_capable': empty_capable,
            'runs': stats['runs']
        })

    # Sort by hallucination rate (lower is better)
    model_scores.sort(key=lambda x: x['hallu_rate'])

    print("Lowest hallucination rate:")
    for i, score in enumerate(model_scores[:3], 1):
        capable = "✓" if score['empty_capable'] else "✗"
        print(f"  {i}. {score['model']}: {score['hallu_rate']:.1f}% (empty capable: {capable}, avg time: {score['avg_time']:.2f}s)")

    print()

    # Sort by speed (lower is better)
    model_scores.sort(key=lambda x: x['avg_time'])

    print("Fastest processing:")
    for i, score in enumerate(model_scores[:3], 1):
        capable = "✓" if score['empty_capable'] else "✗"
        print(f"  {i}. {score['model']}: {score['avg_time']:.2f}s/image (hallu rate: {score['hallu_rate']:.1f}%, empty capable: {capable})")

    print()

    # Models with empty response capability
    empty_models = [s for s in model_scores if s['empty_capable']]
    print(f"Models with empty response capability: {len(empty_models)}/{len(model_scores)}")
    for score in empty_models:
        print(f"  - {score['model']}")

    print()

    # Best F1 scores (ground truth accuracy)
    models_with_gt = {}
    for result in results:
        if result.get('ground_truth'):
            model_name = result.get('model_name', 'unknown')
            if model_name not in models_with_gt:
                models_with_gt[model_name] = {
                    'f1_scores': [],
                    'precision_scores': [],
                    'recall_scores': []
                }
            gt = result['ground_truth']
            models_with_gt[model_name]['f1_scores'].append(gt.get('overall_f1', 0))
            models_with_gt[model_name]['precision_scores'].append(gt.get('overall_precision', 0))
            models_with_gt[model_name]['recall_scores'].append(gt.get('overall_recall', 0))

    if models_with_gt:
        gt_scores = []
        for model_name, stats in models_with_gt.items():
            avg_f1 = sum(stats['f1_scores']) / len(stats['f1_scores']) if stats['f1_scores'] else 0
            avg_precision = sum(stats['precision_scores']) / len(stats['precision_scores']) if stats['precision_scores'] else 0
            avg_recall = sum(stats['recall_scores']) / len(stats['recall_scores']) if stats['recall_scores'] else 0
            gt_scores.append({
                'model': model_name,
                'avg_f1': avg_f1,
                'avg_precision': avg_precision,
                'avg_recall': avg_recall
            })

        # Sort by F1 score (higher is better)
        gt_scores.sort(key=lambda x: x['avg_f1'], reverse=True)

        print("Highest ground truth F1 scores:")
        for i, score in enumerate(gt_scores[:3], 1):
            print(f"  {i}. {score['model']}: F1={score['avg_f1']:.1%} (Precision={score['avg_precision']:.1%}, Recall={score['avg_recall']:.1%})")

        print()


def export_to_csv(results: List[Dict[str, Any]], csv_file: str):
    """Export results to CSV file for spreadsheet import."""
    if not results:
        print("No results to export.")
        return

    try:
        with open(csv_file, 'w', newline='') as f:
            # Define CSV columns
            fieldnames = [
                'timestamp',
                'model_name',
                'model_tag',
                'prompt_file',
                'prompt_length',
                'total_images',
                'images_with_jerseys',
                'images_without_jerseys',
                'images_with_errors',
                'total_raw_detections',
                'total_valid_jerseys',
                'total_hallucinated',
                'hallucination_rate_pct',
                'empty_response_rate_pct',
                'avg_processing_time',
                'total_processing_time',
                'resize_enabled',
                'resize_max',
                'images_resized',
                'has_confidence',
                'confidence_avg',
                'confidence_min',
                'confidence_max',
                'confidence_count',
                'confidence_stdev',
                'confidence_quality',
                'conf_90_100',
                'conf_70_89',
                'conf_50_69',
                'conf_30_49',
                'conf_0_29',
                # Ground truth columns
                'gt_total_expected',
                'gt_total_true_positives',
                'gt_total_false_positives',
                'gt_total_false_negatives',
                'gt_overall_precision',
                'gt_overall_recall',
                'gt_overall_f1',
                'gt_avg_precision',
                'gt_avg_recall',
                'gt_avg_f1',
                # Confidence calibration
                'gt_avg_confidence_correct',
                'gt_avg_confidence_incorrect',
                'gt_confidence_correct_count',
                'gt_confidence_incorrect_count'
            ]

            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()

            # Write data rows
            for result in results:
                # Calculate derived values
                total_images = result.get('total_images', 0)
                valid_jerseys = result.get('total_valid_jerseys', 0)
                hallucinated = result.get('total_hallucinated', 0)
                total_detections = valid_jerseys + hallucinated
                hallu_rate = (hallucinated / total_detections * 100) if total_detections > 0 else 0
                empty_rate = (result.get('images_without_jerseys', 0) / total_images * 100) if total_images > 0 else 0

                # Extract confidence stats
                conf_stats = result.get('confidence_stats')
                has_confidence = conf_stats is not None
                conf_avg = conf_stats.get('avg', '') if conf_stats else ''
                conf_min = conf_stats.get('min', '') if conf_stats else ''
                conf_max = conf_stats.get('max', '') if conf_stats else ''
                conf_count = conf_stats.get('count', '') if conf_stats else ''

                # Calculate confidence standard deviation and quality
                conf_stdev, conf_quality = calculate_confidence_stdev(conf_stats)

                # Extract confidence distribution
                conf_dist = conf_stats.get('distribution', {}) if conf_stats else {}
                conf_90_100 = conf_dist.get('90-100', '')
                conf_70_89 = conf_dist.get('70-89', '')
                conf_50_69 = conf_dist.get('50-69', '')
                conf_30_49 = conf_dist.get('30-49', '')
                conf_0_29 = conf_dist.get('0-29', '')

                # Extract ground truth stats
                gt = result.get('ground_truth', {})
                gt_total_expected = gt.get('total_expected', '')
                gt_total_tp = gt.get('total_true_positives', '')
                gt_total_fp = gt.get('total_false_positives', '')
                gt_total_fn = gt.get('total_false_negatives', '')
                gt_overall_precision = gt.get('overall_precision', '')
                gt_overall_recall = gt.get('overall_recall', '')
                gt_overall_f1 = gt.get('overall_f1', '')
                gt_avg_precision = gt.get('avg_precision', '')
                gt_avg_recall = gt.get('avg_recall', '')
                gt_avg_f1 = gt.get('avg_f1', '')
                gt_avg_conf_correct = gt.get('avg_confidence_correct', '')
                gt_avg_conf_incorrect = gt.get('avg_confidence_incorrect', '')
                gt_conf_correct_count = gt.get('confidence_correct_count', '')
                gt_conf_incorrect_count = gt.get('confidence_incorrect_count', '')

                row = {
                    'timestamp': result.get('timestamp', ''),
                    'model_name': result.get('model_name', ''),
                    'model_tag': result.get('model_tag', ''),
                    'prompt_file': result.get('prompt_file', ''),
                    'prompt_length': result.get('prompt_length', ''),
                    'total_images': total_images,
                    'images_with_jerseys': result.get('images_with_jerseys', ''),
                    'images_without_jerseys': result.get('images_without_jerseys', ''),
                    'images_with_errors': result.get('images_with_errors', ''),
                    'total_raw_detections': result.get('total_raw_detections', ''),
                    'total_valid_jerseys': valid_jerseys,
                    'total_hallucinated': hallucinated,
                    'hallucination_rate_pct': f"{hallu_rate:.2f}",
                    'empty_response_rate_pct': f"{empty_rate:.2f}",
                    'avg_processing_time': f"{result.get('avg_processing_time', 0):.4f}",
                    'total_processing_time': f"{result.get('total_processing_time', 0):.2f}",
                    'resize_enabled': result.get('resize_enabled', False),
                    'resize_max': result.get('resize_max', ''),
                    'images_resized': result.get('images_resized', ''),
                    'has_confidence': has_confidence,
                    'confidence_avg': f"{conf_avg:.2f}" if conf_avg != '' else '',
                    'confidence_min': conf_min,
                    'confidence_max': conf_max,
                    'confidence_count': conf_count,
                    'confidence_stdev': f"{conf_stdev:.2f}" if conf_stdev is not None else '',
                    'confidence_quality': conf_quality if conf_quality != 'N/A' else '',
                    'conf_90_100': conf_90_100,
                    'conf_70_89': conf_70_89,
                    'conf_50_69': conf_50_69,
                    'conf_30_49': conf_30_49,
                    'conf_0_29': conf_0_29,
                    # Ground truth data
                    'gt_total_expected': gt_total_expected,
                    'gt_total_true_positives': gt_total_tp,
                    'gt_total_false_positives': gt_total_fp,
                    'gt_total_false_negatives': gt_total_fn,
                    'gt_overall_precision': f"{gt_overall_precision:.4f}" if gt_overall_precision != '' else '',
                    'gt_overall_recall': f"{gt_overall_recall:.4f}" if gt_overall_recall != '' else '',
                    'gt_overall_f1': f"{gt_overall_f1:.4f}" if gt_overall_f1 != '' else '',
                    'gt_avg_precision': f"{gt_avg_precision:.4f}" if gt_avg_precision != '' else '',
                    'gt_avg_recall': f"{gt_avg_recall:.4f}" if gt_avg_recall != '' else '',
                    'gt_avg_f1': f"{gt_avg_f1:.4f}" if gt_avg_f1 != '' else '',
                    'gt_avg_confidence_correct': f"{gt_avg_conf_correct:.2f}" if gt_avg_conf_correct != '' else '',
                    'gt_avg_confidence_incorrect': f"{gt_avg_conf_incorrect:.2f}" if gt_avg_conf_incorrect != '' else '',
                    'gt_confidence_correct_count': gt_conf_correct_count,
                    'gt_confidence_incorrect_count': gt_conf_incorrect_count
                }

                writer.writerow(row)

        print(f"✓ Results exported to CSV: {csv_file}")
        print(f"  Rows: {len(results)}")
        print(f"  Columns: {len(fieldnames)}")

    except Exception as e:
        print(f"❌ Failed to export to CSV: {e}")
        sys.exit(1)


def main():
    """Main entry point for the analysis script."""
    parser = argparse.ArgumentParser(
        description='Analyze jersey detection test results',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Show analysis
  python analyze_jersey_results.py

  # Show analysis and export to CSV
  python analyze_jersey_results.py --csv results.csv

  # Export to CSV only (no analysis display)
  python analyze_jersey_results.py --csv-only results.csv

  # Analyze a custom results file
  python analyze_jersey_results.py custom_results.jsonl --csv custom.csv
"""
    )
    parser.add_argument('results_file', nargs='?', default='jersey_detection_results.jsonl',
                        help='Path to results file (default: jersey_detection_results.jsonl)')
    parser.add_argument('--csv', metavar='FILE', dest='csv_file',
                        help='Export results to CSV file (in addition to showing analysis)')
    parser.add_argument('--csv-only', metavar='FILE', dest='csv_only',
                        help='Export to CSV file only, skip analysis display')

    args = parser.parse_args()

    # Check if file exists
    if not Path(args.results_file).exists():
        print(f"Error: Results file not found: {args.results_file}")
        print("Run some tests first with test_jersey_detection.py to generate results.")
        sys.exit(1)

    # Load results
    results = load_results(args.results_file)

    if not results:
        print(f"No results found in {args.results_file}")
        sys.exit(0)

    print(f"Loaded {len(results)} test run(s) from {args.results_file}\n")

    # Handle CSV-only mode
    if args.csv_only:
        export_to_csv(results, args.csv_only)
        return

    # Print analyses (unless CSV-only mode)
    print_ascii_comparison_table(results)
    print_model_performance_chart(results)
    print_best_performers(results)

    # Export to CSV if requested
    if args.csv_file:
        print()
        export_to_csv(results, args.csv_file)


if __name__ == '__main__':
    main()
docs/JERSEY_DETECTION_MODEL_ANALYSIS.md (new file, 296 lines)
@@ -0,0 +1,296 @@
# Jersey Detection Model Analysis Report

**Date:** October 22, 2025
**Models Tested:** 8 vision-language models
**Test Images:** 194 images with known jersey numbers
**Purpose:** Determine the best model for automated jersey number detection in sports photography

---

## Executive Summary

After comprehensive testing of 8 different AI models on 194 sports images with known jersey numbers, we recommend **qwen2.5-vl-7b** as the best overall model for jersey detection, with **gemma-3-27b** as a close second choice depending on specific needs.

### Key Findings:

1. **Best Overall Performance**: qwen2.5-vl-7b achieves the highest accuracy (72.9% F1 score)
2. **Confidence Scores Are Useful**: 7 out of 8 models show reliable confidence calibration, meaning higher confidence scores correlate with correct detections
3. **Speed vs Accuracy Trade-off**: The most accurate models take 13-21 seconds per image; faster models sacrifice significant accuracy

---

## Model Performance Comparison

### Top 3 Recommended Models

| Rank | Model | Accuracy (F1) | Speed | Correct Detections | False Alarms | Confidence Reliability |
|------|-------|---------------|-------|--------------------|--------------|------------------------|
| 🥇 1 | qwen2.5-vl-7b | 72.9% | 13.4s | 328 / 436 (75%) | 136 | Good |
| 🥈 2 | gemma-3-27b | 72.1% | 20.9s | 343 / 462 (74%) | 147 | Very Good (+6.0) |
| 🥉 3 | gemma-3-12b | 69.8% | 18.9s | 322 / 462 (70%) | 139 | Good (+3.1) |

### Complete Results Table

| Model | Accuracy (F1 Score) | Correct Detections | False Alarms | Missed Jerseys | Speed (sec/image) | Confidence Calibration |
|-------|---------------------|--------------------|--------------|----------------|-------------------|------------------------|
| **qwen2.5-vl-7b** | **72.9%** ⭐ | 328 / 436 | 136 | 108 | 13.4 | +0.5 (Good) |
| **gemma-3-27b** | **72.1%** | 343 / 462 | 147 | 119 | 20.9 | +6.0 (Very Good) |
| **gemma-3-12b** | 69.8% | 322 / 462 | 139 | 140 | 18.9 | +3.1 (Good) |
| mistral-small-24b-q4 | 67.6% | 328 / 462 | 180 | 134 | 15.1 | +2.4 (Good) |
| mistral-small-24b-q8 | 67.2% | 330 / 462 | 190 | 132 | 22.6 | +3.1 (Good) |
| gemma-3-4b | 63.8% | 277 / 462 | 130 | 185 | 7.9 ⚡ | +6.2 (Very Good) |
| lfm2-vl-1.6b | 50.5% | 171 / 448 | 58 | 277 | 4.6 ⚡⚡ | +11.9 (Excellent) |
| kimi-vl-3b | 2.0% ❌ | 5 / 416 | 67 | 411 | 40.0 🐌 | -1.3 (Poor) |

---

## Understanding the Metrics

### What the Numbers Mean:

- **Accuracy (F1 Score)**: Overall effectiveness balancing correct detections and false alarms
  - 70%+ = Excellent for production use
  - 60-70% = Good for assisted workflows
  - Below 60% = Not recommended

- **Correct Detections**: Out of all jerseys that should have been found, how many were actually detected
  - Example: "328 / 436" means the model found 328 jerseys out of 436 that were actually in the images

- **False Alarms**: Jersey numbers detected that weren't actually in the image
  - Lower is better - these are incorrect detections
  - Can be filtered using confidence scores

- **Missed Jerseys**: Jersey numbers that were in the image but not detected
  - Lower is better - these are missed opportunities

- **Speed**: Average seconds to process one image
  - ⚡⚡ = Very fast (< 8s)
  - ⚡ = Fast (8-15s)
  - Standard = 15-25s
  - 🐌 = Slow (> 30s)

- **Confidence Calibration**: The difference between average confidence on correct vs incorrect detections
  - Positive number (e.g., +6.0) = Good calibration - correct detections have higher confidence
  - Negative number = Poor calibration - the confidence scores can't be trusted
  - Higher positive values = Better for filtering with confidence thresholds
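
As a worked check of these definitions, qwen2.5-vl-7b's raw counts from the table above reproduce its reported scores:

```python
tp, fp, fn = 328, 136, 108  # qwen2.5-vl-7b: correct, false alarms, missed

precision = tp / (tp + fp)  # 328 / 464 ≈ 0.707
recall = tp / (tp + fn)     # 328 / 436 ≈ 0.752
f1 = 2 * precision * recall / (precision + recall)

print(f"{precision:.1%} {recall:.1%} {f1:.1%}")  # 70.7% 75.2% 72.9%
```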

---

## Detailed Analysis

### 1. Best Model: qwen2.5-vl-7b

**Why It's the Best:**
- ✅ Highest overall accuracy (72.9%)
- ✅ Best recall - finds 75% of all jerseys
- ✅ Reasonable speed (13.4 seconds per image)
- ✅ Very low hallucination rate (only 1%)
- ⚠️ Confidence scores show little variation, limiting threshold filtering

**Strengths:**
- Finds the most jerseys (highest recall at 75.2%)
- Rarely makes up fake jersey numbers (hallucination rate: 1%)
- Almost always returns results (empty response rate: 2.6%)

**Weaknesses:**
- Generates 136 false positives (30% of detections are incorrect)
- Confidence calibration is minimal (+0.5), making threshold filtering less effective
- All confidence scores are 90-95, showing limited variation

**Best For:**
- Applications where finding all jerseys is critical
- Batch processing where moderate false positives are acceptable
- When combined with manual review of results

### 2. Runner-Up: gemma-3-27b

**Why It's Excellent:**
- ✅ Nearly identical accuracy to the winner (72.1% vs 72.9%)
- ✅ Finds the most total jerseys (343 correct detections)
- ✅ Excellent confidence calibration (+6.0 difference)
- ✅ No hallucinations
- ⚠️ Slower processing (20.9s per image)

**Strengths:**
- Best for confidence-based filtering (6-point difference between correct/incorrect)
- Highest absolute number of correct detections (343)
- More varied confidence scores (54% in 90-100 range, 42% in 70-89 range)

**Weaknesses:**
- 56% slower than qwen2.5-vl-7b
- Similar false positive rate

**Best For:**
- Applications requiring confidence-based filtering
- When processing time is not critical
- Maximizing total correct detections

### 3. Alternative: gemma-3-4b (Speed Champion)

**Why Consider It:**
- ⚡ Fast processing (7.9 seconds per image)
- ✅ Very good confidence calibration (+6.2)
- ✅ Zero hallucinations
- ⚠️ Lower accuracy (63.8%)

**Trade-offs:**
- 41% faster than qwen2.5-vl-7b
- But 12% lower accuracy
- Misses 40% of jerseys (185 false negatives)

**Best For:**
- Real-time or high-volume processing
- Applications where speed is more important than completeness
- Initial rough filtering before manual review

---

## Should You Use Confidence Scores for Filtering?

### Answer: **YES** - Confidence scores are useful for most models

### Evidence from Testing:

**7 out of 8 models show good confidence calibration:**

| Model | Avg Confidence (Correct) | Avg Confidence (Incorrect) | Difference | Reliability |
|-------|--------------------------|----------------------------|------------|-------------|
| lfm2-vl-1.6b | 91.8 | 80.0 | **+11.9** | ⭐⭐⭐ Excellent |
| gemma-3-4b | 85.2 | 79.0 | **+6.2** | ⭐⭐ Very Good |
| gemma-3-27b | 88.2 | 82.2 | **+6.0** | ⭐⭐ Very Good |
| gemma-3-12b | 91.8 | 88.7 | **+3.1** | ⭐ Good |
| mistral-small-24b-q8 | 92.3 | 89.1 | **+3.1** | ⭐ Good |
| mistral-small-24b-q4 | 93.0 | 90.7 | **+2.4** | ⭐ Good |
| qwen2.5-vl-7b | 94.6 | 94.1 | +0.5 | Limited utility |
| kimi-vl-3b | 88.4 | 89.7 | **-1.3** | ❌ Not reliable |

### What This Means:

**For most models**, setting a confidence threshold can significantly reduce false positives:
- A threshold of 85 on gemma-3-27b would keep most correct detections (88.2 avg) while filtering many incorrect ones (82.2 avg)
- A threshold of 85 on gemma-3-4b would be even more effective

**Exception: qwen2.5-vl-7b** has a minimal difference (94.6 vs 94.1), making threshold filtering less useful despite it being the most accurate model.

### Recommended Filtering Strategy:

1. **Use gemma-3-27b with a confidence threshold of 85+** for the best balance of accuracy and filtering (see the sketch after this list)
2. **Use gemma-3-4b with a confidence threshold of 85+** for faster processing with good filtering
3. **Use qwen2.5-vl-7b without filtering** when you need maximum recall and will manually review results
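
To make strategy 1 concrete, here is a minimal sketch of threshold filtering; the per-detection dicts with `number` and `confidence` keys are an assumed shape, not taken from the test scripts:

```python
CONFIDENCE_THRESHOLD = 85  # recommended cutoff for gemma-3-27b and gemma-3-4b

def filter_detections(detections: list[dict], threshold: int = CONFIDENCE_THRESHOLD) -> list[dict]:
    """Keep only detections whose confidence meets the threshold."""
    return [d for d in detections if d.get("confidence", 0) >= threshold]

detections = [
    {"number": 23, "confidence": 92},  # kept
    {"number": 45, "confidence": 78},  # dropped: below threshold, more likely incorrect
]
print(filter_detections(detections))  # [{'number': 23, 'confidence': 92}]
```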

---

## Model-Specific Recommendations

### For Different Use Cases:

#### 🎯 **Highest Accuracy Required**
- **Model:** qwen2.5-vl-7b
- **Expected Results:** Finds 75% of jerseys, 30% false positive rate
- **Processing:** 13.4 seconds per image
- **Setup:** Use raw results, manually review all detections

#### 🎯 **Best Balance of Speed and Accuracy**
- **Model:** gemma-3-12b
- **Expected Results:** Finds 70% of jerseys, reasonable false positive rate
- **Processing:** 18.9 seconds per image
- **Setup:** Apply a confidence threshold of 90+ to reduce false positives

#### 🎯 **Maximum Quality with Confidence Filtering**
- **Model:** gemma-3-27b
- **Expected Results:** Finds 74% of jerseys, filters false positives effectively
- **Processing:** 20.9 seconds per image
- **Setup:** Apply a confidence threshold of 85+ to reduce false positives by ~50%

#### ⚡ **Speed is Critical**
- **Model:** gemma-3-4b
- **Expected Results:** Finds 60% of jerseys quickly
- **Processing:** 7.9 seconds per image
- **Setup:** Apply a confidence threshold of 85+ for quality filtering

#### ❌ **Do Not Use**
- **kimi-vl-3b**: Only 2% accuracy, extremely slow, poor confidence calibration

---

## Implementation Recommendations

### 1. Production Deployment Strategy

**Recommended:** Two-tier approach (a sketch follows this list)
- **Tier 1 (Automatic):** gemma-3-27b with confidence threshold 85+
  - Automatically tag high-confidence detections
  - Expected: ~200 correct detections per 194 images with minimal false positives

- **Tier 2 (Review Queue):** qwen2.5-vl-7b on remaining images
  - Human review of all detections below the confidence threshold
  - Catches jerseys missed by Tier 1
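
As a sketch of how this two-tier flow could be wired together, the `detect` callable below stands in for a call into the test harness (for example via llama-swap's `--model-tag` routing) and is hypothetical:

```python
from typing import Callable, Dict, List, Tuple

Detections = List[Dict]  # e.g. [{"number": 23, "confidence": 92}, ...]

def two_tier_detect(
    image_path: str,
    detect: Callable[[str, str], Detections],  # (image_path, model_tag) -> detections
    threshold: int = 85,
) -> Tuple[Detections, bool]:
    """Return (detections, needs_review) for a single image."""
    # Tier 1: gemma-3-27b, keep only high-confidence detections for auto-tagging
    tier1 = [d for d in detect(image_path, "gemma-3-27b") if d["confidence"] >= threshold]
    if tier1:
        return tier1, False  # auto-tag; no human review needed
    # Tier 2: qwen2.5-vl-7b for maximum recall; flag the result for human review
    return detect(image_path, "qwen2.5-vl-7b"), True
```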

### 2. Confidence Threshold Guidelines

Based on testing data:

| Model | Recommended Threshold | Expected Precision | Expected Recall |
|-------|-----------------------|--------------------|-----------------|
| gemma-3-27b | 85+ | ~85-90% | ~60-65% |
| gemma-3-4b | 85+ | ~80-85% | ~50-55% |
| gemma-3-12b | 90+ | ~80-85% | ~60-65% |
| qwen2.5-vl-7b | Don't filter | 70.7% | 75.2% |

### 3. Performance Optimization

**Processing 1000 images:**
- qwen2.5-vl-7b: ~3.7 hours
- gemma-3-27b: ~5.8 hours
- gemma-3-4b: ~2.2 hours

**Recommendation:** Use gemma-3-4b for an initial pass, then qwen2.5-vl-7b for a second pass on low-confidence results.

---

## Conclusions

### Main Findings:

1. **qwen2.5-vl-7b is the most accurate model** but has limited confidence score utility
2. **gemma-3-27b offers the best combination** of accuracy and confidence-based filtering
3. **Confidence scores are highly valuable** for reducing false positives in most models
4. **Speed vs accuracy trade-offs are significant** - the recommended fast model (gemma-3-4b) scores about 9 F1 points below the best
5. **One model (kimi-vl-3b) is completely unsuitable** for this task

### Strategic Recommendations:

**For most users:** Deploy gemma-3-27b with a confidence threshold of 85+
- Balances accuracy, speed, and filtering capability
- Reduces the manual review burden significantly
- Good confidence calibration enables automated decision-making

**For maximum accuracy:** Deploy qwen2.5-vl-7b without filtering
- Best for finding all possible jerseys
- Requires manual review of results
- Accepts a higher false positive rate

**For high-volume processing:** Deploy gemma-3-4b with a confidence threshold of 85+
- Fast enough for real-time applications
- Good accuracy for the speed
- Effective filtering capability

### Final Verdict:

**Winner: qwen2.5-vl-7b** for pure accuracy
**Best Overall: gemma-3-27b** for practical deployment with confidence filtering
**Best Value: gemma-3-4b** for speed-sensitive applications

---

## Technical Notes

- **Test Dataset:** 194 images with ground truth jersey numbers encoded in filenames
- **Total Expected Jerseys:** 416-462, depending on which images each model processed successfully
- **Evaluation Metrics:** Precision, Recall, F1 Score, Confidence Calibration
- **Hardware:** Testing performed on comparable hardware configurations
- **Prompt:** All models used an identical jersey detection prompt with confidence scores

---

*Report generated from comprehensive testing of 8 vision-language models for jersey number detection in sports photography.*
docs/LLAMA_SWAP_SETUP.md (new file, 237 lines)
@@ -0,0 +1,237 @@
# llama-swap Setup Guide for Jersey Detection Testing

This guide explains how to use [llama-swap](https://github.com/mostlygeek/llama-swap) to automatically switch between different vision-language models when testing jersey detection.

## What is llama-swap?

llama-swap is a model-swapping proxy that sits between your application and llama.cpp servers. It automatically loads and unloads models based on the `model` parameter in API requests, allowing you to test multiple models without manually restarting servers.
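
For example, two back-to-back requests that differ only in the `model` field will trigger a swap. Below is a minimal Python sketch using only the standard library; the endpoint path follows llama.cpp's OpenAI-compatible API, and the prompt and base URL are placeholders:

```python
import json
import urllib.request

def chat(model_tag: str, prompt: str, base_url: str = "http://localhost:8080") -> str:
    """Send an OpenAI-compatible chat request; llama-swap routes on 'model'."""
    payload = json.dumps({
        "model": model_tag,  # llama-swap loads/unloads based on this field
        "messages": [{"role": "user", "content": prompt}],
    }).encode()
    req = urllib.request.Request(
        f"{base_url}/v1/chat/completions",
        data=payload,
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req) as resp:
        return json.load(resp)["choices"][0]["message"]["content"]

# The first call loads qwen2.5-vl-7b; the second swaps in gemma-3-4b automatically.
print(chat("qwen2.5-vl-7b", "Hello"))
print(chat("gemma-3-4b", "Hello"))
```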
## Installation
|
||||||
|
|
||||||
|
### Docker (Recommended)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Pull the CUDA image (or cpu, vulkan, intel depending on your hardware)
|
||||||
|
docker pull ghcr.io/mostlygeek/llama-swap:cuda
|
||||||
|
```
|
||||||
|
|
||||||
|
### Homebrew (macOS/Linux)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
brew tap mostlygeek/llama-swap
|
||||||
|
brew install llama-swap
|
||||||
|
```
|
||||||
|
|
||||||
|
### Pre-built Binaries
|
||||||
|
|
||||||
|
Download from the [releases page](https://github.com/mostlygeek/llama-swap/releases).
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
A configuration file `llama-swap-config.yaml` is provided with 8 pre-configured vision models:
|
||||||
|
|
||||||
|
### Small Models (1-4B parameters)
|
||||||
|
- `lfm2-vl-1.6b` - LiquidAI LFM2-VL 1.6B (F16)
|
||||||
|
- `gemma-3-4b` - Gemma 3 4B Instruct (F16)
|
||||||
|
- `kimi-vl-3b` - Kimi VL A3B Thinking (F16)
|
||||||
|
|
||||||
|
### Medium Models (7-12B parameters)
|
||||||
|
- `qwen2.5-vl-7b` - Qwen2.5-VL 7B Instruct (F16)
|
||||||
|
- `gemma-3-12b` - Gemma 3 12B Instruct (F16)
|
||||||
|
|
||||||
|
### Large Models (24-27B parameters)
|
||||||
|
- `mistral-small-24b-q8` - Mistral Small 3.2 24B (Q8_K_XL)
|
||||||
|
- `mistral-small-24b-q4` - Mistral Small 3.2 24B (Q4_K_XL)
|
||||||
|
- `gemma-3-27b` - Gemma 3 27B Instruct (Q8_0)
|
||||||
|
|
||||||
|
## Starting llama-swap
|
||||||
|
|
||||||
|
### Using Docker
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker run -it --rm --runtime nvidia -p 8080:8080 \
|
||||||
|
-v $(pwd)/llama-swap-config.yaml:/app/config.yaml \
|
||||||
|
-v /path/to/hf/cache:/root/.cache/huggingface \
|
||||||
|
ghcr.io/mostlygeek/llama-swap:cuda
|
||||||
|
```
|
||||||
|
|
||||||
|
### Using Binary
|
||||||
|
|
||||||
|
```bash
|
||||||
|
llama-swap --config llama-swap-config.yaml --listen localhost:8080
|
||||||
|
```
|
||||||
|
|
||||||
|
## Testing with Jersey Detection Script
|
||||||
|
|
||||||
|
Once llama-swap is running, you can test different models by specifying the `--model-tag` parameter:
|
||||||
|
|
||||||
|
### Test a Single Model
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Test Qwen2.5-VL 7B with resizing
|
||||||
|
python test_jersey_detection.py ./images jersey_prompt.txt \
|
||||||
|
--model-tag "qwen2.5-vl-7b" \
|
||||||
|
--resize 1024
|
||||||
|
```
|
||||||
|
|
||||||
|
### Test Multiple Models Sequentially
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Test small models
|
||||||
|
python test_jersey_detection.py ./images jersey_prompt.txt --model-tag "lfm2-vl-1.6b" --resize 1024
|
||||||
|
python test_jersey_detection.py ./images jersey_prompt.txt --model-tag "gemma-3-4b" --resize 1024
|
||||||
|
python test_jersey_detection.py ./images jersey_prompt.txt --model-tag "kimi-vl-3b" --resize 1024
|
||||||
|
|
||||||
|
# Test medium models
|
||||||
|
python test_jersey_detection.py ./images jersey_prompt.txt --model-tag "qwen2.5-vl-7b" --resize 1024
|
||||||
|
python test_jersey_detection.py ./images jersey_prompt.txt --model-tag "gemma-3-12b" --resize 1024
|
||||||
|
|
||||||
|
# Test large models
|
||||||
|
python test_jersey_detection.py ./images jersey_prompt.txt --model-tag "mistral-small-24b-q4" --resize 1024
|
||||||
|
python test_jersey_detection.py ./images jersey_prompt.txt --model-tag "gemma-3-27b" --resize 1024
|
||||||
|
```
|
||||||
|
|
||||||
|
### Automated Testing Scripts
|
||||||
|
|
||||||
|
Two bash scripts are provided for automated testing:
|
||||||
|
|
||||||
|
#### 1. Full Test Suite (`test_all_models.sh`)
|
||||||
|
|
||||||
|
Tests **all models** defined in `llama-swap-config.yaml`:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Basic usage (uses defaults)
|
||||||
|
./test_all_models.sh ./test_images
|
||||||
|
|
||||||
|
# Customize configuration with environment variables
|
||||||
|
RESIZE=2048 ./test_all_models.sh ./test_images
|
||||||
|
OUTPUT_FILE=custom_results.jsonl ./test_all_models.sh ./test_images
|
||||||
|
PROMPT_FILE=custom_prompt.txt ./test_all_models.sh ./test_images
|
||||||
|
|
||||||
|
# Disable resize
|
||||||
|
RESIZE= ./test_all_models.sh ./test_images
|
||||||
|
```
|
||||||
|
|
||||||
|
**Features:**
|
||||||
|
- Automatically extracts all model tags from YAML config
|
||||||
|
- Color-coded output with progress tracking
|
||||||
|
- Confirms before starting tests
|
||||||
|
- Shows summary with success/failure counts
|
||||||
|
- Asks to continue if a model fails
|
||||||
|
|
||||||
|
**Default Configuration:**
|
||||||
|
- Images: `./test_images`
|
||||||
|
- Prompt: `jersey_prompt_with_confidence.txt`
|
||||||
|
- Resize: `1024px`
|
||||||
|
- Output: `jersey_detection_results.jsonl`
|
||||||
|
|
||||||
|
#### 2. Quick Test (`test_quick.sh`)
|
||||||
|
|
||||||
|
Tests a **small subset** of models for rapid iteration:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Test default selection (small, medium, large)
|
||||||
|
./test_quick.sh ./test_images
|
||||||
|
|
||||||
|
# Test custom models
|
||||||
|
MODELS="lfm2-vl-1.6b qwen2.5-vl-7b" ./test_quick.sh ./test_images
|
||||||
|
|
||||||
|
# Customize settings
|
||||||
|
RESIZE=512 MODELS="gemma-3-4b" ./test_quick.sh ./test_images
|
||||||
|
```
|
||||||
|
|
||||||
|
**Default Models:**
|
||||||
|
- `lfm2-vl-1.6b` (Small - 1.6B)
|
||||||
|
- `qwen2.5-vl-7b` (Medium - 7B)
|
||||||
|
- `mistral-small-24b-q4` (Large - 24B Q4)
|
||||||
|
|
||||||
|
**Use Cases:**
|
||||||
|
- Quick validation after prompt changes
|
||||||
|
- Testing configuration adjustments
|
||||||
|
- Rapid prototyping before full test run
|
||||||
|
|
||||||
|
## Analyzing Results
|
||||||
|
|
||||||
|
After testing multiple models, use the analysis script to compare performance:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python analyze_jersey_results.py
|
||||||
|
```
|
||||||
|
|
||||||
|
This will show:
|
||||||
|
- Comparison table of all models tested
|
||||||
|
- Performance charts with hallucination rates
|
||||||
|
- Best performers by speed and accuracy
|
||||||
|
- Confidence distribution (if applicable)
|
||||||
|
|
||||||
|
## Model Swapping Behavior

llama-swap will:
1. **Automatically load** the requested model when you specify `--model-tag`
2. **Automatically unload** the previous model (if it differs from the current request)
3. **Keep the model loaded** if you test the same model multiple times

You can **monitor** model loading/unloading in the web UI at `http://localhost:8080/ui`.

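Under the hood, the test script simply sets the `model` field of the OpenAI-compatible request to the model tag, and llama-swap swaps on that field. A hand-rolled equivalent might look like (payload trimmed to the relevant fields):

```bash
# Requesting a chat completion with a "model" field makes llama-swap
# load that model first, swapping out whatever is currently running.
curl http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "qwen2.5-vl-7b", "messages": [{"role": "user", "content": "ping"}]}'
```
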
## Optional: Model Auto-Unloading

To automatically unload models after 5 minutes of inactivity, uncomment this line in `llama-swap-config.yaml`:

```yaml
ttl: 300
```

## Optional: Preload Model on Startup

To preload a specific model when llama-swap starts, uncomment and modify this section:

```yaml
hooks:
  onStartup:
    - loadModel: qwen2.5-vl-7b
```

## Customizing Models

To add or modify models, edit `llama-swap-config.yaml`:

```yaml
models:
  my-custom-model:
    name: "My Custom Model Description"
    cmd: llama-server --no-mmap -ngl 999 -fa on --host 0.0.0.0 --port ${PORT} -hf user/model-name:quantization
```

Then test with:

```bash
python test_jersey_detection.py ./images jersey_prompt.txt --model-tag "my-custom-model"
```

## Troubleshooting

### Model not loading
- Check the llama-swap logs at `http://localhost:8080/log` or via `curl http://localhost:8080/log/stream`
- Verify that the model name in the config matches the `--model-tag` parameter
- Ensure there is sufficient GPU memory for the model

### Connection refused
- Verify llama-swap is running: `curl http://localhost:8080/health`
- Check that the server URL matches: the default is `http://192.168.1.126:8080` (read from scan.ini)

### Slow model switching
- The first load downloads the model from HuggingFace and can be slow
- Subsequent loads are faster (the model is cached locally)
- Use quantized models (Q4, Q8) for faster loading and lower memory usage

## Web UI

llama-swap includes a web interface for monitoring:
- **Dashboard**: `http://localhost:8080/ui` - View loaded models and logs
- **Activity**: See recent API requests
- **Logs**: Real-time log monitoring

## References

- [llama-swap GitHub](https://github.com/mostlygeek/llama-swap)
- [llama-swap Documentation](https://github.com/mostlygeek/llama-swap/tree/main/docs)
- [llama.cpp Documentation](https://github.com/ggerganov/llama.cpp)
6
jersey_detection_results.jsonl
Normal file
@ -0,0 +1,6 @@
{"timestamp": "2025-10-19T19:30:44.272849", "model_name": "LFM2-VL", "prompt_file": "jersey_prompt_with_confidence.txt", "prompt_length": 2134, "total_images": 198, "images_with_jerseys": 88, "images_without_jerseys": 110, "images_with_errors": 0, "total_raw_detections": 470, "total_valid_jerseys": 235, "total_hallucinated": 235, "avg_processing_time": 4.607636096501591, "total_processing_time": 912.3119471073151, "confidence_stats": {"avg": 84.14893617021276, "min": 0, "max": 100, "count": 235, "distribution": {"90-100": 138, "70-89": 70, "50-69": 8, "30-49": 8, "0-29": 11}}, "empty_response_capable": true}
{"timestamp": "2025-10-19T22:10:05.135029", "model_name": "ggml-org_Kimi-VL-A3B-Thinking-2506-GGUF_Kimi-VL-A3B-Thinking-2506-bf16", "prompt_file": "jersey_prompt_with_confidence.txt", "prompt_length": 2134, "total_images": 198, "images_with_jerseys": 28, "images_without_jerseys": 163, "images_with_errors": 7, "total_raw_detections": 49, "total_valid_jerseys": 49, "total_hallucinated": 0, "avg_processing_time": 29.11009831259949, "total_processing_time": 5763.799465894699, "confidence_stats": {"avg": 88.85714285714286, "min": 60, "max": 95, "count": 49, "distribution": {"90-100": 37, "70-89": 9, "50-69": 3, "30-49": 0, "0-29": 0}}, "empty_response_capable": true}
{"timestamp": "2025-10-20T01:20:31.076468", "model_name": "unsloth_Mistral-Small-3.2-24B-Instruct-2506-GGUF_Mistral-Small-3.2-24B-Instruct-2506-BF16", "prompt_file": "jersey_prompt_with_confidence.txt", "prompt_length": 2134, "total_images": 198, "images_with_jerseys": 197, "images_without_jerseys": 1, "images_with_errors": 0, "total_raw_detections": 494, "total_valid_jerseys": 494, "total_hallucinated": 0, "avg_processing_time": 37.221905313356956, "total_processing_time": 7369.937252044678, "confidence_stats": {"avg": 90.81983805668017, "min": 70, "max": 95, "count": 494, "distribution": {"90-100": 362, "70-89": 132, "50-69": 0, "30-49": 0, "0-29": 0}}, "empty_response_capable": true}
{"timestamp": "2025-10-20T12:04:37.833650", "model_name": "unsloth_Mistral-Small-3.2-24B-Instruct-2506-GGUF_Mistral-Small-3.2-24B-Instruct-2506-UD-Q8_K_XL", "prompt_file": "jersey_prompt_with_confidence.txt", "prompt_length": 2134, "total_images": 198, "images_with_jerseys": 197, "images_without_jerseys": 1, "images_with_errors": 0, "total_raw_detections": 496, "total_valid_jerseys": 496, "total_hallucinated": 0, "avg_processing_time": 20.684308366342023, "total_processing_time": 4095.493056535721, "confidence_stats": {"avg": 90.76612903225806, "min": 70, "max": 95, "count": 496, "distribution": {"90-100": 363, "70-89": 133, "50-69": 0, "30-49": 0, "0-29": 0}}, "empty_response_capable": true}
{"timestamp": "2025-10-20T13:01:42.747694", "model_name": "unsloth_Mistral-Small-3.2-24B-Instruct-2506-GGUF_Mistral-Small-3.2-24B-Instruct-2506-UD-Q4_K_XL", "prompt_file": "jersey_prompt_with_confidence.txt", "prompt_length": 2134, "total_images": 198, "images_with_jerseys": 197, "images_without_jerseys": 1, "images_with_errors": 0, "total_raw_detections": 494, "total_valid_jerseys": 494, "total_hallucinated": 0, "avg_processing_time": 14.196594772916852, "total_processing_time": 2810.9257650375366, "confidence_stats": {"avg": 92.09514170040485, "min": 80, "max": 95, "count": 494, "distribution": {"90-100": 415, "70-89": 79, "50-69": 0, "30-49": 0, "0-29": 0}}, "empty_response_capable": true}
{"timestamp": "2025-10-20T15:01:25.669340", "model_name": "unsloth_gemma-3-27b-it-GGUF_gemma-3-27b-it-Q8_0", "prompt_file": "jersey_prompt_with_confidence.txt", "prompt_length": 2134, "total_images": 198, "images_with_jerseys": 185, "images_without_jerseys": 13, "images_with_errors": 0, "total_raw_detections": 428, "total_valid_jerseys": 428, "total_hallucinated": 0, "avg_processing_time": 18.127051142731098, "total_processing_time": 3589.1561262607574, "confidence_stats": {"avg": 87.14953271028037, "min": 55, "max": 100, "count": 428, "distribution": {"90-100": 250, "70-89": 166, "50-69": 12, "30-49": 0, "0-29": 0}}, "empty_response_capable": true}
43
jersey_prompt.txt
Normal file
@ -0,0 +1,43 @@
You are an expert at detecting sports jerseys in images. Carefully examine the provided image and identify all visible sports jerseys.

CRITICAL INSTRUCTIONS:
1. ONLY detect jerseys that are CLEARLY VISIBLE in the image
2. ONLY include jersey numbers that you can ACTUALLY READ in the image
3. If you CANNOT see any jerseys, you MUST return {"jerseys": []}
4. DO NOT make up, imagine, or guess jersey numbers that aren't visible
5. DO NOT include jerseys if you cannot clearly see the number

RESPONSE FORMAT:
Respond ONLY with a valid JSON object. No explanations, no markdown, no extra text.

Use DOUBLE QUOTES (") for all JSON keys and string values.

The JSON must have a single key "jerseys" with an array of dictionaries.

Each dictionary must have exactly these three keys:
- "jersey_number": The number on the jersey (as a string, only if clearly visible)
- "jersey_color": The primary color of the jersey
- "number_color": The color of the number on the jersey

Example response for an image WITH visible jerseys:
{
  "jerseys": [
    {
      "jersey_number": "101",
      "jersey_color": "red",
      "number_color": "white"
    },
    {
      "jersey_number": "142",
      "jersey_color": "blue",
      "number_color": "yellow"
    }
  ]
}

Example response for an image WITHOUT jerseys or with unclear numbers:
{"jerseys": []}

REMEMBER: Only include jerseys with numbers you can ACTUALLY SEE in the image. When in doubt, return empty array.

Now analyze the image and return the JSON object.
53
jersey_prompt_with_confidence.txt
Normal file
@ -0,0 +1,53 @@
You are an expert at detecting sports jerseys in images. Carefully examine the provided image and identify all visible sports jerseys.

CRITICAL INSTRUCTIONS:
1. ONLY detect jerseys that are CLEARLY VISIBLE in the image
2. ONLY include jersey numbers that you can ACTUALLY READ in the image
3. If you CANNOT see any jerseys, you MUST return {"jerseys": []}
4. DO NOT make up, imagine, or guess jersey numbers that aren't visible
5. DO NOT include jerseys if you cannot clearly see the number

RESPONSE FORMAT:
Respond ONLY with a valid JSON object. No explanations, no markdown, no extra text.

Use DOUBLE QUOTES (") for all JSON keys and string values.

The JSON must have a single key "jerseys" with an array of dictionaries.

Each dictionary must have exactly these four keys:
- "jersey_number": The number on the jersey (as a string, only if clearly visible)
- "jersey_color": The primary color of the jersey
- "number_color": The color of the number on the jersey
- "confidence": A number from 0 to 100 representing your confidence in this detection (0 = no confidence, 100 = absolutely certain)

CONFIDENCE SCORING GUIDELINES:
- 90-100: Jersey number is extremely clear and unambiguous
- 70-89: Jersey number is clear but might have minor occlusion or angle issues
- 50-69: Jersey number is partially visible or somewhat unclear
- 30-49: Jersey number is difficult to read but you can make it out
- 0-29: Very uncertain, number is barely visible

Example response for an image WITH visible jerseys:
{
  "jerseys": [
    {
      "jersey_number": "101",
      "jersey_color": "red",
      "number_color": "white",
      "confidence": 95
    },
    {
      "jersey_number": "142",
      "jersey_color": "blue",
      "number_color": "yellow",
      "confidence": 78
    }
  ]
}

Example response for an image WITHOUT jerseys or with unclear numbers:
{"jerseys": []}

REMEMBER: Only include jerseys with numbers you can ACTUALLY SEE in the image. When in doubt, return empty array. Always provide a confidence score that honestly reflects how certain you are about each detection.

Now analyze the image and return the JSON object.
59
llama-swap-config.yaml
Normal file
@ -0,0 +1,59 @@
# llama-swap configuration for jersey detection testing
# ==================================================
# This configuration allows automatic model switching for testing
# different vision language models with the jersey detection test script.
#
# Usage:
#   llama-swap --config llama-swap-config.yaml --listen localhost:8080
#
# Then use the test script with --model-tag:
#   python test_jersey_detection.py ./images jersey_prompt.txt --model-tag "lfm2-vl-1.6b"
#
# llama-swap will automatically load the requested model and swap models
# as needed when you run tests with different --model-tag values.

models:
  # Small vision models (1-4B parameters)
  lfm2-vl-1.6b:
    name: "LiquidAI LFM2-VL 1.6B (F16)"
    cmd: llama-server --no-mmap -ngl 999 -fa on --host 0.0.0.0 --port ${PORT} -hf LiquidAI/LFM2-VL-1.6B-GGUF:F16

  gemma-3-4b:
    name: "Gemma 3 4B Instruct (F16)"
    cmd: llama-server --no-mmap -ngl 999 -fa on --host 0.0.0.0 --port ${PORT} -hf unsloth/gemma-3-4b-it-GGUF:F16

  kimi-vl-3b:
    name: "Kimi VL A3B Thinking (F16)"
    cmd: llama-server --no-mmap -ngl 999 -fa on --host 0.0.0.0 --port ${PORT} -hf ggml-org/Kimi-VL-A3B-Thinking-2506-GGUF:F16

  # Medium vision models (7-12B parameters)
  qwen2.5-vl-7b:
    name: "Qwen2.5-VL 7B Instruct (F16)"
    cmd: llama-server --no-mmap -ngl 999 -fa on --host 0.0.0.0 --port ${PORT} -hf unsloth/Qwen2.5-VL-7B-Instruct-GGUF:F16

  gemma-3-12b:
    name: "Gemma 3 12B Instruct (F16)"
    cmd: llama-server --no-mmap -ngl 999 -fa on --host 0.0.0.0 --port ${PORT} -hf unsloth/gemma-3-12b-it-GGUF:F16

  # Large models (24-27B parameters)
  mistral-small-24b-q8:
    name: "Mistral Small 3.2 24B Instruct (Q8_K_XL)"
    cmd: llama-server --no-mmap -ngl 999 -fa on --host 0.0.0.0 --port ${PORT} -hf unsloth/Mistral-Small-3.2-24B-Instruct-2506-GGUF:Q8_K_XL

  mistral-small-24b-q4:
    name: "Mistral Small 3.2 24B Instruct (Q4_K_XL)"
    cmd: llama-server --no-mmap -ngl 999 -fa on --host 0.0.0.0 --port ${PORT} -hf unsloth/Mistral-Small-3.2-24B-Instruct-2506-GGUF:Q4_K_XL

  gemma-3-27b:
    name: "Gemma 3 27B Instruct (Q8_0)"
    cmd: llama-server --no-mmap -ngl 999 -fa on --host 0.0.0.0 --port ${PORT} -hf unsloth/gemma-3-27b-it-GGUF:Q8_0

# Optional: Automatically unload models after 5 minutes of inactivity
# Uncomment to enable
# ttl: 300

# Optional: Preload a specific model on startup
# Uncomment to enable
# hooks:
#   onStartup:
#     - loadModel: qwen2.5-vl-7b
9
requirements.txt
Normal file
@ -0,0 +1,9 @@
# Jersey Detection Test Dependencies
# Install with: pip install -r requirements.txt

# HTTP client for llama.cpp server communication
requests>=2.28.0

# Image processing
opencv-python>=4.8.0
numpy>=1.24.0
1
scan_utils/__init__.py
Normal file
@ -0,0 +1 @@
# Jersey detection scan utilities
149
scan_utils/jersey_detection.py
Normal file
@ -0,0 +1,149 @@
import json
import cv2
import numpy as np
from typing import Dict, Any, Optional
import logging

# Read the default jersey detection prompt
try:
    with open('jersey_prompt.txt', 'r') as f:
        DEFAULT_JERSEY_PROMPT = f.read()
except FileNotFoundError:
    # Fallback prompt if file is not found
    DEFAULT_JERSEY_PROMPT = """You are an expert at detecting sports jerseys in images. Carefully examine the provided image and identify all visible sports jerseys.

CRITICAL INSTRUCTIONS:
1. ONLY detect jerseys that are CLEARLY VISIBLE in the image
2. ONLY include jersey numbers that you can ACTUALLY READ in the image
3. If you CANNOT see any jerseys, you MUST return {"jerseys": []}
4. DO NOT make up, imagine, or guess jersey numbers that aren't visible
5. DO NOT include jerseys if you cannot clearly see the number

RESPONSE FORMAT:
Respond ONLY with a valid JSON object. No explanations, no markdown, no extra text.

Use DOUBLE QUOTES (") for all JSON keys and string values.

The JSON must have a single key "jerseys" with an array of dictionaries.

Each dictionary must have exactly these three keys:
- "jersey_number": The number on the jersey (as a string, only if clearly visible)
- "jersey_color": The primary color of the jersey
- "number_color": The color of the number on the jersey

Example response for an image WITH visible jerseys:
{
  "jerseys": [
    {
      "jersey_number": "101",
      "jersey_color": "red",
      "number_color": "white"
    }
  ]
}

Example response for an image WITHOUT jerseys or with unclear numbers:
{"jerseys": []}

REMEMBER: Only include jerseys with numbers you can ACTUALLY SEE in the image. When in doubt, return empty array.

Now analyze the image and return the JSON object."""


class DetectJerseys:
    """A class for detecting sports jerseys using a vision language model."""

    def __init__(self, llama_cpp_base_url: str = "http://192.168.1.34:8080", logger: Optional[logging.Logger] = None, prompt: Optional[str] = None):
        """
        Initialize the jersey detection class.

        Args:
            llama_cpp_base_url: Base URL for the llama.cpp server
            logger: Logger instance for logging messages
            prompt: Custom prompt to use for jersey detection (optional)
        """
        self.logger = logger or logging.getLogger(__name__)
        self.prompt = prompt or DEFAULT_JERSEY_PROMPT

        # Import here to avoid circular dependencies
        try:
            from scan_utils.llama_cpp_client import LlamaCppClient
            self.client = LlamaCppClient(base_url=llama_cpp_base_url)
            self.logger.info(f"Jersey detection initialized with llama.cpp server at {llama_cpp_base_url}")
        except ImportError as e:
            self.logger.error(f"Failed to import LlamaCppClient: {e}")
            raise

    def detect(self, image: np.ndarray, temperature: float = 0.1) -> Dict[str, Any]:
        """
        Detect jerseys in an image using the vision language model.

        Args:
            image: OpenCV image (numpy array) to analyze
            temperature: Temperature value for the model (default: 0.1)

        Returns:
            Dictionary containing detected jerseys or empty dict if invalid
        """
        try:
            # Create multimodal message with image and prompt
            message = self.client.create_multimodal_message(
                role="user",
                content=self.prompt,
                images=[image]
            )

            # Send chat completion request
            response = self.client.chat_completion(
                messages=[message],
                temperature=temperature,
                max_tokens=1000
            )

            # Extract the response text
            if 'choices' in response and len(response['choices']) > 0:
                response_text = response['choices'][0]['message']['content']

                # Log the raw response for debugging
                self.logger.debug(f"Raw VLM response: {response_text}")

                # Parse JSON response
                try:
                    result = json.loads(response_text)

                    # Process jerseys to ensure they have all required fields
                    jerseys = result.get('jerseys', [])

                    # Hallucination detection: filter out example numbers from the prompt
                    # Using numbers > 100 as examples to avoid filtering valid jersey numbers
                    HALLUCINATION_NUMBERS = {'101', '102', '103', '142', '199'}

                    processed_jerseys = []
                    for jersey in jerseys:
                        jersey_number = jersey.get('jersey_number', '')

                        # Check for hallucination (model returning example numbers)
                        if jersey_number in HALLUCINATION_NUMBERS:
                            self.logger.warning(f"Possible hallucination detected - jersey number {jersey_number} matches example pattern. Filtering out.")
                            continue

                        # Ensure all required fields are present
                        processed_jersey = {
                            'jersey_number': jersey_number,
                            'jersey_color': jersey.get('jersey_color', ''),
                            'number_color': jersey.get('number_color', 'unknown')  # Default to 'unknown' if missing
                        }
                        processed_jerseys.append(processed_jersey)

                    return {"jerseys": processed_jerseys}
                except json.JSONDecodeError as e:
                    self.logger.error(f"Failed to parse JSON response: {e}")
                    self.logger.debug(f"Response text was: {response_text}")
                    return {"jerseys": []}
            else:
                self.logger.warning("Empty response from VLM")
                return {"jerseys": []}

        except Exception as e:
            self.logger.error(f"Error during jersey detection: {e}")
            return {"jerseys": []}
237
scan_utils/llama_cpp_client.py
Normal file
@ -0,0 +1,237 @@
import base64
import json
import cv2
import numpy as np
import requests
from typing import List, Dict, Any, Optional, Union


class LlamaCppClient:
    """A Python client for interacting with a llama.cpp server."""

    def __init__(self, base_url: str = "http://192.168.1.34:8080"):
        """
        Initialize the client with the base URL of the llama.cpp server.

        Args:
            base_url: The base URL of the llama.cpp server (default: http://192.168.1.34:8080)
        """
        self.base_url = base_url.rstrip('/')

    def health_check(self) -> Dict[str, Any]:
        """
        Check the health status of the server.

        Returns:
            Health status response from the server
        """
        response = requests.get(f"{self.base_url}/health")
        response.raise_for_status()
        return response.json()

    def get_models(self) -> Dict[str, Any]:
        """
        Get information about loaded models.

        Returns:
            Model information from the server
        """
        response = requests.get(f"{self.base_url}/v1/models")
        response.raise_for_status()
        return response.json()

    def chat_completion(
        self,
        messages: List[Dict[str, Any]],
        temperature: float = 0.1,
        min_p: float = 0.15,
        repetition_penalty: float = 1.05,
        min_image_tokens: int = 64,
        max_image_tokens: int = 256,
        do_image_splitting: bool = True,
        max_tokens: int = -1,
        stream: bool = False,
        **kwargs
    ) -> Union[Dict[str, Any], requests.Response]:
        """
        Generate a chat completion using the OpenAI-compatible API.

        Args:
            messages: List of message dictionaries with role and content
            temperature: Sampling temperature (default: 0.1)
            min_p: Minimum probability for sampling (default: 0.15)
            repetition_penalty: Repetition penalty factor (default: 1.05)
            min_image_tokens: Minimum image tokens (default: 64)
            max_image_tokens: Maximum image tokens (default: 256)
            do_image_splitting: Whether to split images (default: True)
            max_tokens: Maximum tokens to generate (default: -1 for infinity)
            stream: Whether to stream the response (default: False)
            **kwargs: Additional parameters for the completion

        Returns:
            Completion response or streaming response
        """
        payload = {
            "messages": messages,
            "temperature": temperature,
            "min_p": min_p,
            "repetition_penalty": repetition_penalty,
            "min_image_tokens": min_image_tokens,
            "max_image_tokens": max_image_tokens,
            "do_image_splitting": do_image_splitting,
            "max_tokens": max_tokens,
            "cache_prompt": True,
            "stream": stream,
            **kwargs
        }

        # Debug: Show model parameter if present (for llama-swap debugging)
        if 'model' in payload and payload['model']:
            import os
            if os.environ.get('DEBUG_LLAMA_SWAP'):
                print(f"[DEBUG] Requesting model: {payload['model']}")

        response = requests.post(
            f"{self.base_url}/v1/chat/completions",
            headers={"Content-Type": "application/json"},
            json=payload,
            stream=stream
        )
        response.raise_for_status()

        if stream:
            return response

        return response.json()

    def completion(
        self,
        prompt: Union[str, List[Union[str, int]]],
        temperature: float = 0.1,
        min_p: float = 0.15,
        repetition_penalty: float = 1.05,
        min_image_tokens: int = 64,
        max_image_tokens: int = 256,
        do_image_splitting: bool = True,
        max_tokens: int = -1,
        stream: bool = False,
        **kwargs
    ) -> Union[Dict[str, Any], requests.Response]:
        """
        Generate a completion using the non-OAI compatible API.

        Args:
            prompt: The prompt string or list of tokens
            temperature: Sampling temperature (default: 0.1)
            min_p: Minimum probability for sampling (default: 0.15)
            repetition_penalty: Repetition penalty factor (default: 1.05)
            min_image_tokens: Minimum image tokens (default: 64)
            max_image_tokens: Maximum image tokens (default: 256)
            do_image_splitting: Whether to split images (default: True)
            max_tokens: Maximum tokens to generate (default: -1 for infinity)
            stream: Whether to stream the response (default: False)
            **kwargs: Additional parameters for the completion

        Returns:
            Completion response or streaming response
        """
        payload = {
            "prompt": prompt,
            "temperature": temperature,
            "min_p": min_p,
            "repeat_penalty": repetition_penalty,
            "min_image_tokens": min_image_tokens,
            "max_image_tokens": max_image_tokens,
            "do_image_splitting": do_image_splitting,
            "cache_prompt": True,
            "n_predict": max_tokens,
            "stream": stream,
            **kwargs
        }

        response = requests.post(
            f"{self.base_url}/completion",
            headers={"Content-Type": "application/json"},
            json=payload,
            stream=stream
        )
        response.raise_for_status()

        if stream:
            return response

        return response.json()

    @staticmethod
    def _encode_image_to_base64(image_path: str) -> str:
        """
        Encode an image file to base64 string.

        Args:
            image_path: Path to the image file

        Returns:
            Base64 encoded image string
        """
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')

    @staticmethod
    def _encode_cv2_image_to_base64(image: np.ndarray) -> str:
        """
        Encode an OpenCV image to base64 string.

        Args:
            image: OpenCV image (numpy array)

        Returns:
            Base64 encoded image string
        """
        _, buffer = cv2.imencode('.jpg', image)
        return base64.b64encode(buffer).decode('utf-8')

    def create_multimodal_message(
        self,
        role: str,
        content: str,
        images: Optional[List[Union[str, np.ndarray]]] = None
    ) -> Dict[str, Any]:
        """
        Create a multimodal message with text and images.

        Args:
            role: Role of the message (system, user, assistant)
            content: Text content of the message
            images: List of image paths or OpenCV images (numpy arrays)

        Returns:
            Formatted message dictionary
        """
        if not images:
            return {"role": role, "content": content}

        # Process images
        image_data = []
        for img in images:
            if isinstance(img, str):
                # Image path
                encoded_image = self._encode_image_to_base64(img)
            else:
                # OpenCV image
                encoded_image = self._encode_cv2_image_to_base64(img)
            image_data.append(encoded_image)

        # Create multimodal content
        multimodal_content = [
            {"type": "text", "text": content}
        ]

        for img_data in image_data:
            multimodal_content.append({
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/jpeg;base64,{img_data}"
                }
            })

        return {"role": role, "content": multimodal_content}
263
test_all_models.sh
Executable file
@ -0,0 +1,263 @@
#!/bin/bash
# ==============================================================================
# Test All Models Script for Jersey Detection
# ==============================================================================
# This script automatically tests all models defined in llama-swap-config.yaml
# with the jersey detection test suite.
#
# Usage:
#   ./test_all_models.sh
#   ./test_all_models.sh /path/to/images
#   RESIZE=2048 ./test_all_models.sh
#   OUTPUT_FILE=custom_results.jsonl ./test_all_models.sh
# ==============================================================================

# Note: We don't use 'set -e' here because we have explicit error handling
# in the test loop and want to give the user the option to continue on failures

# ==============================================================================
# Configuration Variables
# ==============================================================================

# Image directory containing test images
IMAGES_DIR="${1:-./test_images}"

# Prompt file to use for testing
PROMPT_FILE="${PROMPT_FILE:-jersey_prompt_with_confidence.txt}"

# Resize images to this max dimension (set to empty string to disable)
RESIZE="${RESIZE:-1024}"

# Output file for results
OUTPUT_FILE="${OUTPUT_FILE:-jersey_detection_results.jsonl}"

# llama-swap configuration file
LLAMA_SWAP_CONFIG="${LLAMA_SWAP_CONFIG:-llama-swap-config.yaml}"

# Server URL
SERVER_URL="${SERVER_URL:-http://localhost:8080}"

# ==============================================================================
# Color codes for output
# ==============================================================================
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
NC='\033[0m' # No Color

# ==============================================================================
# Helper Functions
# ==============================================================================

print_header() {
    echo -e "${CYAN}============================================================================${NC}"
    echo -e "${CYAN}$1${NC}"
    echo -e "${CYAN}============================================================================${NC}"
}

print_info() {
    echo -e "${BLUE}[INFO]${NC} $1"
}

print_success() {
    echo -e "${GREEN}[SUCCESS]${NC} $1"
}

print_error() {
    echo -e "${RED}[ERROR]${NC} $1"
}

print_warning() {
    echo -e "${YELLOW}[WARNING]${NC} $1"
}

# ==============================================================================
# Validation
# ==============================================================================

print_header "Jersey Detection - Test All Models"

# Check if images directory exists
if [ ! -d "$IMAGES_DIR" ]; then
    print_error "Image directory not found: $IMAGES_DIR"
    echo "Usage: $0 <image_directory>"
    exit 1
fi

# Check if prompt file exists
if [ ! -f "$PROMPT_FILE" ]; then
    print_error "Prompt file not found: $PROMPT_FILE"
    exit 1
fi

# Check if llama-swap config exists
if [ ! -f "$LLAMA_SWAP_CONFIG" ]; then
    print_error "llama-swap config not found: $LLAMA_SWAP_CONFIG"
    exit 1
fi

# Check if test script exists
if [ ! -f "test_jersey_detection.py" ]; then
    print_error "test_jersey_detection.py not found in current directory"
    exit 1
fi

# Check if server is running
print_info "Checking if llama-swap server is running at $SERVER_URL..."
if ! curl -s "$SERVER_URL/health" > /dev/null 2>&1; then
    print_error "Cannot connect to llama-swap at $SERVER_URL"
    echo ""
    echo "Please start llama-swap first:"
    echo "  llama-swap --config $LLAMA_SWAP_CONFIG --listen localhost:8080"
    echo ""
    exit 1
fi
print_success "Server is running"

# ==============================================================================
# Extract model tags from YAML
# ==============================================================================

print_info "Extracting model tags from $LLAMA_SWAP_CONFIG..."

# Extract model IDs (keys under 'models:')
# This uses grep and sed to parse the YAML (simple parser, works for our format)
MODEL_TAGS=$(grep "^  [a-z]" "$LLAMA_SWAP_CONFIG" | \
    grep -v "    " | \
    sed 's/:.*//' | \
    sed 's/^  //')

if [ -z "$MODEL_TAGS" ]; then
    print_error "No model tags found in $LLAMA_SWAP_CONFIG"
    exit 1
fi

# Convert to array
readarray -t MODELS <<< "$MODEL_TAGS"

MODEL_COUNT=${#MODELS[@]}
print_success "Found $MODEL_COUNT models to test"

# ==============================================================================
# Display Configuration
# ==============================================================================

echo ""
print_info "Test Configuration:"
echo "  Images directory: $IMAGES_DIR"
echo "  Prompt file: $PROMPT_FILE"
echo "  Resize: ${RESIZE:-Disabled}"
echo "  Output file: $OUTPUT_FILE"
echo "  Server URL: $SERVER_URL"
echo "  Models to test: $MODEL_COUNT"
echo ""

# List all models
print_info "Models:"
for i in "${!MODELS[@]}"; do
    echo "  $((i+1)). ${MODELS[$i]}"
done
echo ""

# ==============================================================================
# Confirmation
# ==============================================================================

read -p "Continue with testing? (y/N) " -n 1 -r
echo
if [[ ! $REPLY =~ ^[Yy]$ ]]; then
    print_warning "Testing cancelled"
    exit 0
fi

# ==============================================================================
# Run Tests
# ==============================================================================

print_header "Starting Tests"

START_TIME=$(date +%s)
SUCCESSFUL=0
FAILED=0

for i in "${!MODELS[@]}"; do
    MODEL="${MODELS[$i]}"
    MODEL_NUM=$((i+1))

    echo ""
    print_header "Testing Model $MODEL_NUM/$MODEL_COUNT: $MODEL"

    # Build command
    CMD="python test_jersey_detection.py \"$IMAGES_DIR\" \"$PROMPT_FILE\""
    CMD="$CMD --model-tag \"$MODEL\""
    CMD="$CMD --output-file \"$OUTPUT_FILE\""
    CMD="$CMD --server-url \"$SERVER_URL\""

    # Add resize if configured
    if [ -n "$RESIZE" ]; then
        CMD="$CMD --resize $RESIZE"
    fi

    print_info "Running: $CMD"
    echo ""

    # Run the test
    if eval "$CMD"; then
        print_success "Model $MODEL completed successfully"
        SUCCESSFUL=$((SUCCESSFUL + 1))
    else
        print_error "Model $MODEL failed"
        FAILED=$((FAILED + 1))

        # Ask if user wants to continue
        echo ""
        read -p "Continue with remaining models? (Y/n) " -n 1 -r
        echo
        if [[ $REPLY =~ ^[Nn]$ ]]; then
            print_warning "Testing stopped by user"
            break
        fi
    fi

    # Show progress
    if [ $MODEL_NUM -lt $MODEL_COUNT ]; then
        print_info "Progress: $MODEL_NUM/$MODEL_COUNT models completed"
    fi
done

# ==============================================================================
# Summary
# ==============================================================================

END_TIME=$(date +%s)
DURATION=$((END_TIME - START_TIME))
MINUTES=$((DURATION / 60))
SECONDS=$((DURATION % 60))

echo ""
print_header "Testing Complete"
echo ""
print_info "Summary:"
echo "  Total models: $MODEL_COUNT"
echo "  Successful: $SUCCESSFUL"
echo "  Failed: $FAILED"
echo "  Total time: ${MINUTES}m ${SECONDS}s"
echo ""

if [ $SUCCESSFUL -gt 0 ]; then
    print_success "Results saved to: $OUTPUT_FILE"
    echo ""
    print_info "Analyze results with:"
    echo "  python analyze_jersey_results.py $OUTPUT_FILE"
fi

echo ""

# Exit with error code if any tests failed
if [ $FAILED -gt 0 ]; then
    exit 1
fi

exit 0
971
test_jersey_detection.py
Executable file
971
test_jersey_detection.py
Executable file
@ -0,0 +1,971 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Test script for evaluating jersey detection performance with different models and prompts.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python test_jersey_detection.py <image_directory> <prompt_file> [options]
|
||||||
|
|
||||||
|
Arguments:
|
||||||
|
image_directory: Path to directory containing test images
|
||||||
|
prompt_file: Path to text file containing the prompt to use
|
||||||
|
--model-name: Name of the model being tested (optional, auto-detected from server if not provided)
|
||||||
|
--model-tag: Model tag for llama-swap integration (optional)
|
||||||
|
--server-url: Optional llama.cpp server URL (default: read from scan.ini)
|
||||||
|
--output-file: Output file for results (default: jersey_detection_results.jsonl)
|
||||||
|
--resize: Maximum image dimension for resizing before processing
|
||||||
|
|
||||||
|
Ground Truth:
|
||||||
|
Expected jersey numbers are parsed from filenames using dash-separated format:
|
||||||
|
Example: 1122-8-10-29.jpg expects jerseys 8, 10, and 29
|
||||||
|
|
||||||
|
The script calculates precision, recall, F1 score, and confidence calibration metrics
|
||||||
|
to evaluate model accuracy against known correct results.
|
||||||
|
|
||||||
|
Output Files:
|
||||||
|
<output_file>: Summary statistics with ground truth metrics (default: jersey_detection_results.jsonl)
|
||||||
|
|
||||||
|
Example:
|
||||||
|
# Auto-detect model name from server
|
||||||
|
python test_jersey_detection.py ./images jersey_prompt.txt
|
||||||
|
|
||||||
|
# Resize images to 1024px max dimension before processing
|
||||||
|
python test_jersey_detection.py ./images jersey_prompt.txt --resize 1024
|
||||||
|
|
||||||
|
# Use llama-swap to automatically load a specific model
|
||||||
|
python test_jersey_detection.py ./images jersey_prompt.txt --model-tag "qwen2.5-vl-7b" --resize 1024
|
||||||
|
|
||||||
|
# Specify custom model name (for tracking in results)
|
||||||
|
python test_jersey_detection.py ./images jersey_prompt.txt --model-name "llama-3.2-vision"
|
||||||
|
python test_jersey_detection.py ./images jersey_prompt_with_confidence.txt --model-name "qwen2-vl" --resize 1024
|
||||||
|
|
||||||
|
After running tests, analyze results with:
|
||||||
|
python analyze_jersey_results.py # Performance and accuracy analysis
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import configparser
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import requests
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict, List, Any, Optional
|
||||||
|
import cv2
|
||||||
|
|
||||||
|
# Add parent directory to path for imports
|
||||||
|
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||||
|
|
||||||
|
from scan_utils.llama_cpp_client import LlamaCppClient
|
||||||
|
|
||||||
|
|
||||||
|
# Hallucination detection: filter out example numbers from prompts
|
||||||
|
# Using numbers > 100 as examples to avoid filtering valid jersey numbers
|
||||||
|
HALLUCINATION_NUMBERS = {'101', '102', '103', '142', '199'}
|
||||||
|
|
||||||
|
|
||||||
|
def parse_expected_jerseys(filename: str) -> List[str]:
|
||||||
|
"""
|
||||||
|
Parse expected jersey numbers from filename.
|
||||||
|
|
||||||
|
Format: prefix-number1-number2-number3.ext
|
||||||
|
Example: 1122-8-10-29.jpg -> ['8', '10', '29']
|
||||||
|
|
||||||
|
Args:
|
||||||
|
filename: Image filename
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of expected jersey numbers as strings
|
||||||
|
"""
|
||||||
|
# Remove extension
|
||||||
|
name_without_ext = Path(filename).stem
|
||||||
|
|
||||||
|
# Split by dash
|
||||||
|
parts = name_without_ext.split('-')
|
||||||
|
|
||||||
|
# First part is typically a prefix/identifier, rest are jersey numbers
|
||||||
|
# Skip the first part and collect numeric parts
|
||||||
|
expected = []
|
||||||
|
for i, part in enumerate(parts[1:], 1): # Skip first part
|
||||||
|
# Check if part is numeric (jersey number)
|
||||||
|
if part.isdigit():
|
||||||
|
expected.append(part)
|
||||||
|
|
||||||
|
return expected
|
||||||
|
|
||||||
|
|
||||||
|
def clean_response(text: str) -> str:
|
||||||
|
"""
|
||||||
|
Clean the response by removing think tags and markdown code blocks.
|
||||||
|
Some models use <think> tags for chain-of-thought reasoning and wrap JSON in markdown.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
text: Raw response text
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Cleaned text ready for JSON parsing
|
||||||
|
"""
|
||||||
|
# Remove <think>...</think> tags and their content (standard angle brackets)
|
||||||
|
cleaned = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL | re.IGNORECASE)
|
||||||
|
# Remove ◁think▷...◁/think▷ tags (unicode triangle brackets)
|
||||||
|
cleaned = re.sub(r'◁think▷.*?◁/think▷', '', cleaned, flags=re.DOTALL | re.IGNORECASE)
|
||||||
|
# Also remove any standalone think tags (both formats)
|
||||||
|
cleaned = re.sub(r'</?think>', '', cleaned, flags=re.IGNORECASE)
|
||||||
|
cleaned = re.sub(r'◁/?think▷', '', cleaned, flags=re.IGNORECASE)
|
||||||
|
|
||||||
|
# Remove markdown code blocks (```json ... ``` or ``` ... ```)
|
||||||
|
# First try to extract content from ```json blocks
|
||||||
|
json_block_match = re.search(r'```(?:json)?\s*\n?(.*?)\n?```', cleaned, flags=re.DOTALL | re.IGNORECASE)
|
||||||
|
if json_block_match:
|
||||||
|
# Extract just the content inside the code block
|
||||||
|
cleaned = json_block_match.group(1)
|
||||||
|
else:
|
||||||
|
# If no code block, just remove any stray ``` markers
|
||||||
|
cleaned = re.sub(r'```(?:json)?', '', cleaned, flags=re.IGNORECASE)
|
||||||
|
|
||||||
|
return cleaned.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def get_llama_server_url_from_config() -> Optional[str]:
|
||||||
|
"""
|
||||||
|
Read the LLAMA_CPP_SERVER_URL from scan.ini.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Server URL from config or None if not found
|
||||||
|
"""
|
||||||
|
config_path = os.path.join(os.path.dirname(__file__), 'scan.ini')
|
||||||
|
|
||||||
|
if not os.path.exists(config_path):
|
||||||
|
return None
|
||||||
|
|
||||||
|
try:
|
||||||
|
config = configparser.ConfigParser()
|
||||||
|
config.read(config_path)
|
||||||
|
|
||||||
|
if 'DEFAULT' in config and 'LLAMA_CPP_SERVER_URL' in config['DEFAULT']:
|
||||||
|
return config['DEFAULT']['LLAMA_CPP_SERVER_URL']
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Warning: Failed to read scan.ini: {e}")
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
class JerseyDetectionTester:
|
||||||
|
"""Test runner for jersey detection evaluation."""
|
||||||
|
|
||||||
|
def __init__(self, server_url: str, prompt: str, model_name: Optional[str] = None, resize_max: Optional[int] = None, model_tag: Optional[str] = None):
|
||||||
|
"""
|
||||||
|
Initialize the tester.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
server_url: Base URL for the llama.cpp server
|
||||||
|
prompt: Prompt text to use for detection
|
||||||
|
model_name: Name of the model being tested (optional)
|
||||||
|
resize_max: Maximum image dimension (resize if larger, None = no resize)
|
||||||
|
model_tag: Model tag for llama-swap integration (optional)
|
||||||
|
"""
|
||||||
|
self.client = LlamaCppClient(base_url=server_url)
|
||||||
|
self.prompt = prompt
|
||||||
|
self.model_name = model_name or "unknown"
|
||||||
|
self.resize_max = resize_max
|
||||||
|
self.model_tag = model_tag
|
||||||
|
self.results = []
|
||||||
|
|
||||||
|
def test_image(self, image_path: str) -> Dict[str, Any]:
|
||||||
|
"""
|
||||||
|
Test jersey detection on a single image.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
image_path: Path to the image file
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dictionary containing test results for this image
|
||||||
|
"""
|
||||||
|
start_time = time.time()
|
||||||
|
|
||||||
|
# Load image
|
||||||
|
image = cv2.imread(image_path)
|
||||||
|
if image is None:
|
||||||
|
filename = Path(image_path).name
|
||||||
|
expected_jerseys = parse_expected_jerseys(filename)
|
||||||
|
return {
|
||||||
|
'image_path': image_path,
|
||||||
|
'error': 'Failed to load image',
|
||||||
|
'jerseys': [],
|
||||||
|
'processing_time': 0,
|
||||||
|
'resized': False,
|
||||||
|
'original_size': None,
|
||||||
|
'final_size': None,
|
||||||
|
'expected_jerseys': expected_jerseys,
|
||||||
|
'detected_jerseys': [],
|
||||||
|
'true_positives': [],
|
||||||
|
'false_positives': [],
|
||||||
|
'false_negatives': expected_jerseys,
|
||||||
|
'precision': 0.0,
|
||||||
|
'recall': 0.0,
|
||||||
|
'f1_score': 0.0,
|
||||||
|
'avg_confidence_correct': None,
|
||||||
|
'avg_confidence_incorrect': None,
|
||||||
|
'confidence_correct_count': 0,
|
||||||
|
'confidence_incorrect_count': 0
|
||||||
|
}
|
||||||
|
|
||||||
|
# Track original size
|
||||||
|
original_height, original_width = image.shape[:2]
|
||||||
|
original_size = (original_width, original_height)
|
||||||
|
resized = False
|
||||||
|
|
||||||
|
# Resize if needed
|
||||||
|
if self.resize_max and (original_width > self.resize_max or original_height > self.resize_max):
|
||||||
|
# Calculate new dimensions maintaining aspect ratio
|
||||||
|
if original_width > original_height:
|
||||||
|
new_width = self.resize_max
|
||||||
|
new_height = int(original_height * (self.resize_max / original_width))
|
||||||
|
else:
|
||||||
|
new_height = self.resize_max
|
||||||
|
new_width = int(original_width * (self.resize_max / original_height))
|
||||||
|
|
||||||
|
# Resize image
|
||||||
|
image = cv2.resize(image, (new_width, new_height), interpolation=cv2.INTER_AREA)
|
||||||
|
resized = True
|
||||||
|
|
||||||
|
final_height, final_width = image.shape[:2]
|
||||||
|
final_size = (final_width, final_height)
|
||||||
|
|
||||||
|
# Create multimodal message
|
||||||
|
message = self.client.create_multimodal_message(
|
||||||
|
role="user",
|
||||||
|
content=self.prompt,
|
||||||
|
images=[image]
|
||||||
|
)
|
||||||
|
|
||||||
|
# Send to LLM
|
||||||
|
try:
|
||||||
|
# Prepare kwargs for chat completion
|
||||||
|
completion_kwargs = {
|
||||||
|
'messages': [message],
|
||||||
|
'temperature': 0.1,
|
||||||
|
'max_tokens': 1000
|
||||||
|
}
|
||||||
|
|
||||||
|
# Add model parameter if model_tag is specified (for llama-swap)
|
||||||
|
if self.model_tag:
|
||||||
|
completion_kwargs['model'] = self.model_tag
|
||||||
|
# Note: We don't print this for every image to avoid spam, but it's being sent
|
||||||
|
|
||||||
|
response = self.client.chat_completion(**completion_kwargs)
|
||||||
|
|
||||||
|
processing_time = time.time() - start_time
|
||||||
|
|
||||||
|
# Extract response text
|
||||||
|
if 'choices' in response and len(response['choices']) > 0:
|
||||||
|
response_text = response['choices'][0]['message']['content']
|
||||||
|
|
||||||
|
# Clean response (remove think tags and markdown code blocks)
|
||||||
|
cleaned_text = clean_response(response_text)
|
||||||
|
|
||||||
|
# Parse JSON response
|
||||||
|
try:
|
||||||
|
result = json.loads(cleaned_text)
|
||||||
|
jerseys = result.get('jerseys', [])
|
||||||
|
|
||||||
|
# Apply hallucination detection
|
||||||
|
filtered_jerseys = []
|
||||||
|
hallucinated_count = 0
|
||||||
|
|
||||||
|
for jersey in jerseys:
|
||||||
|
jersey_number = jersey.get('jersey_number', '')
|
||||||
|
|
||||||
|
# Check for hallucination (model returning example numbers)
|
||||||
|
if jersey_number in HALLUCINATION_NUMBERS:
|
||||||
|
hallucinated_count += 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
filtered_jerseys.append(jersey)
|
||||||
|
|
||||||
|
# Ground truth comparison
|
||||||
|
filename = Path(image_path).name
|
||||||
|
expected_jerseys = set(parse_expected_jerseys(filename))
|
||||||
|
detected_jerseys = set(jersey.get('jersey_number', '') for jersey in filtered_jerseys if jersey.get('jersey_number', ''))
|
||||||
|
|
||||||
|
# Calculate ground truth metrics
|
||||||
|
true_positives = expected_jerseys & detected_jerseys # Correctly detected
|
||||||
|
false_positives = detected_jerseys - expected_jerseys # Detected but not expected
|
||||||
|
false_negatives = expected_jerseys - detected_jerseys # Expected but not detected
|
||||||
|
|
||||||
|
# Calculate precision, recall, F1
|
||||||
|
tp_count = len(true_positives)
|
||||||
|
fp_count = len(false_positives)
|
||||||
|
fn_count = len(false_negatives)
|
||||||
|
|
||||||
|
precision = tp_count / (tp_count + fp_count) if (tp_count + fp_count) > 0 else 0.0
|
||||||
|
recall = tp_count / (tp_count + fn_count) if (tp_count + fn_count) > 0 else 0.0
|
||||||
|
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
|
||||||
|
|
||||||
|
# Handle edge case: if no expected jerseys, precision is 1.0 if no detections, else 0.0
|
||||||
|
if len(expected_jerseys) == 0:
|
||||||
|
precision = 1.0 if len(detected_jerseys) == 0 else 0.0
|
||||||
|
recall = 1.0 # No jerseys to detect
|
||||||
|
f1_score = 1.0 if len(detected_jerseys) == 0 else 0.0
|
||||||
|
|
||||||
|
# Calculate confidence scores for correct vs incorrect detections
|
||||||
|
confidence_correct = [] # Confidence for true positives
|
||||||
|
confidence_incorrect = [] # Confidence for false positives
|
||||||
|
|
||||||
|
for jersey in filtered_jerseys:
|
||||||
|
jersey_number = jersey.get('jersey_number', '')
|
||||||
|
confidence = jersey.get('confidence')
|
||||||
|
|
||||||
|
if confidence is not None:
|
||||||
|
if jersey_number in true_positives:
|
||||||
|
confidence_correct.append(confidence)
|
||||||
|
elif jersey_number in false_positives:
|
||||||
|
confidence_incorrect.append(confidence)
|
||||||
|
|
||||||
|
avg_confidence_correct = sum(confidence_correct) / len(confidence_correct) if confidence_correct else None
|
||||||
|
avg_confidence_incorrect = sum(confidence_incorrect) / len(confidence_incorrect) if confidence_incorrect else None
|
||||||
|
|
||||||
|
return {
|
||||||
|
'image_path': image_path,
|
||||||
|
'jerseys': filtered_jerseys,
|
||||||
|
'hallucinated_count': hallucinated_count,
|
||||||
|
'raw_response': cleaned_text,
|
||||||
|
'processing_time': processing_time,
|
||||||
|
'error': None,
|
||||||
|
'resized': resized,
|
||||||
|
'original_size': original_size,
|
||||||
|
'final_size': final_size,
|
||||||
|
# Ground truth metrics
|
||||||
|
'expected_jerseys': sorted(expected_jerseys),
|
||||||
|
'detected_jerseys': sorted(detected_jerseys),
|
||||||
|
'true_positives': sorted(true_positives),
|
||||||
|
'false_positives': sorted(false_positives),
|
||||||
|
'false_negatives': sorted(false_negatives),
|
||||||
|
'precision': precision,
|
||||||
|
'recall': recall,
|
||||||
|
'f1_score': f1_score,
|
||||||
|
# Confidence calibration metrics
|
||||||
|
'avg_confidence_correct': avg_confidence_correct,
|
||||||
|
'avg_confidence_incorrect': avg_confidence_incorrect,
|
||||||
|
'confidence_correct_count': len(confidence_correct),
|
||||||
|
'confidence_incorrect_count': len(confidence_incorrect)
|
||||||
|
}
|
||||||
|
except json.JSONDecodeError as e:
|
||||||
|
filename = Path(image_path).name
|
||||||
|
expected_jerseys = parse_expected_jerseys(filename)
|
||||||
|
return {
|
||||||
|
'image_path': image_path,
|
||||||
|
'error': f'JSON parse error: {e}',
|
||||||
|
'raw_response': cleaned_text,
|
||||||
|
'original_response': response_text if cleaned_text != response_text else None,
|
||||||
|
'jerseys': [],
|
||||||
|
'processing_time': processing_time,
|
||||||
|
'resized': resized,
|
||||||
|
'original_size': original_size,
|
||||||
|
'final_size': final_size,
|
||||||
|
'expected_jerseys': expected_jerseys,
|
||||||
|
'detected_jerseys': [],
|
||||||
|
'true_positives': [],
|
||||||
|
'false_positives': [],
|
||||||
|
'false_negatives': expected_jerseys,
|
||||||
|
'precision': 0.0,
|
||||||
|
'recall': 0.0,
|
||||||
|
'f1_score': 0.0
|
||||||
|
}
|
||||||
|
            else:
                filename = Path(image_path).name
                expected_jerseys = parse_expected_jerseys(filename)
                return {
                    'image_path': image_path,
                    'error': 'Empty response from model',
                    'jerseys': [],
                    'processing_time': processing_time,
                    'resized': resized,
                    'original_size': original_size,
                    'final_size': final_size,
                    'expected_jerseys': expected_jerseys,
                    'detected_jerseys': [],
                    'true_positives': [],
                    'false_positives': [],
                    'false_negatives': expected_jerseys,
                    'precision': 0.0,
                    'recall': 0.0,
                    'f1_score': 0.0
                }

        except Exception as e:
            processing_time = time.time() - start_time
            filename = Path(image_path).name
            expected_jerseys = parse_expected_jerseys(filename)
            return {
                'image_path': image_path,
                'error': f'Request error: {e}',
                'jerseys': [],
                'processing_time': processing_time,
                'resized': resized,
                'original_size': original_size,
                'final_size': final_size,
                'expected_jerseys': expected_jerseys,
                'detected_jerseys': [],
                'true_positives': [],
                'false_positives': [],
                'false_negatives': expected_jerseys,
                'precision': 0.0,
                'recall': 0.0,
                'f1_score': 0.0,
                'avg_confidence_correct': None,
                'avg_confidence_incorrect': None,
                'confidence_correct_count': 0,
                'confidence_incorrect_count': 0
            }

    def test_directory(self, directory_path: str) -> List[Dict[str, Any]]:
        """
        Test all images in a directory.

        Args:
            directory_path: Path to directory containing images

        Returns:
            List of results for all images
        """
        # Get all image files
        image_extensions = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp'}
        image_files = []

        for ext in image_extensions:
            image_files.extend(Path(directory_path).glob(f'*{ext}'))
            image_files.extend(Path(directory_path).glob(f'*{ext.upper()}'))

        # De-duplicate before sorting: on case-insensitive filesystems the
        # lower- and upper-case globs can match the same file twice.
        image_files = sorted(set(image_files))

        if not image_files:
            print(f"No image files found in {directory_path}")
            return []

        print(f"Found {len(image_files)} images to process\n")

        # Process each image
        results = []
        for i, image_path in enumerate(image_files, 1):
            # Show model tag in progress if using llama-swap
            model_info = f" [{self.model_tag}]" if self.model_tag else ""
            print(f"[{i}/{len(image_files)}]{model_info} Processing {image_path.name}...")
            result = self.test_image(str(image_path))
            results.append(result)

            # Display result
            self._display_result(result)
            print()

        return results

    def _display_result(self, result: Dict[str, Any]):
        """Display the result for a single image."""
        if result.get('error'):
            print(f"  ❌ Error: {result['error']}")
            if 'raw_response' in result:
                print(f"  Cleaned response: {result['raw_response']}...")
                if result.get('original_response'):
                    print(f"  (Think tags and/or markdown were filtered from response)")
        else:
            jerseys = result.get('jerseys', [])
            hallucinated_count = result.get('hallucinated_count', 0)

            if jerseys:
                print(f"  ✓ Found {len(jerseys)} jersey(s):")
                for jersey in jerseys:
                    number = jersey.get('jersey_number', 'N/A')
                    jersey_color = jersey.get('jersey_color', 'N/A')
                    number_color = jersey.get('number_color', 'N/A')
                    confidence = jersey.get('confidence', None)

                    conf_str = f" (confidence: {confidence})" if confidence is not None else ""
                    print(f"    - #{number}: {jersey_color} jersey, {number_color} number{conf_str}")
            else:
                print(f"  ○ No jerseys detected")

            if hallucinated_count > 0:
                print(f"  ⚠ Filtered {hallucinated_count} hallucinated detection(s)")

            # Display ground truth comparison
            expected = result.get('expected_jerseys', [])
            detected = result.get('detected_jerseys', [])
            true_positives = result.get('true_positives', [])
            false_positives = result.get('false_positives', [])
            false_negatives = result.get('false_negatives', [])

            if expected:
                print(f"  Ground truth: Expected {expected}, Detected {detected}")
                if true_positives:
                    print(f"    ✓ Correct: {true_positives}")
                if false_positives:
                    print(f"    ✗ False positives: {false_positives}")
                if false_negatives:
                    print(f"    ✗ Missed: {false_negatives}")
                precision = result.get('precision', 0.0)
                recall = result.get('recall', 0.0)
                f1 = result.get('f1_score', 0.0)
                print(f"  Precision: {precision:.2%}, Recall: {recall:.2%}, F1: {f1:.2%}")

        print(f"  Processing time: {result['processing_time']:.2f}s")

    def save_results_to_file(self, results: List[Dict[str, Any]], prompt_file: str, output_file: str = "jersey_detection_results.jsonl"):
        """
        Save test results to a JSON Lines file for later analysis.

        Args:
            results: List of all test results
            prompt_file: Path to the prompt file used
            output_file: Path to output file (default: jersey_detection_results.jsonl)
        """
        # Calculate summary statistics
        total_images = len(results)
        images_with_errors = sum(1 for r in results if r.get('error'))
        images_with_jerseys = sum(1 for r in results if not r.get('error') and len(r.get('jerseys', [])) > 0)
        images_without_jerseys = sum(1 for r in results if not r.get('error') and len(r.get('jerseys', [])) == 0)
        total_jerseys = sum(len(r.get('jerseys', [])) for r in results if not r.get('error'))
        total_hallucinated = sum(r.get('hallucinated_count', 0) for r in results if not r.get('error'))
        total_raw_detections = total_jerseys + total_hallucinated
        total_processing_time = sum(r.get('processing_time', 0) for r in results)
        avg_processing_time = total_processing_time / total_images if total_images > 0 else 0

        # Collect confidence statistics if available
        confidences = [
            jersey.get('confidence')
            for r in results if not r.get('error')
            for jersey in r.get('jerseys', [])
            if 'confidence' in jersey and jersey.get('confidence') is not None
        ]

        confidence_stats = None
        if confidences:
            buckets = {
                '90-100': sum(1 for c in confidences if 90 <= c <= 100),
                '70-89': sum(1 for c in confidences if 70 <= c <= 89),
                '50-69': sum(1 for c in confidences if 50 <= c <= 69),
                '30-49': sum(1 for c in confidences if 30 <= c <= 49),
                '0-29': sum(1 for c in confidences if 0 <= c <= 29)
            }
            confidence_stats = {
                'avg': sum(confidences) / len(confidences),
                'min': min(confidences),
                'max': max(confidences),
                'count': len(confidences),
                'distribution': buckets
            }

        # Calculate resize statistics
        images_resized = sum(1 for r in results if r.get('resized', False))

        # Calculate ground truth statistics
        results_without_errors = [r for r in results if not r.get('error')]
        total_expected_jerseys = sum(len(r.get('expected_jerseys', [])) for r in results_without_errors)
        total_true_positives = sum(len(r.get('true_positives', [])) for r in results_without_errors)
        total_false_positives = sum(len(r.get('false_positives', [])) for r in results_without_errors)
        total_false_negatives = sum(len(r.get('false_negatives', [])) for r in results_without_errors)

        # Calculate overall precision, recall, F1
        overall_precision = total_true_positives / (total_true_positives + total_false_positives) if (total_true_positives + total_false_positives) > 0 else 0.0
        overall_recall = total_true_positives / (total_true_positives + total_false_negatives) if (total_true_positives + total_false_negatives) > 0 else 0.0
        overall_f1 = 2 * (overall_precision * overall_recall) / (overall_precision + overall_recall) if (overall_precision + overall_recall) > 0 else 0.0

        # Average per-image metrics
        avg_precision = sum(r.get('precision', 0.0) for r in results_without_errors) / len(results_without_errors) if results_without_errors else 0.0
        avg_recall = sum(r.get('recall', 0.0) for r in results_without_errors) / len(results_without_errors) if results_without_errors else 0.0
        avg_f1 = sum(r.get('f1_score', 0.0) for r in results_without_errors) / len(results_without_errors) if results_without_errors else 0.0

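        # Note: the "overall" metrics are micro-averaged (pooled over every
        # jersey in the run), while the "avg" metrics are macro-averaged
        # (mean of the per-image scores), so the two can legitimately differ.
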
        # Calculate confidence calibration metrics (correct vs incorrect detections)
        all_confidence_correct = []
        all_confidence_incorrect = []
        for r in results_without_errors:
            if r.get('avg_confidence_correct') is not None:
                # Weight by the count of correct detections in this image
                count = r.get('confidence_correct_count', 0)
                avg_conf = r.get('avg_confidence_correct')
                all_confidence_correct.extend([avg_conf] * count)
            if r.get('avg_confidence_incorrect') is not None:
                # Weight by the count of incorrect detections in this image
                count = r.get('confidence_incorrect_count', 0)
                avg_conf = r.get('avg_confidence_incorrect')
                all_confidence_incorrect.extend([avg_conf] * count)

        overall_avg_confidence_correct = sum(all_confidence_correct) / len(all_confidence_correct) if all_confidence_correct else None
        overall_avg_confidence_incorrect = sum(all_confidence_incorrect) / len(all_confidence_incorrect) if all_confidence_incorrect else None

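        # extend([avg_conf] * count) rebuilds a per-detection list from the
        # stored per-image averages, so the overall mean above is weighted by
        # how many detections each image contributed.
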
        # Create summary record
        summary_record = {
            'timestamp': datetime.now().isoformat(),
            'model_name': self.model_name,
            'model_tag': self.model_tag,
            'prompt_file': prompt_file,
            'prompt_length': len(self.prompt),
            'total_images': total_images,
            'images_with_jerseys': images_with_jerseys,
            'images_without_jerseys': images_without_jerseys,
            'images_with_errors': images_with_errors,
            'total_raw_detections': total_raw_detections,
            'total_valid_jerseys': total_jerseys,
            'total_hallucinated': total_hallucinated,
            'avg_processing_time': avg_processing_time,
            'total_processing_time': total_processing_time,
            'confidence_stats': confidence_stats,
            'empty_response_capable': images_without_jerseys > 0,
            'resize_enabled': self.resize_max is not None,
            'resize_max': self.resize_max,
            'images_resized': images_resized,
            # Ground truth statistics
            'ground_truth': {
                'total_expected': total_expected_jerseys,
                'total_true_positives': total_true_positives,
                'total_false_positives': total_false_positives,
                'total_false_negatives': total_false_negatives,
                'overall_precision': overall_precision,
                'overall_recall': overall_recall,
                'overall_f1': overall_f1,
                'avg_precision': avg_precision,
                'avg_recall': avg_recall,
                'avg_f1': avg_f1,
                # Confidence calibration
                'avg_confidence_correct': overall_avg_confidence_correct,
                'avg_confidence_incorrect': overall_avg_confidence_incorrect,
                'confidence_correct_count': len(all_confidence_correct),
                'confidence_incorrect_count': len(all_confidence_incorrect)
            }
        }

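        # Each run appends one summary line, so the JSONL file accumulates a
        # history of runs that the companion analysis script can compare
        # across models and prompts.
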
        # Append to file
        try:
            with open(output_file, 'a') as f:
                f.write(json.dumps(summary_record) + '\n')
            print(f"\n✓ Results saved to {output_file}")
        except Exception as e:
            print(f"\n❌ Failed to save results: {e}")

    def print_summary(self, results: List[Dict[str, Any]]):
        """
        Print summary statistics for all results.

        Args:
            results: List of all test results
        """
        print("=" * 70)
        print("SUMMARY")
        print("=" * 70)
        print(f"\nModel: {self.model_name}")
        if self.model_tag:
            print(f"Model tag: {self.model_tag}")

        # Display resize info
        if self.resize_max:
            images_resized = sum(1 for r in results if r.get('resized', False))
            print(f"Resize: Enabled (max: {self.resize_max}px, {images_resized} images resized)")
        else:
            print(f"Resize: Disabled")

        total_images = len(results)
        images_with_errors = sum(1 for r in results if r.get('error'))
        images_with_jerseys = sum(1 for r in results if not r.get('error') and len(r.get('jerseys', [])) > 0)
        images_without_jerseys = sum(1 for r in results if not r.get('error') and len(r.get('jerseys', [])) == 0)
        total_jerseys = sum(len(r.get('jerseys', [])) for r in results if not r.get('error'))
        total_hallucinated = sum(r.get('hallucinated_count', 0) for r in results if not r.get('error'))
        total_raw_detections = total_jerseys + total_hallucinated
        total_processing_time = sum(r.get('processing_time', 0) for r in results)
        avg_processing_time = total_processing_time / total_images if total_images > 0 else 0

print(f"\nTotal images processed: {total_images}")
|
||||||
|
print(f" - Images with jerseys: {images_with_jerseys} ({images_with_jerseys/total_images*100:.1f}%)")
|
||||||
|
print(f" - Images without jerseys: {images_without_jerseys} ({images_without_jerseys/total_images*100:.1f}%)")
|
||||||
|
print(f" - Images with errors: {images_with_errors} ({images_with_errors/total_images*100:.1f}%)")
|
||||||
|
print(f"\nJersey detections:")
|
||||||
|
print(f" - Total raw detections: {total_raw_detections}")
|
||||||
|
print(f" - Valid jerseys (after filtering): {total_jerseys}")
|
||||||
|
print(f" - Hallucinations filtered out: {total_hallucinated}")
|
||||||
|
if images_with_jerseys > 0:
|
||||||
|
print(f" - Average valid jerseys per image (when detected): {total_jerseys/images_with_jerseys:.2f}")
|
||||||
|
|
||||||
|
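        # A model that cannot return an empty result will hallucinate numbers
        # on jersey-free frames, inflating false positives; the block below
        # checks whether any empty responses were produced.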
        # Empty response capability (important for evaluating model's ability to return empty results)
        print(f"\nEmpty response capability:")
        print(f"  - Empty responses returned: {images_without_jerseys}")
        print(f"  - Percentage of images: {images_without_jerseys/total_images*100:.1f}%")
        print(f"  - Model can return empty results: {'✓ Yes' if images_without_jerseys > 0 else '✗ No (potential issue)'}")

        if total_hallucinated > 0:
            print(f"\nHallucination detection:")
            print(f"  - Total hallucinated detections filtered: {total_hallucinated}")
            images_with_hallucinations = sum(1 for r in results if not r.get('error') and r.get('hallucinated_count', 0) > 0)
            print(f"  - Images with hallucinations: {images_with_hallucinations} ({images_with_hallucinations/total_images*100:.1f}%)")

        # Ground truth statistics
        results_without_errors = [r for r in results if not r.get('error')]
        total_expected_jerseys = sum(len(r.get('expected_jerseys', [])) for r in results_without_errors)

        if total_expected_jerseys > 0:
            total_true_positives = sum(len(r.get('true_positives', [])) for r in results_without_errors)
            total_false_positives = sum(len(r.get('false_positives', [])) for r in results_without_errors)
            total_false_negatives = sum(len(r.get('false_negatives', [])) for r in results_without_errors)

            # Calculate overall metrics
            overall_precision = total_true_positives / (total_true_positives + total_false_positives) if (total_true_positives + total_false_positives) > 0 else 0.0
            overall_recall = total_true_positives / (total_true_positives + total_false_negatives) if (total_true_positives + total_false_negatives) > 0 else 0.0
            overall_f1 = 2 * (overall_precision * overall_recall) / (overall_precision + overall_recall) if (overall_precision + overall_recall) > 0 else 0.0

            # Calculate average per-image metrics
            avg_precision = sum(r.get('precision', 0.0) for r in results_without_errors) / len(results_without_errors) if results_without_errors else 0.0
            avg_recall = sum(r.get('recall', 0.0) for r in results_without_errors) / len(results_without_errors) if results_without_errors else 0.0
            avg_f1 = sum(r.get('f1_score', 0.0) for r in results_without_errors) / len(results_without_errors) if results_without_errors else 0.0

            print(f"\nGround truth performance:")
            print(f"  - Total expected jerseys: {total_expected_jerseys}")
            print(f"  - True positives: {total_true_positives}")
            print(f"  - False positives: {total_false_positives}")
            print(f"  - False negatives: {total_false_negatives}")
            print(f"\n  Overall metrics (across all jerseys):")
            print(f"  - Precision: {overall_precision:.2%}")
            print(f"  - Recall: {overall_recall:.2%}")
            print(f"  - F1 Score: {overall_f1:.2%}")
            print(f"\n  Average per-image metrics:")
            print(f"  - Avg Precision: {avg_precision:.2%}")
            print(f"  - Avg Recall: {avg_recall:.2%}")
            print(f"  - Avg F1 Score: {avg_f1:.2%}")

            # Confidence calibration metrics
            all_confidence_correct = []
            all_confidence_incorrect = []
            for r in results_without_errors:
                if r.get('avg_confidence_correct') is not None:
                    count = r.get('confidence_correct_count', 0)
                    avg_conf = r.get('avg_confidence_correct')
                    all_confidence_correct.extend([avg_conf] * count)
                if r.get('avg_confidence_incorrect') is not None:
                    count = r.get('confidence_incorrect_count', 0)
                    avg_conf = r.get('avg_confidence_incorrect')
                    all_confidence_incorrect.extend([avg_conf] * count)

            if all_confidence_correct or all_confidence_incorrect:
                print(f"\n  Confidence calibration:")
                if all_confidence_correct:
                    avg_conf_correct = sum(all_confidence_correct) / len(all_confidence_correct)
                    print(f"  - Avg confidence (correct detections): {avg_conf_correct:.2f} ({len(all_confidence_correct)} detections)")
                else:
                    print(f"  - Avg confidence (correct detections): N/A (no correct detections with confidence)")

                if all_confidence_incorrect:
                    avg_conf_incorrect = sum(all_confidence_incorrect) / len(all_confidence_incorrect)
                    print(f"  - Avg confidence (incorrect detections): {avg_conf_incorrect:.2f} ({len(all_confidence_incorrect)} detections)")

                    # Show confidence difference
                    if all_confidence_correct:
                        avg_conf_correct = sum(all_confidence_correct) / len(all_confidence_correct)
                        diff = avg_conf_correct - avg_conf_incorrect
                        if diff > 0:
                            print(f"  - Confidence difference: +{diff:.2f} (correct > incorrect, good calibration)")
                        else:
                            print(f"  - Confidence difference: {diff:.2f} (⚠ incorrect ≥ correct, poor calibration)")
                else:
                    print(f"  - Avg confidence (incorrect detections): N/A (no incorrect detections with confidence)")

print(f"\nProcessing time:")
|
||||||
|
print(f" - Total: {total_processing_time:.2f}s")
|
||||||
|
print(f" - Average per image: {avg_processing_time:.2f}s")
|
||||||
|
|
||||||
|
# Check for confidence values
|
||||||
|
has_confidence = any(
|
||||||
|
any('confidence' in jersey for jersey in r.get('jerseys', []))
|
||||||
|
for r in results if not r.get('error')
|
||||||
|
)
|
||||||
|
|
||||||
|
if has_confidence:
|
||||||
|
print(f"\nConfidence statistics:")
|
||||||
|
confidences = [
|
||||||
|
jersey.get('confidence')
|
||||||
|
for r in results if not r.get('error')
|
||||||
|
for jersey in r.get('jerseys', [])
|
||||||
|
if 'confidence' in jersey and jersey.get('confidence') is not None
|
||||||
|
]
|
||||||
|
if confidences:
|
||||||
|
avg_confidence = sum(confidences) / len(confidences)
|
||||||
|
min_confidence = min(confidences)
|
||||||
|
max_confidence = max(confidences)
|
||||||
|
print(f" - Total detections with confidence: {len(confidences)}")
|
||||||
|
print(f" - Average confidence: {avg_confidence:.2f}")
|
||||||
|
print(f" - Min confidence: {min_confidence:.2f}")
|
||||||
|
print(f" - Max confidence: {max_confidence:.2f}")
|
||||||
|
|
||||||
|
# Confidence distribution by bucket
|
||||||
|
print(f"\n Confidence distribution:")
|
||||||
|
buckets = {
|
||||||
|
'90-100 (Extremely clear)': (90, 100),
|
||||||
|
'70-89 (Clear, minor issues)': (70, 89),
|
||||||
|
'50-69 (Partially visible)': (50, 69),
|
||||||
|
'30-49 (Difficult to read)': (30, 49),
|
||||||
|
'0-29 (Very uncertain)': (0, 29)
|
||||||
|
}
|
||||||
|
|
||||||
|
for bucket_name, (min_val, max_val) in buckets.items():
|
||||||
|
count = sum(1 for c in confidences if min_val <= c <= max_val)
|
||||||
|
percentage = (count / len(confidences) * 100) if len(confidences) > 0 else 0
|
||||||
|
bar_length = int(percentage / 2) # Scale to max 50 chars
|
||||||
|
bar = '█' * bar_length
|
||||||
|
print(f" {bucket_name}: {count:3d} ({percentage:5.1f}%) {bar}")
|
||||||
|
|
||||||
|
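        # The bucket labels are assumed to mirror the rubric in the
        # confidence-scoring prompt; keep them in sync if that prompt changes.
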
        # List errors if any
        if images_with_errors > 0:
            print(f"\nErrors encountered:")
            for r in results:
                if r.get('error'):
                    print(f"  - {Path(r['image_path']).name}: {r['error']}")

        print()


def main():
    """Main entry point for the test script."""
    # Get default server URL from config
    default_server_url = get_llama_server_url_from_config() or 'http://192.168.1.34:8080'

    parser = argparse.ArgumentParser(
        description='Test jersey detection with different models and prompts',
        formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument('image_directory', help='Path to directory containing test images')
    parser.add_argument('prompt_file', help='Path to text file containing the prompt')
    parser.add_argument('--model-name', default=None,
                        help='Name of the model being tested (auto-detected from server if not provided)')
    parser.add_argument('--server-url', default=default_server_url,
                        help=f'llama.cpp server URL (default: {default_server_url})')
    parser.add_argument('--output-file', default='jersey_detection_results.jsonl',
                        help='Output file for results (default: jersey_detection_results.jsonl)')
    parser.add_argument('--resize', type=int, default=None, metavar='MAX_SIZE',
                        help='Resize images to maximum dimension (e.g., 1024) before processing')
    parser.add_argument('--model-tag', default=None,
                        help='Model tag for llama-swap (e.g., "qwen2.5-vl-7b"). If not specified, uses whatever model is loaded.')

    args = parser.parse_args()

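    # Example invocation (paths, URL, and model tag are illustrative):
    #   python test_jersey_detection.py ./test_images jersey_prompt_with_confidence.txt \
    #       --server-url http://localhost:8080 --resize 1024 --model-tag qwen2.5-vl-7b
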
    # Validate inputs
    if not os.path.isdir(args.image_directory):
        print(f"Error: Directory not found: {args.image_directory}")
        sys.exit(1)

    if not os.path.isfile(args.prompt_file):
        print(f"Error: Prompt file not found: {args.prompt_file}")
        sys.exit(1)

    # Load prompt
    try:
        with open(args.prompt_file, 'r') as f:
            prompt = f.read()
    except Exception as e:
        print(f"Error reading prompt file: {e}")
        sys.exit(1)

    # Print test configuration
    print("=" * 70)
    print("JERSEY DETECTION TEST")
    print("=" * 70)
    print(f"Model name: {args.model_name if args.model_name else '(auto-detect)'}")
    print(f"Model tag: {args.model_tag if args.model_tag else 'None (use loaded model)'}")
    print(f"Server URL: {args.server_url}")
    print(f"Image directory: {args.image_directory}")
    print(f"Prompt file: {args.prompt_file}")
    print(f"Prompt length: {len(prompt)} characters")
    print(f"Output file: {args.output_file}")
    print(f"Resize images: {f'Yes (max: {args.resize}px)' if args.resize else 'No'}")
    print("=" * 70)
    print()

    # Check server health
    print("Checking server health...")
    try:
        client = LlamaCppClient(base_url=args.server_url)

        # Try health check (handle both JSON and plain text responses)
        try:
            health = client.health_check()
            print(f"✓ Server is healthy: {health}")
        except json.JSONDecodeError:
            # llama-swap returns plain text "OK" instead of JSON
            # (timeout added so a dead server can't hang the health check)
            response = requests.get(f"{args.server_url}/health", timeout=10)
            response.raise_for_status()
            print(f"✓ Server is healthy: {response.text}")

        # Determine model name to use
        model_name = args.model_name

        # If model_tag is provided, use it as the model name (unless user explicitly provided a model_name)
        if args.model_tag and not args.model_name:
            model_name = args.model_tag
            print(f"✓ Using model tag as model name: {model_name}")
        elif not model_name:
            # Only auto-detect if neither model_tag nor model_name was provided
            detected_model_name = None
            try:
                models = client.get_models()
                if 'data' in models and len(models['data']) > 0:
                    model_id = models['data'][0].get('id', 'unknown')
                    print(f"✓ Active model: {model_id}")

                    # Extract just the model filename (without path)
                    if model_id and model_id != 'unknown':
                        # Remove path and get base filename
                        model_filename = os.path.basename(model_id)
                        # Remove common extensions (.gguf, .bin, etc.)
                        model_name_no_ext = os.path.splitext(model_filename)[0]
                        detected_model_name = model_name_no_ext
            except Exception:
                pass

            if detected_model_name:
                model_name = detected_model_name
                print(f"✓ Using auto-detected model name: {model_name}")
            else:
                model_name = "unknown"
                print(f"⚠ Could not detect model name, using 'unknown'")
        else:
            # User explicitly provided model_name
            print(f"✓ Using provided model name: {model_name}")

    except Exception as e:
        print(f"❌ Failed to connect to server: {e}")
        print(f"Make sure llama.cpp server is running at {args.server_url}")
        sys.exit(1)

    print()

    # Show model tag info if using llama-swap
    if args.model_tag:
        print(f"Requesting model from llama-swap: {args.model_tag}")

        # Check currently running models on llama-swap
        try:
            running_response = requests.get(f"{args.server_url}/running", timeout=10)
            if running_response.status_code == 200:
                try:
                    running_models = running_response.json()
                    if running_models:
                        print(f"Currently running models: {running_models}")
                except Exception:
                    pass
        except Exception:
            pass

        print()

    # Run tests
    tester = JerseyDetectionTester(args.server_url, prompt, model_name, args.resize, args.model_tag)
    results = tester.test_directory(args.image_directory)

    # Print summary
    if results:
        tester.print_summary(results)

        # Save results to file
        tester.save_results_to_file(results, args.prompt_file, args.output_file)


if __name__ == '__main__':
    main()