jersey_test/test_jersey_detection.py
Rick McEwen 8706edcd13 Initial commit: Jersey detection test suite
Test scripts and utilities for evaluating vision-language models on jersey number detection using a llama.cpp server.
2026-01-20 13:37:01 -07:00

972 lines · 44 KiB · Python · Executable File

#!/usr/bin/env python3
"""
Test script for evaluating jersey detection performance with different models and prompts.
Usage:
python test_jersey_detection.py <image_directory> <prompt_file> [options]
Arguments:
image_directory: Path to directory containing test images
prompt_file: Path to text file containing the prompt to use
--model-name: Name of the model being tested (optional, auto-detected from server if not provided)
--model-tag: Model tag for llama-swap integration (optional)
--server-url: llama.cpp server URL (optional; default: read from scan.ini)
--output-file: Output file for results (default: jersey_detection_results.jsonl)
--resize: Maximum image dimension; larger images are downscaled (aspect ratio preserved) before processing
Ground Truth:
Expected jersey numbers are parsed from filenames using a dash-separated format; the first segment is an identifier and the remaining numeric segments are the expected jersey numbers:
Example: 1122-8-10-29.jpg expects jerseys 8, 10, and 29
The script calculates precision, recall, F1 score, and confidence calibration metrics
to evaluate model accuracy against known correct results.
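For example, if the model detects jerseys 8, 10, and 55 for 1122-8-10-29.jpg, then
true positives = {8, 10}, false positives = {55}, false negatives = {29},
giving precision = 2/3, recall = 2/3, and F1 = 2/3.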
Output Files:
<output_file>: Summary statistics with ground truth metrics (default: jersey_detection_results.jsonl)
Example:
# Auto-detect model name from server
python test_jersey_detection.py ./images jersey_prompt.txt
# Resize images to 1024px max dimension before processing
python test_jersey_detection.py ./images jersey_prompt.txt --resize 1024
# Use llama-swap to automatically load a specific model
python test_jersey_detection.py ./images jersey_prompt.txt --model-tag "qwen2.5-vl-7b" --resize 1024
# Specify custom model name (for tracking in results)
python test_jersey_detection.py ./images jersey_prompt.txt --model-name "llama-3.2-vision"
python test_jersey_detection.py ./images jersey_prompt_with_confidence.txt --model-name "qwen2-vl" --resize 1024
After running tests, analyze results with:
python analyze_jersey_results.py # Performance and accuracy analysis
"""
import argparse
import configparser
import json
import os
import re
import requests
import sys
import time
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Any, Optional
import cv2
# Add this script's directory to sys.path so local modules (scan_utils) can be imported
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from scan_utils.llama_cpp_client import LlamaCppClient
# Hallucination detection: filter out example numbers from prompts
# The prompts use numbers above 100 in their examples, so filtering them never removes a valid jersey number (typically 0-99)
HALLUCINATION_NUMBERS = {'101', '102', '103', '142', '199'}
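# e.g. if a prompt's example output mentions jersey '101' and the model echoes it back, that detection is dropped as a hallucination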
def parse_expected_jerseys(filename: str) -> List[str]:
"""
Parse expected jersey numbers from filename.
Format: prefix-number1-number2-number3.ext
Example: 1122-8-10-29.jpg -> ['8', '10', '29']
Args:
filename: Image filename
Returns:
List of expected jersey numbers as strings
"""
# Remove extension
name_without_ext = Path(filename).stem
# Split by dash
parts = name_without_ext.split('-')
# First part is typically a prefix/identifier, rest are jersey numbers
# Skip the first part and collect numeric parts
expected = []
for i, part in enumerate(parts[1:], 1): # Skip first part
# Check if part is numeric (jersey number)
if part.isdigit():
expected.append(part)
return expected
def clean_response(text: str) -> str:
"""
Clean the response by removing think tags and markdown code blocks.
Some models use <think> tags for chain-of-thought reasoning and wrap JSON in markdown.
Args:
text: Raw response text
Returns:
Cleaned text ready for JSON parsing
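Example: '<think>reasoning</think>```json\n{"jerseys": []}\n```' is cleaned to '{"jerseys": []}'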
"""
# Remove <think>...</think> tags and their content (standard angle brackets)
cleaned = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL | re.IGNORECASE)
# Remove ◁think▷...◁/think▷ tags (unicode triangle brackets)
cleaned = re.sub(r'◁think▷.*?◁/think▷', '', cleaned, flags=re.DOTALL | re.IGNORECASE)
# Also remove any standalone think tags (both formats)
cleaned = re.sub(r'</?think>', '', cleaned, flags=re.IGNORECASE)
cleaned = re.sub(r'◁/?think▷', '', cleaned, flags=re.IGNORECASE)
# Remove markdown code blocks (```json ... ``` or ``` ... ```)
# First try to extract content from ```json blocks
json_block_match = re.search(r'```(?:json)?\s*\n?(.*?)\n?```', cleaned, flags=re.DOTALL | re.IGNORECASE)
if json_block_match:
# Extract just the content inside the code block
cleaned = json_block_match.group(1)
else:
# If no code block, just remove any stray ``` markers
cleaned = re.sub(r'```(?:json)?', '', cleaned, flags=re.IGNORECASE)
return cleaned.strip()
def get_llama_server_url_from_config() -> Optional[str]:
"""
Read the LLAMA_CPP_SERVER_URL from scan.ini.
Returns:
Server URL from config or None if not found
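Expected scan.ini format (the URL value below is an illustrative example):
[DEFAULT]
LLAMA_CPP_SERVER_URL = http://localhost:8080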
"""
config_path = os.path.join(os.path.dirname(__file__), 'scan.ini')
if not os.path.exists(config_path):
return None
try:
config = configparser.ConfigParser()
config.read(config_path)
if 'DEFAULT' in config and 'LLAMA_CPP_SERVER_URL' in config['DEFAULT']:
return config['DEFAULT']['LLAMA_CPP_SERVER_URL']
except Exception as e:
print(f"Warning: Failed to read scan.ini: {e}")
return None
class JerseyDetectionTester:
"""Test runner for jersey detection evaluation."""
def __init__(self, server_url: str, prompt: str, model_name: Optional[str] = None, resize_max: Optional[int] = None, model_tag: Optional[str] = None):
"""
Initialize the tester.
Args:
server_url: Base URL for the llama.cpp server
prompt: Prompt text to use for detection
model_name: Name of the model being tested (optional)
resize_max: Maximum image dimension (resize if larger, None = no resize)
model_tag: Model tag for llama-swap integration (optional)
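Note: if model_tag is given, it is sent as the 'model' field of each chat completion request so llama-swap can load the requested model.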
"""
self.client = LlamaCppClient(base_url=server_url)
self.prompt = prompt
self.model_name = model_name or "unknown"
self.resize_max = resize_max
self.model_tag = model_tag
self.results = []
def test_image(self, image_path: str) -> Dict[str, Any]:
"""
Test jersey detection on a single image.
Args:
image_path: Path to the image file
Returns:
Dictionary containing test results for this image
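Note: the model is expected to reply with JSON of roughly this shape (values are
illustrative; the exact schema is defined by the prompt file):
{"jerseys": [{"jersey_number": "8", "jersey_color": "red", "number_color": "white", "confidence": 90}]}
Only 'jerseys' and 'jersey_number' are required by the parser; 'confidence' is optional.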
"""
start_time = time.time()
# Load image
image = cv2.imread(image_path)
if image is None:
filename = Path(image_path).name
expected_jerseys = parse_expected_jerseys(filename)
return {
'image_path': image_path,
'error': 'Failed to load image',
'jerseys': [],
'processing_time': 0,
'resized': False,
'original_size': None,
'final_size': None,
'expected_jerseys': expected_jerseys,
'detected_jerseys': [],
'true_positives': [],
'false_positives': [],
'false_negatives': expected_jerseys,
'precision': 0.0,
'recall': 0.0,
'f1_score': 0.0,
'avg_confidence_correct': None,
'avg_confidence_incorrect': None,
'confidence_correct_count': 0,
'confidence_incorrect_count': 0
}
# Track original size
original_height, original_width = image.shape[:2]
original_size = (original_width, original_height)
resized = False
# Resize if needed
if self.resize_max and (original_width > self.resize_max or original_height > self.resize_max):
# Calculate new dimensions maintaining aspect ratio
if original_width > original_height:
new_width = self.resize_max
new_height = int(original_height * (self.resize_max / original_width))
else:
new_height = self.resize_max
new_width = int(original_width * (self.resize_max / original_height))
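# e.g. a 4000x3000 image with resize_max=1024 becomes 1024x768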
# Resize image
image = cv2.resize(image, (new_width, new_height), interpolation=cv2.INTER_AREA)
resized = True
final_height, final_width = image.shape[:2]
final_size = (final_width, final_height)
# Create multimodal message
message = self.client.create_multimodal_message(
role="user",
content=self.prompt,
images=[image]
)
# Send to LLM
try:
# Prepare kwargs for chat completion
completion_kwargs = {
'messages': [message],
'temperature': 0.1,
'max_tokens': 1000
}
# Add model parameter if model_tag is specified (for llama-swap)
if self.model_tag:
completion_kwargs['model'] = self.model_tag
# Note: We don't print this for every image to avoid spam, but it's being sent
response = self.client.chat_completion(**completion_kwargs)
processing_time = time.time() - start_time
# Extract response text
if 'choices' in response and len(response['choices']) > 0:
response_text = response['choices'][0]['message']['content']
# Clean response (remove think tags and markdown code blocks)
cleaned_text = clean_response(response_text)
# Parse JSON response
try:
result = json.loads(cleaned_text)
jerseys = result.get('jerseys', [])
# Apply hallucination detection
filtered_jerseys = []
hallucinated_count = 0
for jersey in jerseys:
jersey_number = jersey.get('jersey_number', '')
# Check for hallucination (model returning example numbers)
if jersey_number in HALLUCINATION_NUMBERS:
hallucinated_count += 1
continue
filtered_jerseys.append(jersey)
# Ground truth comparison
filename = Path(image_path).name
expected_jerseys = set(parse_expected_jerseys(filename))
detected_jerseys = set(jersey.get('jersey_number', '') for jersey in filtered_jerseys if jersey.get('jersey_number', ''))
# Calculate ground truth metrics
true_positives = expected_jerseys & detected_jerseys # Correctly detected
false_positives = detected_jerseys - expected_jerseys # Detected but not expected
false_negatives = expected_jerseys - detected_jerseys # Expected but not detected
# Calculate precision, recall, F1
tp_count = len(true_positives)
fp_count = len(false_positives)
fn_count = len(false_negatives)
precision = tp_count / (tp_count + fp_count) if (tp_count + fp_count) > 0 else 0.0
recall = tp_count / (tp_count + fn_count) if (tp_count + fn_count) > 0 else 0.0
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
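# F1 is the harmonic mean of precision and recall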
# Handle edge case: if no expected jerseys, precision is 1.0 if no detections, else 0.0
if len(expected_jerseys) == 0:
precision = 1.0 if len(detected_jerseys) == 0 else 0.0
recall = 1.0 # No jerseys to detect
f1_score = 1.0 if len(detected_jerseys) == 0 else 0.0
# Calculate confidence scores for correct vs incorrect detections
confidence_correct = [] # Confidence for true positives
confidence_incorrect = [] # Confidence for false positives
for jersey in filtered_jerseys:
jersey_number = jersey.get('jersey_number', '')
confidence = jersey.get('confidence')
if confidence is not None:
if jersey_number in true_positives:
confidence_correct.append(confidence)
elif jersey_number in false_positives:
confidence_incorrect.append(confidence)
avg_confidence_correct = sum(confidence_correct) / len(confidence_correct) if confidence_correct else None
avg_confidence_incorrect = sum(confidence_incorrect) / len(confidence_incorrect) if confidence_incorrect else None
return {
'image_path': image_path,
'jerseys': filtered_jerseys,
'hallucinated_count': hallucinated_count,
'raw_response': cleaned_text,
'processing_time': processing_time,
'error': None,
'resized': resized,
'original_size': original_size,
'final_size': final_size,
# Ground truth metrics
'expected_jerseys': sorted(expected_jerseys),
'detected_jerseys': sorted(detected_jerseys),
'true_positives': sorted(true_positives),
'false_positives': sorted(false_positives),
'false_negatives': sorted(false_negatives),
'precision': precision,
'recall': recall,
'f1_score': f1_score,
# Confidence calibration metrics
'avg_confidence_correct': avg_confidence_correct,
'avg_confidence_incorrect': avg_confidence_incorrect,
'confidence_correct_count': len(confidence_correct),
'confidence_incorrect_count': len(confidence_incorrect)
}
except json.JSONDecodeError as e:
filename = Path(image_path).name
expected_jerseys = parse_expected_jerseys(filename)
return {
'image_path': image_path,
'error': f'JSON parse error: {e}',
'raw_response': cleaned_text,
'original_response': response_text if cleaned_text != response_text else None,
'jerseys': [],
'processing_time': processing_time,
'resized': resized,
'original_size': original_size,
'final_size': final_size,
'expected_jerseys': expected_jerseys,
'detected_jerseys': [],
'true_positives': [],
'false_positives': [],
'false_negatives': expected_jerseys,
'precision': 0.0,
'recall': 0.0,
'f1_score': 0.0
}
else:
filename = Path(image_path).name
expected_jerseys = parse_expected_jerseys(filename)
return {
'image_path': image_path,
'error': 'Empty response from model',
'jerseys': [],
'processing_time': processing_time,
'resized': resized,
'original_size': original_size,
'final_size': final_size,
'expected_jerseys': expected_jerseys,
'detected_jerseys': [],
'true_positives': [],
'false_positives': [],
'false_negatives': expected_jerseys,
'precision': 0.0,
'recall': 0.0,
'f1_score': 0.0
}
except Exception as e:
processing_time = time.time() - start_time
filename = Path(image_path).name
expected_jerseys = parse_expected_jerseys(filename)
return {
'image_path': image_path,
'error': f'Request error: {e}',
'jerseys': [],
'processing_time': processing_time,
'resized': resized,
'original_size': original_size,
'final_size': final_size,
'expected_jerseys': expected_jerseys,
'detected_jerseys': [],
'true_positives': [],
'false_positives': [],
'false_negatives': expected_jerseys,
'precision': 0.0,
'recall': 0.0,
'f1_score': 0.0,
'avg_confidence_correct': None,
'avg_confidence_incorrect': None,
'confidence_correct_count': 0,
'confidence_incorrect_count': 0
}
def test_directory(self, directory_path: str) -> List[Dict[str, Any]]:
"""
Test all images in a directory.
Args:
directory_path: Path to directory containing images
Returns:
List of results for all images
"""
# Get all image files
image_extensions = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp'}
image_files = []
for ext in image_extensions:
image_files.extend(Path(directory_path).glob(f'*{ext}'))
image_files.extend(Path(directory_path).glob(f'*{ext.upper()}'))
image_files = sorted(image_files)
if not image_files:
print(f"No image files found in {directory_path}")
return []
print(f"Found {len(image_files)} images to process\n")
# Process each image
results = []
for i, image_path in enumerate(image_files, 1):
# Show model tag in progress if using llama-swap
model_info = f" [{self.model_tag}]" if self.model_tag else ""
print(f"[{i}/{len(image_files)}]{model_info} Processing {image_path.name}...")
result = self.test_image(str(image_path))
results.append(result)
# Display result
self._display_result(result)
print()
return results
def _display_result(self, result: Dict[str, Any]):
"""Display the result for a single image."""
if result.get('error'):
print(f" ❌ Error: {result['error']}")
if 'raw_response' in result:
print(f" Cleaned response: {result['raw_response']}...")
if result.get('original_response'):
print(f" (Think tags and/or markdown were filtered from response)")
else:
jerseys = result.get('jerseys', [])
hallucinated_count = result.get('hallucinated_count', 0)
if jerseys:
print(f" ✓ Found {len(jerseys)} jersey(s):")
for jersey in jerseys:
number = jersey.get('jersey_number', 'N/A')
jersey_color = jersey.get('jersey_color', 'N/A')
number_color = jersey.get('number_color', 'N/A')
confidence = jersey.get('confidence', None)
conf_str = f" (confidence: {confidence})" if confidence is not None else ""
print(f" - #{number}: {jersey_color} jersey, {number_color} number{conf_str}")
else:
print(f" ○ No jerseys detected")
if hallucinated_count > 0:
print(f" ⚠ Filtered {hallucinated_count} hallucinated detection(s)")
# Display ground truth comparison
expected = result.get('expected_jerseys', [])
detected = result.get('detected_jerseys', [])
true_positives = result.get('true_positives', [])
false_positives = result.get('false_positives', [])
false_negatives = result.get('false_negatives', [])
if expected:
print(f" Ground truth: Expected {expected}, Detected {detected}")
if true_positives:
print(f" ✓ Correct: {true_positives}")
if false_positives:
print(f" ✗ False positives: {false_positives}")
if false_negatives:
print(f" ✗ Missed: {false_negatives}")
precision = result.get('precision', 0.0)
recall = result.get('recall', 0.0)
f1 = result.get('f1_score', 0.0)
print(f" Precision: {precision:.2%}, Recall: {recall:.2%}, F1: {f1:.2%}")
print(f" Processing time: {result['processing_time']:.2f}s")
def save_results_to_file(self, results: List[Dict[str, Any]], prompt_file: str, output_file: str = "jersey_detection_results.jsonl"):
"""
Save test results to a JSON Lines file for later analysis.
Args:
results: List of all test results
prompt_file: Path to the prompt file used
output_file: Path to output file (default: jersey_detection_results.jsonl)
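Each run appends a single summary record as one JSON line, so results from different models and prompts accumulate in the same file for later comparison with analyze_jersey_results.py.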
"""
# Calculate summary statistics
total_images = len(results)
images_with_errors = sum(1 for r in results if r.get('error'))
images_with_jerseys = sum(1 for r in results if not r.get('error') and len(r.get('jerseys', [])) > 0)
images_without_jerseys = sum(1 for r in results if not r.get('error') and len(r.get('jerseys', [])) == 0)
total_jerseys = sum(len(r.get('jerseys', [])) for r in results if not r.get('error'))
total_hallucinated = sum(r.get('hallucinated_count', 0) for r in results if not r.get('error'))
total_raw_detections = total_jerseys + total_hallucinated
total_processing_time = sum(r.get('processing_time', 0) for r in results)
avg_processing_time = total_processing_time / total_images if total_images > 0 else 0
# Collect confidence statistics if available
confidences = [
jersey.get('confidence')
for r in results if not r.get('error')
for jersey in r.get('jerseys', [])
if 'confidence' in jersey and jersey.get('confidence') is not None
]
confidence_stats = None
if confidences:
buckets = {
'90-100': sum(1 for c in confidences if 90 <= c <= 100),
'70-89': sum(1 for c in confidences if 70 <= c <= 89),
'50-69': sum(1 for c in confidences if 50 <= c <= 69),
'30-49': sum(1 for c in confidences if 30 <= c <= 49),
'0-29': sum(1 for c in confidences if 0 <= c <= 29)
}
confidence_stats = {
'avg': sum(confidences) / len(confidences),
'min': min(confidences),
'max': max(confidences),
'count': len(confidences),
'distribution': buckets
}
# Calculate resize statistics
images_resized = sum(1 for r in results if r.get('resized', False))
# Calculate ground truth statistics
results_without_errors = [r for r in results if not r.get('error')]
total_expected_jerseys = sum(len(r.get('expected_jerseys', [])) for r in results_without_errors)
total_true_positives = sum(len(r.get('true_positives', [])) for r in results_without_errors)
total_false_positives = sum(len(r.get('false_positives', [])) for r in results_without_errors)
total_false_negatives = sum(len(r.get('false_negatives', [])) for r in results_without_errors)
# Calculate overall precision, recall, F1
overall_precision = total_true_positives / (total_true_positives + total_false_positives) if (total_true_positives + total_false_positives) > 0 else 0.0
overall_recall = total_true_positives / (total_true_positives + total_false_negatives) if (total_true_positives + total_false_negatives) > 0 else 0.0
overall_f1 = 2 * (overall_precision * overall_recall) / (overall_precision + overall_recall) if (overall_precision + overall_recall) > 0 else 0.0
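# Note: the overall metrics above are micro-averaged (TP/FP/FN pooled across all images); the per-image averages below are macro-averaged (each image weighted equally)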
# Average per-image metrics
avg_precision = sum(r.get('precision', 0.0) for r in results_without_errors) / len(results_without_errors) if results_without_errors else 0.0
avg_recall = sum(r.get('recall', 0.0) for r in results_without_errors) / len(results_without_errors) if results_without_errors else 0.0
avg_f1 = sum(r.get('f1_score', 0.0) for r in results_without_errors) / len(results_without_errors) if results_without_errors else 0.0
# Calculate confidence calibration metrics (correct vs incorrect detections)
all_confidence_correct = []
all_confidence_incorrect = []
for r in results_without_errors:
if r.get('avg_confidence_correct') is not None:
# Weight by the count of correct detections in this image
count = r.get('confidence_correct_count', 0)
avg_conf = r.get('avg_confidence_correct')
all_confidence_correct.extend([avg_conf] * count)
if r.get('avg_confidence_incorrect') is not None:
# Weight by the count of incorrect detections in this image
count = r.get('confidence_incorrect_count', 0)
avg_conf = r.get('avg_confidence_incorrect')
all_confidence_incorrect.extend([avg_conf] * count)
overall_avg_confidence_correct = sum(all_confidence_correct) / len(all_confidence_correct) if all_confidence_correct else None
overall_avg_confidence_incorrect = sum(all_confidence_incorrect) / len(all_confidence_incorrect) if all_confidence_incorrect else None
# Create summary record
summary_record = {
'timestamp': datetime.now().isoformat(),
'model_name': self.model_name,
'model_tag': self.model_tag,
'prompt_file': prompt_file,
'prompt_length': len(self.prompt),
'total_images': total_images,
'images_with_jerseys': images_with_jerseys,
'images_without_jerseys': images_without_jerseys,
'images_with_errors': images_with_errors,
'total_raw_detections': total_raw_detections,
'total_valid_jerseys': total_jerseys,
'total_hallucinated': total_hallucinated,
'avg_processing_time': avg_processing_time,
'total_processing_time': total_processing_time,
'confidence_stats': confidence_stats,
'empty_response_capable': images_without_jerseys > 0,
'resize_enabled': self.resize_max is not None,
'resize_max': self.resize_max,
'images_resized': images_resized,
# Ground truth statistics
'ground_truth': {
'total_expected': total_expected_jerseys,
'total_true_positives': total_true_positives,
'total_false_positives': total_false_positives,
'total_false_negatives': total_false_negatives,
'overall_precision': overall_precision,
'overall_recall': overall_recall,
'overall_f1': overall_f1,
'avg_precision': avg_precision,
'avg_recall': avg_recall,
'avg_f1': avg_f1,
# Confidence calibration
'avg_confidence_correct': overall_avg_confidence_correct,
'avg_confidence_incorrect': overall_avg_confidence_incorrect,
'confidence_correct_count': len(all_confidence_correct),
'confidence_incorrect_count': len(all_confidence_incorrect)
}
}
# Append to file
try:
with open(output_file, 'a') as f:
f.write(json.dumps(summary_record) + '\n')
print(f"\n✓ Results saved to {output_file}")
except Exception as e:
print(f"\n❌ Failed to save results: {e}")
def print_summary(self, results: List[Dict[str, Any]]):
"""
Print summary statistics for all results.
Args:
results: List of all test results
"""
print("=" * 70)
print("SUMMARY")
print("=" * 70)
print(f"\nModel: {self.model_name}")
if self.model_tag:
print(f"Model tag: {self.model_tag}")
# Display resize info
if self.resize_max:
images_resized = sum(1 for r in results if r.get('resized', False))
print(f"Resize: Enabled (max: {self.resize_max}px, {images_resized} images resized)")
else:
print(f"Resize: Disabled")
total_images = len(results)
images_with_errors = sum(1 for r in results if r.get('error'))
images_with_jerseys = sum(1 for r in results if not r.get('error') and len(r.get('jerseys', [])) > 0)
images_without_jerseys = sum(1 for r in results if not r.get('error') and len(r.get('jerseys', [])) == 0)
total_jerseys = sum(len(r.get('jerseys', [])) for r in results if not r.get('error'))
total_hallucinated = sum(r.get('hallucinated_count', 0) for r in results if not r.get('error'))
total_raw_detections = total_jerseys + total_hallucinated
total_processing_time = sum(r.get('processing_time', 0) for r in results)
avg_processing_time = total_processing_time / total_images if total_images > 0 else 0
print(f"\nTotal images processed: {total_images}")
print(f" - Images with jerseys: {images_with_jerseys} ({images_with_jerseys/total_images*100:.1f}%)")
print(f" - Images without jerseys: {images_without_jerseys} ({images_without_jerseys/total_images*100:.1f}%)")
print(f" - Images with errors: {images_with_errors} ({images_with_errors/total_images*100:.1f}%)")
print(f"\nJersey detections:")
print(f" - Total raw detections: {total_raw_detections}")
print(f" - Valid jerseys (after filtering): {total_jerseys}")
print(f" - Hallucinations filtered out: {total_hallucinated}")
if images_with_jerseys > 0:
print(f" - Average valid jerseys per image (when detected): {total_jerseys/images_with_jerseys:.2f}")
# Empty response capability (important for evaluating model's ability to return empty results)
print(f"\nEmpty response capability:")
print(f" - Empty responses returned: {images_without_jerseys}")
print(f" - Percentage of images: {images_without_jerseys/total_images*100:.1f}%")
print(f" - Model can return empty results: {'✓ Yes' if images_without_jerseys > 0 else '✗ No (potential issue)'}")
if total_hallucinated > 0:
print(f"\nHallucination detection:")
print(f" - Total hallucinated detections filtered: {total_hallucinated}")
images_with_hallucinations = sum(1 for r in results if not r.get('error') and r.get('hallucinated_count', 0) > 0)
print(f" - Images with hallucinations: {images_with_hallucinations} ({images_with_hallucinations/total_images*100:.1f}%)")
# Ground truth statistics
results_without_errors = [r for r in results if not r.get('error')]
total_expected_jerseys = sum(len(r.get('expected_jerseys', [])) for r in results_without_errors)
if total_expected_jerseys > 0:
total_true_positives = sum(len(r.get('true_positives', [])) for r in results_without_errors)
total_false_positives = sum(len(r.get('false_positives', [])) for r in results_without_errors)
total_false_negatives = sum(len(r.get('false_negatives', [])) for r in results_without_errors)
# Calculate overall metrics
overall_precision = total_true_positives / (total_true_positives + total_false_positives) if (total_true_positives + total_false_positives) > 0 else 0.0
overall_recall = total_true_positives / (total_true_positives + total_false_negatives) if (total_true_positives + total_false_negatives) > 0 else 0.0
overall_f1 = 2 * (overall_precision * overall_recall) / (overall_precision + overall_recall) if (overall_precision + overall_recall) > 0 else 0.0
# Calculate average per-image metrics
avg_precision = sum(r.get('precision', 0.0) for r in results_without_errors) / len(results_without_errors) if results_without_errors else 0.0
avg_recall = sum(r.get('recall', 0.0) for r in results_without_errors) / len(results_without_errors) if results_without_errors else 0.0
avg_f1 = sum(r.get('f1_score', 0.0) for r in results_without_errors) / len(results_without_errors) if results_without_errors else 0.0
print(f"\nGround truth performance:")
print(f" - Total expected jerseys: {total_expected_jerseys}")
print(f" - True positives: {total_true_positives}")
print(f" - False positives: {total_false_positives}")
print(f" - False negatives: {total_false_negatives}")
print(f"\n Overall metrics (across all jerseys):")
print(f" - Precision: {overall_precision:.2%}")
print(f" - Recall: {overall_recall:.2%}")
print(f" - F1 Score: {overall_f1:.2%}")
print(f"\n Average per-image metrics:")
print(f" - Avg Precision: {avg_precision:.2%}")
print(f" - Avg Recall: {avg_recall:.2%}")
print(f" - Avg F1 Score: {avg_f1:.2%}")
# Confidence calibration metrics
all_confidence_correct = []
all_confidence_incorrect = []
for r in results_without_errors:
if r.get('avg_confidence_correct') is not None:
count = r.get('confidence_correct_count', 0)
avg_conf = r.get('avg_confidence_correct')
all_confidence_correct.extend([avg_conf] * count)
if r.get('avg_confidence_incorrect') is not None:
count = r.get('confidence_incorrect_count', 0)
avg_conf = r.get('avg_confidence_incorrect')
all_confidence_incorrect.extend([avg_conf] * count)
if all_confidence_correct or all_confidence_incorrect:
print(f"\n Confidence calibration:")
if all_confidence_correct:
avg_conf_correct = sum(all_confidence_correct) / len(all_confidence_correct)
print(f" - Avg confidence (correct detections): {avg_conf_correct:.2f} ({len(all_confidence_correct)} detections)")
else:
print(f" - Avg confidence (correct detections): N/A (no correct detections with confidence)")
if all_confidence_incorrect:
avg_conf_incorrect = sum(all_confidence_incorrect) / len(all_confidence_incorrect)
print(f" - Avg confidence (incorrect detections): {avg_conf_incorrect:.2f} ({len(all_confidence_incorrect)} detections)")
# Show confidence difference
if all_confidence_correct:
avg_conf_correct = sum(all_confidence_correct) / len(all_confidence_correct)
diff = avg_conf_correct - avg_conf_incorrect
if diff > 0:
print(f" - Confidence difference: +{diff:.2f} (correct > incorrect, good calibration)")
else:
print(f" - Confidence difference: {diff:.2f} (⚠ incorrect ≥ correct, poor calibration)")
else:
print(f" - Avg confidence (incorrect detections): N/A (no incorrect detections with confidence)")
print(f"\nProcessing time:")
print(f" - Total: {total_processing_time:.2f}s")
print(f" - Average per image: {avg_processing_time:.2f}s")
# Check for confidence values
has_confidence = any(
any('confidence' in jersey for jersey in r.get('jerseys', []))
for r in results if not r.get('error')
)
if has_confidence:
print(f"\nConfidence statistics:")
confidences = [
jersey.get('confidence')
for r in results if not r.get('error')
for jersey in r.get('jerseys', [])
if 'confidence' in jersey and jersey.get('confidence') is not None
]
if confidences:
avg_confidence = sum(confidences) / len(confidences)
min_confidence = min(confidences)
max_confidence = max(confidences)
print(f" - Total detections with confidence: {len(confidences)}")
print(f" - Average confidence: {avg_confidence:.2f}")
print(f" - Min confidence: {min_confidence:.2f}")
print(f" - Max confidence: {max_confidence:.2f}")
# Confidence distribution by bucket
print(f"\n Confidence distribution:")
buckets = {
'90-100 (Extremely clear)': (90, 100),
'70-89 (Clear, minor issues)': (70, 89),
'50-69 (Partially visible)': (50, 69),
'30-49 (Difficult to read)': (30, 49),
'0-29 (Very uncertain)': (0, 29)
}
for bucket_name, (min_val, max_val) in buckets.items():
count = sum(1 for c in confidences if min_val <= c <= max_val)
percentage = (count / len(confidences) * 100) if len(confidences) > 0 else 0
bar_length = int(percentage / 2) # Scale to max 50 chars
bar = '█' * bar_length  # simple text histogram bar
print(f" {bucket_name}: {count:3d} ({percentage:5.1f}%) {bar}")
# List errors if any
if images_with_errors > 0:
print(f"\nErrors encountered:")
for r in results:
if r.get('error'):
print(f" - {Path(r['image_path']).name}: {r['error']}")
print()
def main():
"""Main entry point for the test script."""
# Get default server URL from config
default_server_url = get_llama_server_url_from_config() or 'http://192.168.1.34:8080'
parser = argparse.ArgumentParser(
description='Test jersey detection with different models and prompts',
formatter_class=argparse.RawDescriptionHelpFormatter
)
parser.add_argument('image_directory', help='Path to directory containing test images')
parser.add_argument('prompt_file', help='Path to text file containing the prompt')
parser.add_argument('--model-name', default=None,
help='Name of the model being tested (auto-detected from server if not provided)')
parser.add_argument('--server-url', default=default_server_url,
help=f'llama.cpp server URL (default: {default_server_url})')
parser.add_argument('--output-file', default='jersey_detection_results.jsonl',
help='Output file for results (default: jersey_detection_results.jsonl)')
parser.add_argument('--resize', type=int, default=None, metavar='MAX_SIZE',
help='Resize images to maximum dimension (e.g., 1024) before processing')
parser.add_argument('--model-tag', default=None,
help='Model tag for llama-swap (e.g., "qwen2.5-vl-7b"). If not specified, uses whatever model is loaded.')
args = parser.parse_args()
# Validate inputs
if not os.path.isdir(args.image_directory):
print(f"Error: Directory not found: {args.image_directory}")
sys.exit(1)
if not os.path.isfile(args.prompt_file):
print(f"Error: Prompt file not found: {args.prompt_file}")
sys.exit(1)
# Load prompt
try:
with open(args.prompt_file, 'r') as f:
prompt = f.read()
except Exception as e:
print(f"Error reading prompt file: {e}")
sys.exit(1)
# Print test configuration
print("=" * 70)
print("JERSEY DETECTION TEST")
print("=" * 70)
print(f"Model name: {args.model_name if args.model_name else '(auto-detect)'}")
print(f"Model tag: {args.model_tag if args.model_tag else 'None (use loaded model)'}")
print(f"Server URL: {args.server_url}")
print(f"Image directory: {args.image_directory}")
print(f"Prompt file: {args.prompt_file}")
print(f"Prompt length: {len(prompt)} characters")
print(f"Output file: {args.output_file}")
print(f"Resize images: {f'Yes (max: {args.resize}px)' if args.resize else 'No'}")
print("=" * 70)
print()
# Check server health
print("Checking server health...")
try:
client = LlamaCppClient(base_url=args.server_url)
# Try health check (handle both JSON and plain text responses)
try:
health = client.health_check()
print(f"✓ Server is healthy: {health}")
except json.JSONDecodeError:
# llama-swap returns plain text "OK" instead of JSON
response = requests.get(f"{args.server_url}/health")
response.raise_for_status()
print(f"✓ Server is healthy: {response.text}")
# Determine model name to use
model_name = args.model_name
# If model_tag is provided, use it as the model name (unless user explicitly provided a model_name)
if args.model_tag and not args.model_name:
model_name = args.model_tag
print(f"✓ Using model tag as model name: {model_name}")
elif not model_name:
# Only auto-detect if neither model_tag nor model_name was provided
detected_model_name = None
try:
models = client.get_models()
if 'data' in models and len(models['data']) > 0:
model_id = models['data'][0].get('id', 'unknown')
print(f"✓ Active model: {model_id}")
# Extract just the model filename (without path)
if model_id and model_id != 'unknown':
# Remove path and get base filename
model_filename = os.path.basename(model_id)
# Remove common extensions (.gguf, .bin, etc.)
model_name_no_ext = os.path.splitext(model_filename)[0]
detected_model_name = model_name_no_ext
except Exception:
pass
if detected_model_name:
model_name = detected_model_name
print(f"✓ Using auto-detected model name: {model_name}")
else:
model_name = "unknown"
print(f"⚠ Could not detect model name, using 'unknown'")
else:
# User explicitly provided model_name
print(f"✓ Using provided model name: {model_name}")
except Exception as e:
print(f"❌ Failed to connect to server: {e}")
print(f"Make sure llama.cpp server is running at {args.server_url}")
sys.exit(1)
print()
# Show model tag info if using llama-swap
if args.model_tag:
print(f"Requesting model from llama-swap: {args.model_tag}")
# Check currently running models on llama-swap
try:
running_response = requests.get(f"{args.server_url}/running")
if running_response.status_code == 200:
try:
running_models = running_response.json()
if running_models:
print(f"Currently running models: {running_models}")
except Exception:
pass
except Exception:
pass
print()
# Run tests
tester = JerseyDetectionTester(args.server_url, prompt, model_name, args.resize, args.model_tag)
results = tester.test_directory(args.image_directory)
# Print summary
if results:
tester.print_summary(results)
# Save results to file
tester.save_results_to_file(results, args.prompt_file, args.output_file)
if __name__ == '__main__':
main()