Initial commit: Jersey detection test suite

Test scripts and utilities for evaluating vision-language models
on jersey number detection using llama.cpp server.
This commit is contained in:
2026-01-20 13:37:01 -07:00
commit 8706edcd13
14 changed files with 3080 additions and 0 deletions

View File

@ -0,0 +1,6 @@
{"timestamp": "2025-10-19T19:30:44.272849", "model_name": "LFM2-VL", "prompt_file": "jersey_prompt_with_confidence.txt", "prompt_length": 2134, "total_images": 198, "images_with_jerseys": 88, "images_without_jerseys": 110, "images_with_errors": 0, "total_raw_detections": 470, "total_valid_jerseys": 235, "total_hallucinated": 235, "avg_processing_time": 4.607636096501591, "total_processing_time": 912.3119471073151, "confidence_stats": {"avg": 84.14893617021276, "min": 0, "max": 100, "count": 235, "distribution": {"90-100": 138, "70-89": 70, "50-69": 8, "30-49": 8, "0-29": 11}}, "empty_response_capable": true}
{"timestamp": "2025-10-19T22:10:05.135029", "model_name": "ggml-org_Kimi-VL-A3B-Thinking-2506-GGUF_Kimi-VL-A3B-Thinking-2506-bf16", "prompt_file": "jersey_prompt_with_confidence.txt", "prompt_length": 2134, "total_images": 198, "images_with_jerseys": 28, "images_without_jerseys": 163, "images_with_errors": 7, "total_raw_detections": 49, "total_valid_jerseys": 49, "total_hallucinated": 0, "avg_processing_time": 29.11009831259949, "total_processing_time": 5763.799465894699, "confidence_stats": {"avg": 88.85714285714286, "min": 60, "max": 95, "count": 49, "distribution": {"90-100": 37, "70-89": 9, "50-69": 3, "30-49": 0, "0-29": 0}}, "empty_response_capable": true}
{"timestamp": "2025-10-20T01:20:31.076468", "model_name": "unsloth_Mistral-Small-3.2-24B-Instruct-2506-GGUF_Mistral-Small-3.2-24B-Instruct-2506-BF16", "prompt_file": "jersey_prompt_with_confidence.txt", "prompt_length": 2134, "total_images": 198, "images_with_jerseys": 197, "images_without_jerseys": 1, "images_with_errors": 0, "total_raw_detections": 494, "total_valid_jerseys": 494, "total_hallucinated": 0, "avg_processing_time": 37.221905313356956, "total_processing_time": 7369.937252044678, "confidence_stats": {"avg": 90.81983805668017, "min": 70, "max": 95, "count": 494, "distribution": {"90-100": 362, "70-89": 132, "50-69": 0, "30-49": 0, "0-29": 0}}, "empty_response_capable": true}
{"timestamp": "2025-10-20T12:04:37.833650", "model_name": "unsloth_Mistral-Small-3.2-24B-Instruct-2506-GGUF_Mistral-Small-3.2-24B-Instruct-2506-UD-Q8_K_XL", "prompt_file": "jersey_prompt_with_confidence.txt", "prompt_length": 2134, "total_images": 198, "images_with_jerseys": 197, "images_without_jerseys": 1, "images_with_errors": 0, "total_raw_detections": 496, "total_valid_jerseys": 496, "total_hallucinated": 0, "avg_processing_time": 20.684308366342023, "total_processing_time": 4095.493056535721, "confidence_stats": {"avg": 90.76612903225806, "min": 70, "max": 95, "count": 496, "distribution": {"90-100": 363, "70-89": 133, "50-69": 0, "30-49": 0, "0-29": 0}}, "empty_response_capable": true}
{"timestamp": "2025-10-20T13:01:42.747694", "model_name": "unsloth_Mistral-Small-3.2-24B-Instruct-2506-GGUF_Mistral-Small-3.2-24B-Instruct-2506-UD-Q4_K_XL", "prompt_file": "jersey_prompt_with_confidence.txt", "prompt_length": 2134, "total_images": 198, "images_with_jerseys": 197, "images_without_jerseys": 1, "images_with_errors": 0, "total_raw_detections": 494, "total_valid_jerseys": 494, "total_hallucinated": 0, "avg_processing_time": 14.196594772916852, "total_processing_time": 2810.9257650375366, "confidence_stats": {"avg": 92.09514170040485, "min": 80, "max": 95, "count": 494, "distribution": {"90-100": 415, "70-89": 79, "50-69": 0, "30-49": 0, "0-29": 0}}, "empty_response_capable": true}
{"timestamp": "2025-10-20T15:01:25.669340", "model_name": "unsloth_gemma-3-27b-it-GGUF_gemma-3-27b-it-Q8_0", "prompt_file": "jersey_prompt_with_confidence.txt", "prompt_length": 2134, "total_images": 198, "images_with_jerseys": 185, "images_without_jerseys": 13, "images_with_errors": 0, "total_raw_detections": 428, "total_valid_jerseys": 428, "total_hallucinated": 0, "avg_processing_time": 18.127051142731098, "total_processing_time": 3589.1561262607574, "confidence_stats": {"avg": 87.14953271028037, "min": 55, "max": 100, "count": 428, "distribution": {"90-100": 250, "70-89": 166, "50-69": 12, "30-49": 0, "0-29": 0}}, "empty_response_capable": true}