Initial commit: Jersey detection test suite

Test scripts and utilities for evaluating vision-language models
on jersey number detection using llama.cpp server.
This commit is contained in:
2026-01-20 13:37:01 -07:00
commit 8706edcd13
14 changed files with 3080 additions and 0 deletions

View File

@@ -0,0 +1,149 @@
import json
import cv2
import numpy as np
from typing import Dict, Any, Optional
import logging
# Read the default jersey detection prompt from disk; fall back to a built-in
# prompt when the file is absent so the module always imports cleanly.
try:
    # Explicit UTF-8: without it, open() uses the platform locale encoding and
    # a prompt file containing non-ASCII text could fail or decode incorrectly.
    with open('jersey_prompt.txt', 'r', encoding='utf-8') as f:
        DEFAULT_JERSEY_PROMPT = f.read()
except FileNotFoundError:
    # Fallback prompt if file is not found
    DEFAULT_JERSEY_PROMPT = """You are an expert at detecting sports jerseys in images. Carefully examine the provided image and identify all visible sports jerseys.
CRITICAL INSTRUCTIONS:
1. ONLY detect jerseys that are CLEARLY VISIBLE in the image
2. ONLY include jersey numbers that you can ACTUALLY READ in the image
3. If you CANNOT see any jerseys, you MUST return {"jerseys": []}
4. DO NOT make up, imagine, or guess jersey numbers that aren't visible
5. DO NOT include jerseys if you cannot clearly see the number
RESPONSE FORMAT:
Respond ONLY with a valid JSON object. No explanations, no markdown, no extra text.
Use DOUBLE QUOTES (") for all JSON keys and string values.
The JSON must have a single key "jerseys" with an array of dictionaries.
Each dictionary must have exactly these three keys:
- "jersey_number": The number on the jersey (as a string, only if clearly visible)
- "jersey_color": The primary color of the jersey
- "number_color": The color of the number on the jersey
Example response for an image WITH visible jerseys:
{
    "jerseys": [
        {
            "jersey_number": "101",
            "jersey_color": "red",
            "number_color": "white"
        }
    ]
}
Example response for an image WITHOUT jerseys or with unclear numbers:
{"jerseys": []}
REMEMBER: Only include jerseys with numbers you can ACTUALLY SEE in the image. When in doubt, return empty array.
Now analyze the image and return the JSON object."""
class DetectJerseys:
    """Detect sports jerseys in images using a llama.cpp-hosted vision-language model.

    The model is prompted to return a JSON object of the form
    ``{"jerseys": [{"jersey_number": ..., "jersey_color": ..., "number_color": ...}]}``.
    This class sends the image to the server, parses that response, and
    sanitizes it (required keys, hallucination filtering) before returning it.
    """

    # Example numbers that appear in the default prompt. If the model echoes
    # one of them back it is very likely hallucinating rather than reading the
    # image. Values > 100 are used so that real jersey numbers are never hit.
    _HALLUCINATION_NUMBERS = frozenset({'101', '102', '103', '142', '199'})

    def __init__(self, llama_cpp_base_url: str = "http://192.168.1.34:8080", logger: Optional[logging.Logger] = None, prompt: Optional[str] = None):
        """
        Initialize the jersey detection class.

        Args:
            llama_cpp_base_url: Base URL for the llama.cpp server
            logger: Logger instance for logging messages
            prompt: Custom prompt to use for jersey detection (optional)

        Raises:
            ImportError: If the llama.cpp client wrapper cannot be imported.
        """
        self.logger = logger or logging.getLogger(__name__)
        self.prompt = prompt or DEFAULT_JERSEY_PROMPT
        # Import here to avoid circular dependencies
        try:
            from scan_utils.llama_cpp_client import LlamaCppClient
            self.client = LlamaCppClient(base_url=llama_cpp_base_url)
            self.logger.info(f"Jersey detection initialized with llama.cpp server at {llama_cpp_base_url}")
        except ImportError as e:
            self.logger.error(f"Failed to import LlamaCppClient: {e}")
            raise

    def detect(self, image: np.ndarray, temperature: float = 0.1) -> Dict[str, Any]:
        """
        Detect jerseys in an image using the vision language model.

        Args:
            image: OpenCV image (numpy array) to analyze
            temperature: Temperature value for the model (default: 0.1)

        Returns:
            ``{"jerseys": [...]}`` with sanitized detections, or
            ``{"jerseys": []}`` when the response is empty, malformed, or an
            error occurs. Never raises.
        """
        try:
            # Create multimodal message with image and prompt
            message = self.client.create_multimodal_message(
                role="user",
                content=self.prompt,
                images=[image]
            )
            # Send chat completion request
            response = self.client.chat_completion(
                messages=[message],
                temperature=temperature,
                max_tokens=1000
            )
            # Guard clause: missing or empty 'choices' means the model said nothing.
            if not response.get('choices'):
                self.logger.warning("Empty response from VLM")
                return {"jerseys": []}
            response_text = response['choices'][0]['message']['content']
            # Log the raw response for debugging
            self.logger.debug(f"Raw VLM response: {response_text}")
            try:
                # Strip a markdown code fence first: VLMs often emit one
                # despite being told not to, which would otherwise fail parsing.
                result = json.loads(self._strip_code_fences(response_text))
            except json.JSONDecodeError as e:
                self.logger.error(f"Failed to parse JSON response: {e}")
                self.logger.debug(f"Response text was: {response_text}")
                return {"jerseys": []}
            # json.loads can yield any JSON type; only an object is usable here.
            if not isinstance(result, dict):
                self.logger.warning(f"VLM returned non-object JSON: {type(result).__name__}")
                return {"jerseys": []}
            jerseys = result.get('jerseys', [])
            if not isinstance(jerseys, list):
                self.logger.warning("VLM returned a non-list 'jerseys' field")
                return {"jerseys": []}
            return {"jerseys": self._sanitize_jerseys(jerseys)}
        except Exception as e:
            # logger.exception preserves the traceback, not just the message.
            self.logger.exception(f"Error during jersey detection: {e}")
            return {"jerseys": []}

    @staticmethod
    def _strip_code_fences(text: str) -> str:
        """Remove a surrounding markdown ``` / ```json fence, if present."""
        stripped = text.strip()
        if stripped.startswith("```"):
            stripped = stripped[3:]
            # Drop an optional language tag directly after the opening fence.
            if stripped.lower().startswith("json"):
                stripped = stripped[4:]
            if stripped.endswith("```"):
                stripped = stripped[:-3]
        return stripped.strip()

    def _sanitize_jerseys(self, jerseys: list) -> list:
        """Normalize raw jersey dicts: drop hallucinations, fill missing keys."""
        processed_jerseys = []
        for jersey in jerseys:
            # Skip malformed entries (e.g. bare strings) instead of crashing.
            if not isinstance(jersey, dict):
                self.logger.warning(f"Skipping malformed jersey entry: {jersey!r}")
                continue
            jersey_number = jersey.get('jersey_number', '')
            # Coerce to str so an integer 101 is still caught by the filter.
            jersey_number = '' if jersey_number is None else str(jersey_number)
            # Check for hallucination (model returning example numbers)
            if jersey_number in self._HALLUCINATION_NUMBERS:
                self.logger.warning(f"Possible hallucination detected - jersey number {jersey_number} matches example pattern. Filtering out.")
                continue
            # Ensure all required fields are present
            processed_jerseys.append({
                'jersey_number': jersey_number,
                'jersey_color': jersey.get('jersey_color', ''),
                'number_color': jersey.get('number_color', 'unknown')  # Default to 'unknown' if missing
            })
        return processed_jerseys