Initial commit: Jersey detection test suite

Test scripts and utilities for evaluating vision-language models on jersey number detection using a llama.cpp server.
2026-01-20 13:37:01 -07:00
commit 8706edcd13
14 changed files with 3080 additions and 0 deletions

scan_utils/__init__.py Normal file
@@ -0,0 +1 @@
# Jersey detection scan utilities

@@ -0,0 +1,149 @@
import json
import cv2
import numpy as np
from typing import Dict, Any, Optional
import logging
# Read the default jersey detection prompt
try:
with open('jersey_prompt.txt', 'r') as f:
DEFAULT_JERSEY_PROMPT = f.read()
except FileNotFoundError:
# Fallback prompt if file is not found
DEFAULT_JERSEY_PROMPT = """You are an expert at detecting sports jerseys in images. Carefully examine the provided image and identify all visible sports jerseys.
CRITICAL INSTRUCTIONS:
1. ONLY detect jerseys that are CLEARLY VISIBLE in the image
2. ONLY include jersey numbers that you can ACTUALLY READ in the image
3. If you CANNOT see any jerseys, you MUST return {"jerseys": []}
4. DO NOT make up, imagine, or guess jersey numbers that aren't visible
5. DO NOT include jerseys if you cannot clearly see the number
RESPONSE FORMAT:
Respond ONLY with a valid JSON object. No explanations, no markdown, no extra text.
Use DOUBLE QUOTES (") for all JSON keys and string values.
The JSON must have a single key "jerseys" with an array of dictionaries.
Each dictionary must have exactly these three keys:
- "jersey_number": The number on the jersey (as a string, only if clearly visible)
- "jersey_color": The primary color of the jersey
- "number_color": The color of the number on the jersey
Example response for an image WITH visible jerseys:
{
"jerseys": [
{
"jersey_number": "101",
"jersey_color": "red",
"number_color": "white"
}
]
}
Example response for an image WITHOUT jerseys or with unclear numbers:
{"jerseys": []}
REMEMBER: Only include jerseys with numbers you can ACTUALLY SEE in the image. When in doubt, return empty array.
Now analyze the image and return the JSON object."""
class DetectJerseys:
"""A class for detecting sports jerseys using a vision language model."""
def __init__(self, llama_cpp_base_url: str = "http://192.168.1.34:8080", logger: Optional[logging.Logger] = None, prompt: Optional[str] = None):
"""
Initialize the jersey detection class.
Args:
llama_cpp_base_url: Base URL for the llama.cpp server
logger: Logger instance for logging messages
prompt: Custom prompt to use for jersey detection (optional)
"""
self.logger = logger or logging.getLogger(__name__)
self.prompt = prompt or DEFAULT_JERSEY_PROMPT
# Import here to avoid circular dependencies
try:
from scan_utils.llama_cpp_client import LlamaCppClient
self.client = LlamaCppClient(base_url=llama_cpp_base_url)
self.logger.info(f"Jersey detection initialized with llama.cpp server at {llama_cpp_base_url}")
except ImportError as e:
self.logger.error(f"Failed to import LlamaCppClient: {e}")
raise
def detect(self, image: np.ndarray, temperature: float = 0.1) -> Dict[str, Any]:
"""
Detect jerseys in an image using the vision language model.
Args:
image: OpenCV image (numpy array) to analyze
temperature: Temperature value for the model (default: 0.1)
Returns:
Dictionary containing detected jerseys or empty dict if invalid
"""
try:
# Create multimodal message with image and prompt
message = self.client.create_multimodal_message(
role="user",
content=self.prompt,
images=[image]
)
# Send chat completion request
response = self.client.chat_completion(
messages=[message],
temperature=temperature,
max_tokens=1000
)
# Extract the response text
if 'choices' in response and len(response['choices']) > 0:
response_text = response['choices'][0]['message']['content']
# Log the raw response for debugging
self.logger.debug(f"Raw VLM response: {response_text}")
# Parse JSON response
try:
result = json.loads(response_text)
# Process jerseys to ensure they have all required fields
jerseys = result.get('jerseys', [])
                    # Hallucination detection: filter out the example numbers used in
                    # the prompt. The prompt's examples deliberately use numbers above
                    # 100, which real jerseys rarely carry, so filtering them will not
                    # remove valid detections.
                    HALLUCINATION_NUMBERS = {'101', '102', '103', '142', '199'}
processed_jerseys = []
for jersey in jerseys:
jersey_number = jersey.get('jersey_number', '')
# Check for hallucination (model returning example numbers)
if jersey_number in HALLUCINATION_NUMBERS:
self.logger.warning(f"Possible hallucination detected - jersey number {jersey_number} matches example pattern. Filtering out.")
continue
# Ensure all required fields are present
processed_jersey = {
'jersey_number': jersey_number,
'jersey_color': jersey.get('jersey_color', ''),
'number_color': jersey.get('number_color', 'unknown') # Default to 'unknown' if missing
}
processed_jerseys.append(processed_jersey)
return {"jerseys": processed_jerseys}
except json.JSONDecodeError as e:
self.logger.error(f"Failed to parse JSON response: {e}")
self.logger.debug(f"Response text was: {response_text}")
return {"jerseys": []}
else:
self.logger.warning("Empty response from VLM")
return {"jerseys": []}
except Exception as e:
self.logger.error(f"Error during jersey detection: {e}")
return {"jerseys": []}

scan_utils/llama_cpp_client.py Normal file

@@ -0,0 +1,237 @@
import base64
import json
import os
import cv2
import numpy as np
import requests
from typing import List, Dict, Any, Optional, Union
class LlamaCppClient:
"""A Python client for interacting with a llama.cpp server."""
def __init__(self, base_url: str = "http://192.168.1.34:8080"):
"""
Initialize the client with the base URL of the llama.cpp server.
Args:
base_url: The base URL of the llama.cpp server (default: http://192.168.1.34:8080)
"""
self.base_url = base_url.rstrip('/')
def health_check(self) -> Dict[str, Any]:
"""
Check the health status of the server.
Returns:
Health status response from the server
"""
response = requests.get(f"{self.base_url}/health")
response.raise_for_status()
return response.json()
def get_models(self) -> Dict[str, Any]:
"""
Get information about loaded models.
Returns:
Model information from the server
"""
response = requests.get(f"{self.base_url}/v1/models")
response.raise_for_status()
return response.json()
def chat_completion(
self,
messages: List[Dict[str, Any]],
temperature: float = 0.1,
min_p: float = 0.15,
repetition_penalty: float = 1.05,
min_image_tokens: int = 64,
max_image_tokens: int = 256,
do_image_splitting: bool = True,
max_tokens: int = -1,
stream: bool = False,
**kwargs
) -> Union[Dict[str, Any], requests.Response]:
"""
Generate a chat completion using the OpenAI-compatible API.
Args:
messages: List of message dictionaries with role and content
temperature: Sampling temperature (default: 0.1)
min_p: Minimum probability for sampling (default: 0.15)
repetition_penalty: Repetition penalty factor (default: 1.05)
min_image_tokens: Minimum image tokens (default: 64)
max_image_tokens: Maximum image tokens (default: 256)
do_image_splitting: Whether to split images (default: True)
            max_tokens: Maximum tokens to generate (default: -1 for unlimited)
stream: Whether to stream the response (default: False)
**kwargs: Additional parameters for the completion
Returns:
Completion response or streaming response
"""
payload = {
"messages": messages,
"temperature": temperature,
"min_p": min_p,
"repetition_penalty": repetition_penalty,
"min_image_tokens": min_image_tokens,
"max_image_tokens": max_image_tokens,
"do_image_splitting": do_image_splitting,
"max_tokens": max_tokens,
"cache_prompt": True,
"stream": stream,
**kwargs
}
        # Debug: show the requested model when llama-swap debugging is enabled
        if os.environ.get('DEBUG_LLAMA_SWAP') and payload.get('model'):
            print(f"[DEBUG] Requesting model: {payload['model']}")
response = requests.post(
f"{self.base_url}/v1/chat/completions",
headers={"Content-Type": "application/json"},
json=payload,
stream=stream
)
response.raise_for_status()
if stream:
return response
return response.json()
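
    # A consumer-side sketch for stream=True, assuming llama.cpp's
    # OpenAI-style SSE framing ("data: {...}" lines ending with
    # "data: [DONE]"):
    #
    #     resp = client.chat_completion(messages=[msg], stream=True)
    #     for line in resp.iter_lines():
    #         if line.startswith(b"data: ") and line != b"data: [DONE]":
    #             chunk = json.loads(line[len(b"data: "):])
    #             print(chunk["choices"][0]["delta"].get("content", ""), end="")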
def completion(
self,
prompt: Union[str, List[Union[str, int]]],
temperature: float = 0.1,
min_p: float = 0.15,
repetition_penalty: float = 1.05,
min_image_tokens: int = 64,
max_image_tokens: int = 256,
do_image_splitting: bool = True,
max_tokens: int = -1,
stream: bool = False,
**kwargs
) -> Union[Dict[str, Any], requests.Response]:
"""
        Generate a completion using llama.cpp's native /completion API (not OpenAI-compatible).
Args:
prompt: The prompt string or list of tokens
temperature: Sampling temperature (default: 0.1)
min_p: Minimum probability for sampling (default: 0.15)
repetition_penalty: Repetition penalty factor (default: 1.05)
min_image_tokens: Minimum image tokens (default: 64)
max_image_tokens: Maximum image tokens (default: 256)
do_image_splitting: Whether to split images (default: True)
            max_tokens: Maximum tokens to generate (default: -1 for unlimited)
stream: Whether to stream the response (default: False)
**kwargs: Additional parameters for the completion
Returns:
Completion response or streaming response
"""
payload = {
"prompt": prompt,
"temperature": temperature,
"min_p": min_p,
"repeat_penalty": repetition_penalty,
"min_image_tokens": min_image_tokens,
"max_image_tokens": max_image_tokens,
"do_image_splitting": do_image_splitting,
"cache_prompt": True,
"n_predict": max_tokens,
"stream": stream,
**kwargs
}
response = requests.post(
f"{self.base_url}/completion",
headers={"Content-Type": "application/json"},
json=payload,
stream=stream
)
response.raise_for_status()
if stream:
return response
return response.json()
@staticmethod
def _encode_image_to_base64(image_path: str) -> str:
"""
Encode an image file to base64 string.
Args:
image_path: Path to the image file
Returns:
Base64 encoded image string
"""
with open(image_path, "rb") as image_file:
return base64.b64encode(image_file.read()).decode('utf-8')
@staticmethod
def _encode_cv2_image_to_base64(image: np.ndarray) -> str:
"""
Encode an OpenCV image to base64 string.
Args:
image: OpenCV image (numpy array)
Returns:
Base64 encoded image string
"""
        success, buffer = cv2.imencode('.jpg', image)
        if not success:
            raise ValueError("Failed to encode image as JPEG")
        return base64.b64encode(buffer).decode('utf-8')
def create_multimodal_message(
self,
role: str,
content: str,
images: Optional[List[Union[str, np.ndarray]]] = None
) -> Dict[str, Any]:
"""
Create a multimodal message with text and images.
Args:
role: Role of the message (system, user, assistant)
content: Text content of the message
images: List of image paths or OpenCV images (numpy arrays)
Returns:
Formatted message dictionary
"""
if not images:
return {"role": role, "content": content}
# Process images
image_data = []
for img in images:
if isinstance(img, str):
# Image path
encoded_image = self._encode_image_to_base64(img)
else:
# OpenCV image
encoded_image = self._encode_cv2_image_to_base64(img)
image_data.append(encoded_image)
# Create multimodal content
multimodal_content = [
{"type": "text", "text": content}
]
for img_data in image_data:
multimodal_content.append({
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{img_data}"
}
})
return {"role": role, "content": multimodal_content}