Initial commit: Jersey detection test suite
Test scripts and utilities for evaluating vision-language models on jersey number detection using llama.cpp server.
This commit is contained in:
1
scan_utils/__init__.py
Normal file
1
scan_utils/__init__.py
Normal file
@ -0,0 +1 @@
|
||||
# Jersey detection scan utilities
|
||||
149
scan_utils/jersey_detection.py
Normal file
149
scan_utils/jersey_detection.py
Normal file
@ -0,0 +1,149 @@
|
||||
import json
|
||||
import cv2
|
||||
import numpy as np
|
||||
from typing import Dict, Any, Optional
|
||||
import logging
|
||||
|
||||
# Read the default jersey detection prompt from disk, falling back to the
# built-in prompt below when the file is absent (e.g. running from a
# different working directory).
try:
    # Explicit encoding: the default is locale-dependent, which could change
    # how the prompt file is decoded between machines.
    with open('jersey_prompt.txt', 'r', encoding='utf-8') as f:
        DEFAULT_JERSEY_PROMPT = f.read()
except FileNotFoundError:
    # Fallback prompt if file is not found
    DEFAULT_JERSEY_PROMPT = """You are an expert at detecting sports jerseys in images. Carefully examine the provided image and identify all visible sports jerseys.

CRITICAL INSTRUCTIONS:
1. ONLY detect jerseys that are CLEARLY VISIBLE in the image
2. ONLY include jersey numbers that you can ACTUALLY READ in the image
3. If you CANNOT see any jerseys, you MUST return {"jerseys": []}
4. DO NOT make up, imagine, or guess jersey numbers that aren't visible
5. DO NOT include jerseys if you cannot clearly see the number

RESPONSE FORMAT:
Respond ONLY with a valid JSON object. No explanations, no markdown, no extra text.

Use DOUBLE QUOTES (") for all JSON keys and string values.

The JSON must have a single key "jerseys" with an array of dictionaries.

Each dictionary must have exactly these three keys:
- "jersey_number": The number on the jersey (as a string, only if clearly visible)
- "jersey_color": The primary color of the jersey
- "number_color": The color of the number on the jersey

Example response for an image WITH visible jerseys:
{
    "jerseys": [
        {
            "jersey_number": "101",
            "jersey_color": "red",
            "number_color": "white"
        }
    ]
}

Example response for an image WITHOUT jerseys or with unclear numbers:
{"jerseys": []}

REMEMBER: Only include jerseys with numbers you can ACTUALLY SEE in the image. When in doubt, return empty array.

Now analyze the image and return the JSON object."""
|
||||
|
||||
|
||||
class DetectJerseys:
    """A class for detecting sports jerseys using a vision language model."""

    def __init__(self, llama_cpp_base_url: str = "http://192.168.1.34:8080", logger: Optional[logging.Logger] = None, prompt: Optional[str] = None):
        """
        Initialize the jersey detection class.

        Args:
            llama_cpp_base_url: Base URL for the llama.cpp server
            logger: Logger instance for logging messages
            prompt: Custom prompt to use for jersey detection (optional)

        Raises:
            ImportError: If the llama.cpp client module cannot be imported.
        """
        self.logger = logger or logging.getLogger(__name__)
        self.prompt = prompt or DEFAULT_JERSEY_PROMPT

        # Import here to avoid circular dependencies
        try:
            from scan_utils.llama_cpp_client import LlamaCppClient
            self.client = LlamaCppClient(base_url=llama_cpp_base_url)
            self.logger.info(f"Jersey detection initialized with llama.cpp server at {llama_cpp_base_url}")
        except ImportError as e:
            self.logger.error(f"Failed to import LlamaCppClient: {e}")
            raise

    @staticmethod
    def _strip_markdown_fences(text: str) -> str:
        """Remove a surrounding markdown code fence (``` or ```json) if present.

        VLMs sometimes wrap their JSON answer in a fenced code block despite
        being told not to, which would otherwise make json.loads fail.
        """
        cleaned = text.strip()
        if cleaned.startswith("```"):
            # Drop the opening fence line (may carry a language tag like ```json)
            newline = cleaned.find("\n")
            cleaned = cleaned[newline + 1:] if newline != -1 else cleaned.lstrip("`")
            if cleaned.rstrip().endswith("```"):
                cleaned = cleaned.rstrip()[:-3]
        return cleaned

    def detect(self, image: np.ndarray, temperature: float = 0.1) -> Dict[str, Any]:
        """
        Detect jerseys in an image using the vision language model.

        Args:
            image: OpenCV image (numpy array) to analyze
            temperature: Temperature value for the model (default: 0.1)

        Returns:
            Dictionary with key "jerseys" mapping to a list of dicts, each with
            string keys 'jersey_number', 'jersey_color' and 'number_color'.
            Returns {"jerseys": []} on any parse or transport failure.
        """
        try:
            # Create multimodal message with image and prompt
            message = self.client.create_multimodal_message(
                role="user",
                content=self.prompt,
                images=[image]
            )

            # Send chat completion request
            response = self.client.chat_completion(
                messages=[message],
                temperature=temperature,
                max_tokens=1000
            )

            # Extract the response text
            if 'choices' in response and len(response['choices']) > 0:
                response_text = response['choices'][0]['message']['content']

                # Log the raw response for debugging
                self.logger.debug(f"Raw VLM response: {response_text}")

                # Parse JSON response (after removing any markdown fencing)
                try:
                    result = json.loads(self._strip_markdown_fences(response_text))

                    # Process jerseys to ensure they have all required fields;
                    # tolerate a malformed top-level value that is not a dict.
                    jerseys = result.get('jerseys', []) if isinstance(result, dict) else []

                    # Hallucination detection: filter out example numbers from the prompt
                    # Using numbers > 100 as examples to avoid filtering valid jersey numbers
                    HALLUCINATION_NUMBERS = {'101', '102', '103', '142', '199'}

                    processed_jerseys = []
                    for jersey in jerseys:
                        if not isinstance(jersey, dict):
                            # Skip malformed entries rather than crashing
                            continue

                        # Coerce to str: the model may return the number as an
                        # int, which would both dodge the hallucination check
                        # and break the documented "number as string" contract.
                        jersey_number = str(jersey.get('jersey_number', ''))

                        # Check for hallucination (model returning example numbers)
                        if jersey_number in HALLUCINATION_NUMBERS:
                            self.logger.warning(f"Possible hallucination detected - jersey number {jersey_number} matches example pattern. Filtering out.")
                            continue

                        # Ensure all required fields are present
                        processed_jersey = {
                            'jersey_number': jersey_number,
                            'jersey_color': jersey.get('jersey_color', ''),
                            'number_color': jersey.get('number_color', 'unknown')  # Default to 'unknown' if missing
                        }
                        processed_jerseys.append(processed_jersey)

                    return {"jerseys": processed_jerseys}
                except json.JSONDecodeError as e:
                    self.logger.error(f"Failed to parse JSON response: {e}")
                    self.logger.debug(f"Response text was: {response_text}")
                    return {"jerseys": []}
            else:
                self.logger.warning("Empty response from VLM")
                return {"jerseys": []}

        except Exception as e:
            # Boundary handler: any transport/client failure degrades to an
            # empty detection rather than propagating.
            self.logger.error(f"Error during jersey detection: {e}")
            return {"jerseys": []}
|
||||
237
scan_utils/llama_cpp_client.py
Normal file
237
scan_utils/llama_cpp_client.py
Normal file
@ -0,0 +1,237 @@
|
||||
import base64
|
||||
import json
|
||||
import cv2
|
||||
import numpy as np
|
||||
import requests
|
||||
from typing import List, Dict, Any, Optional, Union
|
||||
|
||||
|
||||
class LlamaCppClient:
    """A Python client for interacting with a llama.cpp server."""

    def __init__(self, base_url: str = "http://192.168.1.34:8080"):
        """
        Initialize the client with the base URL of the llama.cpp server.

        Args:
            base_url: The base URL of the llama.cpp server (default: http://192.168.1.34:8080)
        """
        # Normalize so endpoint paths can always be appended after a single '/'.
        self.base_url = base_url.rstrip('/')

    def health_check(self, timeout: Optional[float] = None) -> Dict[str, Any]:
        """
        Check the health status of the server.

        Args:
            timeout: Request timeout in seconds; None waits indefinitely
                (the previous behavior).

        Returns:
            Health status response from the server

        Raises:
            requests.HTTPError: If the server responds with an error status.
        """
        response = requests.get(f"{self.base_url}/health", timeout=timeout)
        response.raise_for_status()
        return response.json()

    def get_models(self, timeout: Optional[float] = None) -> Dict[str, Any]:
        """
        Get information about loaded models.

        Args:
            timeout: Request timeout in seconds; None waits indefinitely.

        Returns:
            Model information from the server

        Raises:
            requests.HTTPError: If the server responds with an error status.
        """
        response = requests.get(f"{self.base_url}/v1/models", timeout=timeout)
        response.raise_for_status()
        return response.json()

    def chat_completion(
        self,
        messages: List[Dict[str, Any]],
        temperature: float = 0.1,
        min_p: float = 0.15,
        repetition_penalty: float = 1.05,
        min_image_tokens: int = 64,
        max_image_tokens: int = 256,
        do_image_splitting: bool = True,
        max_tokens: int = -1,
        stream: bool = False,
        timeout: Optional[float] = None,
        **kwargs
    ) -> Union[Dict[str, Any], requests.Response]:
        """
        Generate a chat completion using the OpenAI-compatible API.

        Args:
            messages: List of message dictionaries with role and content
            temperature: Sampling temperature (default: 0.1)
            min_p: Minimum probability for sampling (default: 0.15)
            repetition_penalty: Repetition penalty factor (default: 1.05)
            min_image_tokens: Minimum image tokens (default: 64)
            max_image_tokens: Maximum image tokens (default: 256)
            do_image_splitting: Whether to split images (default: True)
            max_tokens: Maximum tokens to generate (default: -1 for infinity)
            stream: Whether to stream the response (default: False)
            timeout: Request timeout in seconds; None waits indefinitely.
                For streaming this bounds the connect and per-chunk read time.
            **kwargs: Additional parameters forwarded into the request payload

        Returns:
            Completion response dict, or the raw requests.Response when
            stream=True.

        Raises:
            requests.HTTPError: If the server responds with an error status.
        """
        payload = {
            "messages": messages,
            "temperature": temperature,
            "min_p": min_p,
            "repetition_penalty": repetition_penalty,
            "min_image_tokens": min_image_tokens,
            "max_image_tokens": max_image_tokens,
            "do_image_splitting": do_image_splitting,
            "max_tokens": max_tokens,
            "cache_prompt": True,
            "stream": stream,
            **kwargs
        }

        # Debug: Show model parameter if present (for llama-swap debugging)
        if payload.get('model'):
            import os
            if os.environ.get('DEBUG_LLAMA_SWAP'):
                print(f"[DEBUG] Requesting model: {payload['model']}")

        response = requests.post(
            f"{self.base_url}/v1/chat/completions",
            headers={"Content-Type": "application/json"},
            json=payload,
            stream=stream,
            timeout=timeout
        )
        response.raise_for_status()

        if stream:
            return response

        return response.json()

    def completion(
        self,
        prompt: Union[str, List[Union[str, int]]],
        temperature: float = 0.1,
        min_p: float = 0.15,
        repetition_penalty: float = 1.05,
        min_image_tokens: int = 64,
        max_image_tokens: int = 256,
        do_image_splitting: bool = True,
        max_tokens: int = -1,
        stream: bool = False,
        timeout: Optional[float] = None,
        **kwargs
    ) -> Union[Dict[str, Any], requests.Response]:
        """
        Generate a completion using the non-OAI compatible API.

        Args:
            prompt: The prompt string or list of tokens
            temperature: Sampling temperature (default: 0.1)
            min_p: Minimum probability for sampling (default: 0.15)
            repetition_penalty: Repetition penalty factor (default: 1.05)
            min_image_tokens: Minimum image tokens (default: 64)
            max_image_tokens: Maximum image tokens (default: 256)
            do_image_splitting: Whether to split images (default: True)
            max_tokens: Maximum tokens to generate (default: -1 for infinity)
            stream: Whether to stream the response (default: False)
            timeout: Request timeout in seconds; None waits indefinitely.
            **kwargs: Additional parameters forwarded into the request payload

        Returns:
            Completion response dict, or the raw requests.Response when
            stream=True.

        Raises:
            requests.HTTPError: If the server responds with an error status.
        """
        # NOTE: the native endpoint uses "repeat_penalty"/"n_predict" where the
        # OAI-compatible one uses "repetition_penalty"/"max_tokens".
        payload = {
            "prompt": prompt,
            "temperature": temperature,
            "min_p": min_p,
            "repeat_penalty": repetition_penalty,
            "min_image_tokens": min_image_tokens,
            "max_image_tokens": max_image_tokens,
            "do_image_splitting": do_image_splitting,
            "cache_prompt": True,
            "n_predict": max_tokens,
            "stream": stream,
            **kwargs
        }

        response = requests.post(
            f"{self.base_url}/completion",
            headers={"Content-Type": "application/json"},
            json=payload,
            stream=stream,
            timeout=timeout
        )
        response.raise_for_status()

        if stream:
            return response

        return response.json()

    @staticmethod
    def _encode_image_to_base64(image_path: str) -> str:
        """
        Encode an image file to base64 string.

        Args:
            image_path: Path to the image file

        Returns:
            Base64 encoded image string
        """
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')

    @staticmethod
    def _encode_cv2_image_to_base64(image: np.ndarray) -> str:
        """
        Encode an OpenCV image to base64 string.

        Args:
            image: OpenCV image (numpy array)

        Returns:
            Base64 encoded image string

        Raises:
            ValueError: If the image cannot be encoded as JPEG.
        """
        # cv2.imencode reports failure via its boolean return value rather
        # than raising, so the flag must be checked before using the buffer.
        success, buffer = cv2.imencode('.jpg', image)
        if not success:
            raise ValueError("Failed to encode image as JPEG")
        return base64.b64encode(buffer).decode('utf-8')

    def create_multimodal_message(
        self,
        role: str,
        content: str,
        images: Optional[List[Union[str, np.ndarray]]] = None
    ) -> Dict[str, Any]:
        """
        Create a multimodal message with text and images.

        Args:
            role: Role of the message (system, user, assistant)
            content: Text content of the message
            images: List of image paths or OpenCV images (numpy arrays)

        Returns:
            Formatted message dictionary (plain text content when no images
            are supplied, OpenAI-style content parts otherwise)
        """
        if not images:
            return {"role": role, "content": content}

        # Encode each image: strings are treated as file paths, everything
        # else as an in-memory OpenCV image.
        image_data = []
        for img in images:
            if isinstance(img, str):
                encoded_image = self._encode_image_to_base64(img)
            else:
                encoded_image = self._encode_cv2_image_to_base64(img)
            image_data.append(encoded_image)

        # Create multimodal content: text part first, then one image_url
        # part per image as a base64 data URI.
        multimodal_content = [
            {"type": "text", "text": content}
        ]

        for img_data in image_data:
            multimodal_content.append({
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/jpeg;base64,{img_data}"
                }
            })

        return {"role": role, "content": multimodal_content}
|
||||
Reference in New Issue
Block a user