Initial commit: Jersey detection test suite

Test scripts and utilities for evaluating vision-language models on jersey number detection using a llama.cpp server.
2026-01-20 13:37:01 -07:00
commit 8706edcd13
14 changed files with 3080 additions and 0 deletions

scan_utils/__init__.py Normal file
@@ -0,0 +1 @@
# Jersey detection scan utilities

@@ -0,0 +1,149 @@
import json
import cv2
import numpy as np
from typing import Dict, Any, Optional
import logging
# Read the default jersey detection prompt
try:
with open('jersey_prompt.txt', 'r') as f:
DEFAULT_JERSEY_PROMPT = f.read()
except FileNotFoundError:
# Fallback prompt if file is not found
DEFAULT_JERSEY_PROMPT = """You are an expert at detecting sports jerseys in images. Carefully examine the provided image and identify all visible sports jerseys.
CRITICAL INSTRUCTIONS:
1. ONLY detect jerseys that are CLEARLY VISIBLE in the image
2. ONLY include jersey numbers that you can ACTUALLY READ in the image
3. If you CANNOT see any jerseys, you MUST return {"jerseys": []}
4. DO NOT make up, imagine, or guess jersey numbers that aren't visible
5. DO NOT include jerseys if you cannot clearly see the number
RESPONSE FORMAT:
Respond ONLY with a valid JSON object. No explanations, no markdown, no extra text.
Use DOUBLE QUOTES (") for all JSON keys and string values.
The JSON must have a single key "jerseys" with an array of dictionaries.
Each dictionary must have exactly these three keys:
- "jersey_number": The number on the jersey (as a string, only if clearly visible)
- "jersey_color": The primary color of the jersey
- "number_color": The color of the number on the jersey
Example response for an image WITH visible jerseys:
{
"jerseys": [
{
"jersey_number": "101",
"jersey_color": "red",
"number_color": "white"
}
]
}
Example response for an image WITHOUT jerseys or with unclear numbers:
{"jerseys": []}
REMEMBER: Only include jerseys with numbers you can ACTUALLY SEE in the image. When in doubt, return empty array.
Now analyze the image and return the JSON object."""
class DetectJerseys:
"""A class for detecting sports jerseys using a vision language model."""
def __init__(self, llama_cpp_base_url: str = "http://192.168.1.34:8080", logger: Optional[logging.Logger] = None, prompt: Optional[str] = None):
"""
Initialize the jersey detection class.
Args:
llama_cpp_base_url: Base URL for the llama.cpp server
logger: Logger instance for logging messages
prompt: Custom prompt to use for jersey detection (optional)
"""
self.logger = logger or logging.getLogger(__name__)
self.prompt = prompt or DEFAULT_JERSEY_PROMPT
# Import here to avoid circular dependencies
try:
from scan_utils.llama_cpp_client import LlamaCppClient
self.client = LlamaCppClient(base_url=llama_cpp_base_url)
self.logger.info(f"Jersey detection initialized with llama.cpp server at {llama_cpp_base_url}")
except ImportError as e:
self.logger.error(f"Failed to import LlamaCppClient: {e}")
raise
def detect(self, image: np.ndarray, temperature: float = 0.1) -> Dict[str, Any]:
"""
Detect jerseys in an image using the vision language model.
Args:
image: OpenCV image (numpy array) to analyze
temperature: Temperature value for the model (default: 0.1)
Returns:
Dictionary containing detected jerseys or empty dict if invalid
"""
try:
# Create multimodal message with image and prompt
message = self.client.create_multimodal_message(
role="user",
content=self.prompt,
images=[image]
)
# Send chat completion request
response = self.client.chat_completion(
messages=[message],
temperature=temperature,
max_tokens=1000
)
# Extract the response text
if 'choices' in response and len(response['choices']) > 0:
response_text = response['choices'][0]['message']['content']
# Log the raw response for debugging
self.logger.debug(f"Raw VLM response: {response_text}")
# Parse JSON response
try:
result = json.loads(response_text)
# Process jerseys to ensure they have all required fields
jerseys = result.get('jerseys', [])
                    # Hallucination detection: filter out the example numbers used in
                    # the prompt. The prompt's examples deliberately use numbers above
                    # 100, which real jerseys rarely carry, so filtering them will not
                    # remove valid detections.
                    HALLUCINATION_NUMBERS = {'101', '102', '103', '142', '199'}
processed_jerseys = []
for jersey in jerseys:
jersey_number = jersey.get('jersey_number', '')
# Check for hallucination (model returning example numbers)
if jersey_number in HALLUCINATION_NUMBERS:
self.logger.warning(f"Possible hallucination detected - jersey number {jersey_number} matches example pattern. Filtering out.")
continue
# Ensure all required fields are present
processed_jersey = {
'jersey_number': jersey_number,
'jersey_color': jersey.get('jersey_color', ''),
'number_color': jersey.get('number_color', 'unknown') # Default to 'unknown' if missing
}
processed_jerseys.append(processed_jersey)
return {"jerseys": processed_jerseys}
except json.JSONDecodeError as e:
self.logger.error(f"Failed to parse JSON response: {e}")
self.logger.debug(f"Response text was: {response_text}")
return {"jerseys": []}
else:
self.logger.warning("Empty response from VLM")
return {"jerseys": []}
except Exception as e:
self.logger.error(f"Error during jersey detection: {e}")
return {"jerseys": []}

scan_utils/llama_cpp_client.py Normal file

@@ -0,0 +1,237 @@
import base64
import json
import os
import cv2
import numpy as np
import requests
from typing import List, Dict, Any, Optional, Union
class LlamaCppClient:
"""A Python client for interacting with a llama.cpp server."""
def __init__(self, base_url: str = "http://192.168.1.34:8080"):
"""
Initialize the client with the base URL of the llama.cpp server.
Args:
base_url: The base URL of the llama.cpp server (default: http://192.168.1.34:8080)
"""
self.base_url = base_url.rstrip('/')
def health_check(self) -> Dict[str, Any]:
"""
Check the health status of the server.
Returns:
Health status response from the server
"""
response = requests.get(f"{self.base_url}/health")
response.raise_for_status()
return response.json()
def get_models(self) -> Dict[str, Any]:
"""
Get information about loaded models.
Returns:
Model information from the server
"""
response = requests.get(f"{self.base_url}/v1/models")
response.raise_for_status()
return response.json()
def chat_completion(
self,
messages: List[Dict[str, Any]],
temperature: float = 0.1,
min_p: float = 0.15,
repetition_penalty: float = 1.05,
min_image_tokens: int = 64,
max_image_tokens: int = 256,
do_image_splitting: bool = True,
max_tokens: int = -1,
stream: bool = False,
**kwargs
) -> Union[Dict[str, Any], requests.Response]:
"""
Generate a chat completion using the OpenAI-compatible API.
Args:
messages: List of message dictionaries with role and content
temperature: Sampling temperature (default: 0.1)
min_p: Minimum probability for sampling (default: 0.15)
repetition_penalty: Repetition penalty factor (default: 1.05)
min_image_tokens: Minimum image tokens (default: 64)
max_image_tokens: Maximum image tokens (default: 256)
do_image_splitting: Whether to split images (default: True)
            max_tokens: Maximum tokens to generate (default: -1 for unlimited)
stream: Whether to stream the response (default: False)
**kwargs: Additional parameters for the completion
Returns:
Completion response or streaming response
"""
payload = {
"messages": messages,
"temperature": temperature,
"min_p": min_p,
"repetition_penalty": repetition_penalty,
"min_image_tokens": min_image_tokens,
"max_image_tokens": max_image_tokens,
"do_image_splitting": do_image_splitting,
"max_tokens": max_tokens,
"cache_prompt": True,
"stream": stream,
**kwargs
}
        # Debug: show the requested model when llama-swap debugging is enabled
        if os.environ.get('DEBUG_LLAMA_SWAP') and payload.get('model'):
            print(f"[DEBUG] Requesting model: {payload['model']}")
response = requests.post(
f"{self.base_url}/v1/chat/completions",
headers={"Content-Type": "application/json"},
json=payload,
stream=stream
)
response.raise_for_status()
if stream:
return response
return response.json()
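
    # A consumer-side sketch for stream=True, assuming llama.cpp's
    # OpenAI-style SSE framing ("data: {...}" lines ending with
    # "data: [DONE]"):
    #
    #     resp = client.chat_completion(messages=[msg], stream=True)
    #     for line in resp.iter_lines():
    #         if line.startswith(b"data: ") and line != b"data: [DONE]":
    #             chunk = json.loads(line[len(b"data: "):])
    #             print(chunk["choices"][0]["delta"].get("content", ""), end="")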
def completion(
self,
prompt: Union[str, List[Union[str, int]]],
temperature: float = 0.1,
min_p: float = 0.15,
repetition_penalty: float = 1.05,
min_image_tokens: int = 64,
max_image_tokens: int = 256,
do_image_splitting: bool = True,
max_tokens: int = -1,
stream: bool = False,
**kwargs
) -> Union[Dict[str, Any], requests.Response]:
"""
        Generate a completion using llama.cpp's native /completion API (not OpenAI-compatible).
Args:
prompt: The prompt string or list of tokens
temperature: Sampling temperature (default: 0.1)
min_p: Minimum probability for sampling (default: 0.15)
repetition_penalty: Repetition penalty factor (default: 1.05)
min_image_tokens: Minimum image tokens (default: 64)
max_image_tokens: Maximum image tokens (default: 256)
do_image_splitting: Whether to split images (default: True)
            max_tokens: Maximum tokens to generate (default: -1 for unlimited)
stream: Whether to stream the response (default: False)
**kwargs: Additional parameters for the completion
Returns:
Completion response or streaming response
"""
payload = {
"prompt": prompt,
"temperature": temperature,
"min_p": min_p,
"repeat_penalty": repetition_penalty,
"min_image_tokens": min_image_tokens,
"max_image_tokens": max_image_tokens,
"do_image_splitting": do_image_splitting,
"cache_prompt": True,
"n_predict": max_tokens,
"stream": stream,
**kwargs
}
response = requests.post(
f"{self.base_url}/completion",
headers={"Content-Type": "application/json"},
json=payload,
stream=stream
)
response.raise_for_status()
if stream:
return response
return response.json()
@staticmethod
def _encode_image_to_base64(image_path: str) -> str:
"""
Encode an image file to base64 string.
Args:
image_path: Path to the image file
Returns:
Base64 encoded image string
"""
with open(image_path, "rb") as image_file:
return base64.b64encode(image_file.read()).decode('utf-8')
@staticmethod
def _encode_cv2_image_to_base64(image: np.ndarray) -> str:
"""
Encode an OpenCV image to base64 string.
Args:
image: OpenCV image (numpy array)
Returns:
Base64 encoded image string
"""
        success, buffer = cv2.imencode('.jpg', image)
        if not success:
            raise ValueError("Failed to encode image as JPEG")
        return base64.b64encode(buffer).decode('utf-8')
def create_multimodal_message(
self,
role: str,
content: str,
images: Optional[List[Union[str, np.ndarray]]] = None
) -> Dict[str, Any]:
"""
Create a multimodal message with text and images.
Args:
role: Role of the message (system, user, assistant)
content: Text content of the message
images: List of image paths or OpenCV images (numpy arrays)
Returns:
Formatted message dictionary
"""
if not images:
return {"role": role, "content": content}
# Process images
image_data = []
for img in images:
if isinstance(img, str):
# Image path
encoded_image = self._encode_image_to_base64(img)
else:
# OpenCV image
encoded_image = self._encode_cv2_image_to_base64(img)
image_data.append(encoded_image)
# Create multimodal content
multimodal_content = [
{"type": "text", "text": content}
]
for img_data in image_data:
multimodal_content.append({
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{img_data}"
}
})
return {"role": role, "content": multimodal_content}