Initial commit: Jersey detection test suite

Test scripts and utilities for evaluating vision-language models
on jersey number detection using llama.cpp server.
This commit is contained in:
2026-01-20 13:37:01 -07:00
commit 8706edcd13
14 changed files with 3080 additions and 0 deletions

263
test_all_models.sh Executable file
View File

@ -0,0 +1,263 @@
#!/bin/bash
# ==============================================================================
# Test All Models Script for Jersey Detection
# ==============================================================================
# This script automatically tests all models defined in llama-swap-config.yaml
# with the jersey detection test suite.
#
# Usage:
# ./test_all_models.sh
# ./test_all_models.sh /path/to/images
# RESIZE=2048 ./test_all_models.sh
# OUTPUT_FILE=custom_results.jsonl ./test_all_models.sh
# ==============================================================================
# Note: We don't use 'set -e' here because we have explicit error handling
# in the test loop and want to give the user the option to continue on failures
# ==============================================================================
# Configuration Variables
# (every setting below can be overridden via the environment; IMAGES_DIR can
# also be supplied as the first positional argument, which takes precedence)
# ==============================================================================
# Image directory containing test images (argv[1] wins over the default)
IMAGES_DIR="${1:-./test_images}"
# Prompt file to use for testing
PROMPT_FILE="${PROMPT_FILE:-jersey_prompt_with_confidence.txt}"
# Resize images to this max dimension in pixels (set to empty string to disable)
RESIZE="${RESIZE:-1024}"
# Output file for results (JSON Lines, appended to by the python test script)
OUTPUT_FILE="${OUTPUT_FILE:-jersey_detection_results.jsonl}"
# llama-swap configuration file (source of the model tags to iterate over)
LLAMA_SWAP_CONFIG="${LLAMA_SWAP_CONFIG:-llama-swap-config.yaml}"
# Server URL
SERVER_URL="${SERVER_URL:-http://localhost:8080}"
# ==============================================================================
# Color codes for output (ANSI escapes consumed by the print_* helpers below)
# ==============================================================================
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
NC='\033[0m' # No Color
# ==============================================================================
# Helper Functions
# ==============================================================================
# Print a banner: a title line framed above and below by separator rules.
# Arguments: $1 - banner text
print_header() {
  local rule="============================================================================"
  printf '%b\n' "${CYAN}${rule}${NC}"
  printf '%b\n' "${CYAN}$1${NC}"
  printf '%b\n' "${CYAN}${rule}${NC}"
}
# Print an informational status line.
print_info() {
  printf '%b\n' "${BLUE}[INFO]${NC} $1"
}
# Print a success status line.
print_success() {
  printf '%b\n' "${GREEN}[SUCCESS]${NC} $1"
}
# Print an error status line (stdout, matching the rest of the script).
print_error() {
  printf '%b\n' "${RED}[ERROR]${NC} $1"
}
# Print a warning status line.
print_warning() {
  printf '%b\n' "${YELLOW}[WARNING]${NC} $1"
}
# ==============================================================================
# Preflight validation: required paths, the test harness, and server reachability
# ==============================================================================
print_header "Jersey Detection - Test All Models"
# The image directory must exist before anything else happens.
if [[ ! -d "$IMAGES_DIR" ]]; then
  print_error "Image directory not found: $IMAGES_DIR"
  echo "Usage: $0 <image_directory>"
  exit 1
fi
# The prompt file is sent with every model request.
if [[ ! -f "$PROMPT_FILE" ]]; then
  print_error "Prompt file not found: $PROMPT_FILE"
  exit 1
fi
# The llama-swap config supplies the list of model tags.
if [[ ! -f "$LLAMA_SWAP_CONFIG" ]]; then
  print_error "llama-swap config not found: $LLAMA_SWAP_CONFIG"
  exit 1
fi
# The python harness does the actual per-image testing.
if [[ ! -f "test_jersey_detection.py" ]]; then
  print_error "test_jersey_detection.py not found in current directory"
  exit 1
fi
# Probe the health endpoint so we fail fast with a helpful hint instead of
# timing out on the first model request.
print_info "Checking if llama-swap server is running at $SERVER_URL..."
if ! curl -s "$SERVER_URL/health" > /dev/null 2>&1; then
  print_error "Cannot connect to llama-swap at $SERVER_URL"
  echo ""
  echo "Please start llama-swap first:"
  echo " llama-swap --config $LLAMA_SWAP_CONFIG --listen localhost:8080"
  echo ""
  exit 1
fi
print_success "Server is running"
# ==============================================================================
# Extract model tags from YAML
# ==============================================================================
print_info "Extracting model tags from $LLAMA_SWAP_CONFIG..."
# Extract model IDs: the keys nested exactly one indent level (two spaces)
# under the top-level 'models:' section.  This is a minimal single-pass awk
# "parser" that only understands our config's layout; use yq for anything
# fancier.
#
# Fixes the previous grep/sed pipeline, which matched indented lines and then
# piped them through `grep -v " "` -- discarding every line it had just
# matched, so MODEL_TAGS was always empty.  It also never scoped the search
# to the 'models:' section despite claiming to.
MODEL_TAGS=$(awk '
  /^models:/       { m = 1; next }   # enter the models: section
  /^[^[:space:]#]/ { m = 0 }         # any other top-level key ends it
  m && /^  [A-Za-z0-9_.-]+:/ {       # key exactly one indent level deep
    key = $1
    sub(/:.*$/, "", key)
    print key
  }
' "$LLAMA_SWAP_CONFIG")
if [ -z "$MODEL_TAGS" ]; then
  print_error "No model tags found in $LLAMA_SWAP_CONFIG"
  exit 1
fi
# Convert the newline-separated list into a bash array.
readarray -t MODELS <<< "$MODEL_TAGS"
MODEL_COUNT=${#MODELS[@]}
print_success "Found $MODEL_COUNT models to test"
# ==============================================================================
# Show the effective configuration and model list, then ask before proceeding
# ==============================================================================
echo ""
print_info "Test Configuration:"
printf '%s\n' " Images directory: $IMAGES_DIR"
printf '%s\n' " Prompt file: $PROMPT_FILE"
printf '%s\n' " Resize: ${RESIZE:-Disabled}"
printf '%s\n' " Output file: $OUTPUT_FILE"
printf '%s\n' " Server URL: $SERVER_URL"
printf '%s\n' " Models to test: $MODEL_COUNT"
echo ""
# Enumerate the models, 1-based, one per line.
print_info "Models:"
n=1
for model in "${MODELS[@]}"; do
  printf ' %d. %s\n' "$n" "$model"
  n=$((n + 1))
done
echo ""
# ==============================================================================
# Confirmation
# ==============================================================================
# Single-keystroke prompt; anything other than y/Y aborts cleanly.
read -p "Continue with testing? (y/N) " -n 1 -r
echo
case "$REPLY" in
  [Yy]) ;;
  *)
    print_warning "Testing cancelled"
    exit 0
    ;;
esac
# ==============================================================================
# Run Tests
# ==============================================================================
print_header "Starting Tests"
START_TIME=$(date +%s)
SUCCESSFUL=0
FAILED=0
for i in "${!MODELS[@]}"; do
  MODEL="${MODELS[$i]}"
  MODEL_NUM=$((i + 1))
  echo ""
  print_header "Testing Model $MODEL_NUM/$MODEL_COUNT: $MODEL"
  # Build the command as an array so arguments containing spaces or shell
  # metacharacters are passed through verbatim.  (The previous version
  # concatenated a string and ran it through 'eval', which breaks on such
  # arguments and is an injection hazard.)
  CMD=(python test_jersey_detection.py "$IMAGES_DIR" "$PROMPT_FILE"
       --model-tag "$MODEL"
       --output-file "$OUTPUT_FILE"
       --server-url "$SERVER_URL")
  # Add resize if configured
  if [ -n "$RESIZE" ]; then
    CMD+=(--resize "$RESIZE")
  fi
  print_info "Running: ${CMD[*]}"
  echo ""
  # Run the test; count outcomes for the summary.
  if "${CMD[@]}"; then
    print_success "Model $MODEL completed successfully"
    SUCCESSFUL=$((SUCCESSFUL + 1))
  else
    print_error "Model $MODEL failed"
    FAILED=$((FAILED + 1))
    # Give the user the chance to bail out after a failure.
    echo ""
    read -p "Continue with remaining models? (Y/n) " -n 1 -r
    echo
    if [[ $REPLY =~ ^[Nn]$ ]]; then
      print_warning "Testing stopped by user"
      break
    fi
  fi
  # Show progress (skipped after the final model; the summary covers it)
  if [ "$MODEL_NUM" -lt "$MODEL_COUNT" ]; then
    print_info "Progress: $MODEL_NUM/$MODEL_COUNT models completed"
  fi
done
# ==============================================================================
# Summary
# ==============================================================================
END_TIME=$(date +%s)
DURATION=$((END_TIME - START_TIME))
MINUTES=$((DURATION / 60))
# Use SECS, not SECONDS: SECONDS is a special bash variable that resumes
# auto-incrementing every second after assignment, so its printed value
# could drift from the computed remainder.
SECS=$((DURATION % 60))
echo ""
print_header "Testing Complete"
echo ""
print_info "Summary:"
echo " Total models: $MODEL_COUNT"
echo " Successful: $SUCCESSFUL"
echo " Failed: $FAILED"
echo " Total time: ${MINUTES}m ${SECS}s"
echo ""
# Point the user at the results and the analyzer only if anything succeeded.
if [ "$SUCCESSFUL" -gt 0 ]; then
  print_success "Results saved to: $OUTPUT_FILE"
  echo ""
  print_info "Analyze results with:"
  echo " python analyze_jersey_results.py $OUTPUT_FILE"
fi
echo ""
# Exit with error code if any tests failed
if [ "$FAILED" -gt 0 ]; then
  exit 1
fi
exit 0