Initial commit: Jersey detection test suite

Test scripts and utilities for evaluating vision-language models
on jersey number detection using llama.cpp server.
This commit is contained in:
2026-01-20 13:37:01 -07:00
commit 8706edcd13
14 changed files with 3080 additions and 0 deletions

263
test_all_models.sh Executable file
View File

@ -0,0 +1,263 @@
#!/bin/bash
# ==============================================================================
# Test All Models Script for Jersey Detection
# ==============================================================================
# This script automatically tests all models defined in llama-swap-config.yaml
# with the jersey detection test suite.
#
# Usage:
# ./test_all_models.sh
# ./test_all_models.sh /path/to/images
# RESIZE=2048 ./test_all_models.sh
# OUTPUT_FILE=custom_results.jsonl ./test_all_models.sh
# ==============================================================================
# Note: We don't use 'set -e' here because we have explicit error handling
# in the test loop and want to give the user the option to continue on failures
# ==============================================================================
# Configuration Variables
# (every setting below can be overridden via the environment; IMAGES_DIR can
# also be supplied as the first positional argument, which takes precedence)
# ==============================================================================
# Image directory containing test images (argv[1] wins over the default)
IMAGES_DIR="${1:-./test_images}"
# Prompt file to use for testing
PROMPT_FILE="${PROMPT_FILE:-jersey_prompt_with_confidence.txt}"
# Resize images to this max dimension in pixels (set to empty string to disable)
RESIZE="${RESIZE:-1024}"
# Output file for results (JSON Lines, appended to by the python test script)
OUTPUT_FILE="${OUTPUT_FILE:-jersey_detection_results.jsonl}"
# llama-swap configuration file (source of the model tags to iterate over)
LLAMA_SWAP_CONFIG="${LLAMA_SWAP_CONFIG:-llama-swap-config.yaml}"
# Server URL
SERVER_URL="${SERVER_URL:-http://localhost:8080}"
# ==============================================================================
# Color codes for output (ANSI escapes consumed by the print_* helpers below)
# ==============================================================================
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
NC='\033[0m' # No Color
# ==============================================================================
# Helper Functions
# ==============================================================================
# Print a banner: a title line framed above and below by separator rules.
# Arguments: $1 - banner text
print_header() {
  local rule="============================================================================"
  printf '%b\n' "${CYAN}${rule}${NC}"
  printf '%b\n' "${CYAN}$1${NC}"
  printf '%b\n' "${CYAN}${rule}${NC}"
}
# Print an informational status line.
print_info() {
  printf '%b\n' "${BLUE}[INFO]${NC} $1"
}
# Print a success status line.
print_success() {
  printf '%b\n' "${GREEN}[SUCCESS]${NC} $1"
}
# Print an error status line (stdout, matching the rest of the script).
print_error() {
  printf '%b\n' "${RED}[ERROR]${NC} $1"
}
# Print a warning status line.
print_warning() {
  printf '%b\n' "${YELLOW}[WARNING]${NC} $1"
}
# ==============================================================================
# Preflight validation: required paths, the test harness, and server reachability
# ==============================================================================
print_header "Jersey Detection - Test All Models"
# The image directory must exist before anything else happens.
if [[ ! -d "$IMAGES_DIR" ]]; then
  print_error "Image directory not found: $IMAGES_DIR"
  echo "Usage: $0 <image_directory>"
  exit 1
fi
# The prompt file is sent with every model request.
if [[ ! -f "$PROMPT_FILE" ]]; then
  print_error "Prompt file not found: $PROMPT_FILE"
  exit 1
fi
# The llama-swap config supplies the list of model tags.
if [[ ! -f "$LLAMA_SWAP_CONFIG" ]]; then
  print_error "llama-swap config not found: $LLAMA_SWAP_CONFIG"
  exit 1
fi
# The python harness does the actual per-image testing.
if [[ ! -f "test_jersey_detection.py" ]]; then
  print_error "test_jersey_detection.py not found in current directory"
  exit 1
fi
# Probe the health endpoint so we fail fast with a helpful hint instead of
# timing out on the first model request.
print_info "Checking if llama-swap server is running at $SERVER_URL..."
if ! curl -s "$SERVER_URL/health" > /dev/null 2>&1; then
  print_error "Cannot connect to llama-swap at $SERVER_URL"
  echo ""
  echo "Please start llama-swap first:"
  echo " llama-swap --config $LLAMA_SWAP_CONFIG --listen localhost:8080"
  echo ""
  exit 1
fi
print_success "Server is running"
# ==============================================================================
# Extract model tags from YAML
# ==============================================================================
print_info "Extracting model tags from $LLAMA_SWAP_CONFIG..."
# Extract model IDs: the keys nested exactly one indent level (two spaces)
# under the top-level 'models:' section.  This is a minimal single-pass awk
# "parser" that only understands our config's layout; use yq for anything
# fancier.
#
# Fixes the previous grep/sed pipeline, which matched indented lines and then
# piped them through `grep -v " "` -- discarding every line it had just
# matched, so MODEL_TAGS was always empty.  It also never scoped the search
# to the 'models:' section despite claiming to.
MODEL_TAGS=$(awk '
  /^models:/       { m = 1; next }   # enter the models: section
  /^[^[:space:]#]/ { m = 0 }         # any other top-level key ends it
  m && /^  [A-Za-z0-9_.-]+:/ {       # key exactly one indent level deep
    key = $1
    sub(/:.*$/, "", key)
    print key
  }
' "$LLAMA_SWAP_CONFIG")
if [ -z "$MODEL_TAGS" ]; then
  print_error "No model tags found in $LLAMA_SWAP_CONFIG"
  exit 1
fi
# Convert the newline-separated list into a bash array.
readarray -t MODELS <<< "$MODEL_TAGS"
MODEL_COUNT=${#MODELS[@]}
print_success "Found $MODEL_COUNT models to test"
# ==============================================================================
# Show the effective configuration and model list, then ask before proceeding
# ==============================================================================
echo ""
print_info "Test Configuration:"
printf '%s\n' " Images directory: $IMAGES_DIR"
printf '%s\n' " Prompt file: $PROMPT_FILE"
printf '%s\n' " Resize: ${RESIZE:-Disabled}"
printf '%s\n' " Output file: $OUTPUT_FILE"
printf '%s\n' " Server URL: $SERVER_URL"
printf '%s\n' " Models to test: $MODEL_COUNT"
echo ""
# Enumerate the models, 1-based, one per line.
print_info "Models:"
n=1
for model in "${MODELS[@]}"; do
  printf ' %d. %s\n' "$n" "$model"
  n=$((n + 1))
done
echo ""
# ==============================================================================
# Confirmation
# ==============================================================================
# Single-keystroke prompt; anything other than y/Y aborts cleanly.
read -p "Continue with testing? (y/N) " -n 1 -r
echo
case "$REPLY" in
  [Yy]) ;;
  *)
    print_warning "Testing cancelled"
    exit 0
    ;;
esac
# ==============================================================================
# Run Tests
# ==============================================================================
print_header "Starting Tests"
START_TIME=$(date +%s)
SUCCESSFUL=0
FAILED=0
for i in "${!MODELS[@]}"; do
  MODEL="${MODELS[$i]}"
  MODEL_NUM=$((i + 1))
  echo ""
  print_header "Testing Model $MODEL_NUM/$MODEL_COUNT: $MODEL"
  # Build the command as an array so arguments containing spaces or shell
  # metacharacters are passed through verbatim.  (The previous version
  # concatenated a string and ran it through 'eval', which breaks on such
  # arguments and is an injection hazard.)
  CMD=(python test_jersey_detection.py "$IMAGES_DIR" "$PROMPT_FILE"
       --model-tag "$MODEL"
       --output-file "$OUTPUT_FILE"
       --server-url "$SERVER_URL")
  # Add resize if configured
  if [ -n "$RESIZE" ]; then
    CMD+=(--resize "$RESIZE")
  fi
  print_info "Running: ${CMD[*]}"
  echo ""
  # Run the test; count outcomes for the summary.
  if "${CMD[@]}"; then
    print_success "Model $MODEL completed successfully"
    SUCCESSFUL=$((SUCCESSFUL + 1))
  else
    print_error "Model $MODEL failed"
    FAILED=$((FAILED + 1))
    # Give the user the chance to bail out after a failure.
    echo ""
    read -p "Continue with remaining models? (Y/n) " -n 1 -r
    echo
    if [[ $REPLY =~ ^[Nn]$ ]]; then
      print_warning "Testing stopped by user"
      break
    fi
  fi
  # Show progress (skipped after the final model; the summary covers it)
  if [ "$MODEL_NUM" -lt "$MODEL_COUNT" ]; then
    print_info "Progress: $MODEL_NUM/$MODEL_COUNT models completed"
  fi
done
# ==============================================================================
# Summary
# ==============================================================================
END_TIME=$(date +%s)
DURATION=$((END_TIME - START_TIME))
MINUTES=$((DURATION / 60))
# Use SECS, not SECONDS: SECONDS is a special bash variable that resumes
# auto-incrementing every second after assignment, so its printed value
# could drift from the computed remainder.
SECS=$((DURATION % 60))
echo ""
print_header "Testing Complete"
echo ""
print_info "Summary:"
echo " Total models: $MODEL_COUNT"
echo " Successful: $SUCCESSFUL"
echo " Failed: $FAILED"
echo " Total time: ${MINUTES}m ${SECS}s"
echo ""
# Point the user at the results and the analyzer only if anything succeeded.
if [ "$SUCCESSFUL" -gt 0 ]; then
  print_success "Results saved to: $OUTPUT_FILE"
  echo ""
  print_info "Analyze results with:"
  echo " python analyze_jersey_results.py $OUTPUT_FILE"
fi
echo ""
# Exit with error code if any tests failed
if [ "$FAILED" -gt 0 ]; then
  exit 1
fi
exit 0