Initial commit: Jersey detection test suite
Test scripts and utilities for evaluating vision-language models on jersey number detection using llama.cpp server.
This commit is contained in:
263
test_all_models.sh
Executable file
263
test_all_models.sh
Executable file
@@ -0,0 +1,263 @@
#!/bin/bash
# ==============================================================================
# Test All Models Script for Jersey Detection
# ==============================================================================
# Runs the jersey detection test suite against every model defined in
# llama-swap-config.yaml, one model at a time.
#
# Usage:
#   ./test_all_models.sh
#   ./test_all_models.sh /path/to/images
#   RESIZE=2048 ./test_all_models.sh
#   OUTPUT_FILE=custom_results.jsonl ./test_all_models.sh
# ==============================================================================

# Note: 'set -e' is deliberately NOT used here. The test loop has explicit
# error handling and offers the user the choice to continue after a failure.

# ==============================================================================
# Configuration Variables
# ==============================================================================
# Image directory containing test images (first positional argument)
IMAGES_DIR="${1:-./test_images}"

# Prompt file to use for testing
PROMPT_FILE="${PROMPT_FILE:-jersey_prompt_with_confidence.txt}"

# Resize images to this max dimension (set to empty string to disable)
RESIZE="${RESIZE:-1024}"

# Output file for results (JSON Lines)
OUTPUT_FILE="${OUTPUT_FILE:-jersey_detection_results.jsonl}"

# llama-swap configuration file listing the models to test
LLAMA_SWAP_CONFIG="${LLAMA_SWAP_CONFIG:-llama-swap-config.yaml}"

# Base URL of the running llama-swap server
SERVER_URL="${SERVER_URL:-http://localhost:8080}"

# ==============================================================================
# Color codes for output
# ==============================================================================
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
NC='\033[0m' # No Color
# ==============================================================================
# Helper Functions
# ==============================================================================

# Print a banner: a title line framed by separator rules.
print_header() {
  printf '%b\n' "${CYAN}============================================================================${NC}"
  printf '%b\n' "${CYAN}$1${NC}"
  printf '%b\n' "${CYAN}============================================================================${NC}"
}

# Informational message (blue tag).
print_info() {
  printf '%b\n' "${BLUE}[INFO]${NC} $1"
}

# Success message (green tag).
print_success() {
  printf '%b\n' "${GREEN}[SUCCESS]${NC} $1"
}

# Error message (red tag).
print_error() {
  printf '%b\n' "${RED}[ERROR]${NC} $1"
}

# Warning message (yellow tag).
print_warning() {
  printf '%b\n' "${YELLOW}[WARNING]${NC} $1"
}
# ==============================================================================
# Validation
# ==============================================================================

print_header "Jersey Detection - Test All Models"

# Bail out early when any required input is missing.
if [ ! -d "$IMAGES_DIR" ]; then
  print_error "Image directory not found: $IMAGES_DIR"
  echo "Usage: $0 <image_directory>"
  exit 1
fi

if [ ! -f "$PROMPT_FILE" ]; then
  print_error "Prompt file not found: $PROMPT_FILE"
  exit 1
fi

if [ ! -f "$LLAMA_SWAP_CONFIG" ]; then
  print_error "llama-swap config not found: $LLAMA_SWAP_CONFIG"
  exit 1
fi

if [ ! -f "test_jersey_detection.py" ]; then
  print_error "test_jersey_detection.py not found in current directory"
  exit 1
fi

# The server must already be up; we only probe its health endpoint.
print_info "Checking if llama-swap server is running at $SERVER_URL..."
if ! curl -s "$SERVER_URL/health" > /dev/null 2>&1; then
  print_error "Cannot connect to llama-swap at $SERVER_URL"
  echo ""
  echo "Please start llama-swap first:"
  echo " llama-swap --config $LLAMA_SWAP_CONFIG --listen localhost:8080"
  echo ""
  exit 1
fi
print_success "Server is running"
# ==============================================================================
# Extract model tags from YAML
# ==============================================================================

print_info "Extracting model tags from $LLAMA_SWAP_CONFIG..."

# Extract model IDs: the two-space-indented keys inside the 'models:' section.
# Scoping to that section (instead of grepping the whole file) avoids picking
# up similarly indented keys from other top-level sections (groups, macros...).
# Simple parser that works for our config format; use yq for anything fancier.
MODEL_TAGS=$(awk '
  /^models:/ { in_models = 1; next }      # enter the models: section
  /^[^[:space:]]/ { in_models = 0 }       # any other top-level key ends it
  in_models && /^  [^[:space:]#]/ {       # two-space-indented model id
    sub(/:.*/, "")                        # drop the colon and everything after
    gsub(/[[:space:]"]/, "")              # strip indentation and quotes
    print
  }
' "$LLAMA_SWAP_CONFIG")

if [ -z "$MODEL_TAGS" ]; then
  print_error "No model tags found in $LLAMA_SWAP_CONFIG"
  exit 1
fi

# Convert to array (one model id per line)
readarray -t MODELS <<< "$MODEL_TAGS"

MODEL_COUNT=${#MODELS[@]}
print_success "Found $MODEL_COUNT models to test"
# ==============================================================================
# Display Configuration
# ==============================================================================

echo ""
print_info "Test Configuration:"
echo " Images directory: $IMAGES_DIR"
echo " Prompt file: $PROMPT_FILE"
echo " Resize: ${RESIZE:-Disabled}"
echo " Output file: $OUTPUT_FILE"
echo " Server URL: $SERVER_URL"
echo " Models to test: $MODEL_COUNT"
echo ""

# List every model, numbered from 1.
print_info "Models:"
n=1
for model in "${MODELS[@]}"; do
  echo " ${n}. ${model}"
  n=$((n + 1))
done
echo ""
# ==============================================================================
# Confirmation
# ==============================================================================

# Single-keystroke confirmation; anything but y/Y aborts (default: No).
read -p "Continue with testing? (y/N) " -n 1 -r
echo
case "$REPLY" in
  [Yy]) ;; # proceed with testing
  *)
    print_warning "Testing cancelled"
    exit 0
    ;;
esac
# ==============================================================================
# Run Tests
# ==============================================================================

print_header "Starting Tests"

START_TIME=$(date +%s)
SUCCESSFUL=0
FAILED=0

for i in "${!MODELS[@]}"; do
  MODEL="${MODELS[$i]}"
  MODEL_NUM=$((i+1))

  echo ""
  print_header "Testing Model $MODEL_NUM/$MODEL_COUNT: $MODEL"

  # Build the command as an array: arguments stay correctly quoted even when
  # paths or model tags contain spaces, and no fragile 'eval' is needed.
  CMD=(python test_jersey_detection.py "$IMAGES_DIR" "$PROMPT_FILE"
       --model-tag "$MODEL"
       --output-file "$OUTPUT_FILE"
       --server-url "$SERVER_URL")

  # Add resize if configured
  if [ -n "$RESIZE" ]; then
    CMD+=(--resize "$RESIZE")
  fi

  print_info "Running: ${CMD[*]}"
  echo ""

  # Run the test; the python script's exit status decides success/failure.
  if "${CMD[@]}"; then
    print_success "Model $MODEL completed successfully"
    SUCCESSFUL=$((SUCCESSFUL + 1))
  else
    print_error "Model $MODEL failed"
    FAILED=$((FAILED + 1))

    # Ask if user wants to continue with the remaining models.
    echo ""
    read -p "Continue with remaining models? (Y/n) " -n 1 -r
    echo
    if [[ $REPLY =~ ^[Nn]$ ]]; then
      print_warning "Testing stopped by user"
      break
    fi
  fi

  # Show progress after each model except the last.
  if [ "$MODEL_NUM" -lt "$MODEL_COUNT" ]; then
    print_info "Progress: $MODEL_NUM/$MODEL_COUNT models completed"
  fi
done
# ==============================================================================
# Summary
# ==============================================================================

END_TIME=$(date +%s)
DURATION=$((END_TIME - START_TIME))
MINUTES=$((DURATION / 60))
# Named SECS on purpose: SECONDS is a special bash variable that keeps
# counting up after assignment, so using it here would be a latent bug.
SECS=$((DURATION % 60))

echo ""
print_header "Testing Complete"
echo ""
print_info "Summary:"
echo " Total models: $MODEL_COUNT"
echo " Successful: $SUCCESSFUL"
echo " Failed: $FAILED"
echo " Total time: ${MINUTES}m ${SECS}s"
echo ""

if [ "$SUCCESSFUL" -gt 0 ]; then
  print_success "Results saved to: $OUTPUT_FILE"
  echo ""
  print_info "Analyze results with:"
  echo " python analyze_jersey_results.py $OUTPUT_FILE"
fi

echo ""

# Exit with error code if any tests failed
if [ "$FAILED" -gt 0 ]; then
  exit 1
fi

exit 0
Reference in New Issue
Block a user