#!/bin/bash
# ==============================================================================
# Test All Models Script for Jersey Detection
# ==============================================================================
# Part of the test scripts and utilities for evaluating vision-language models
# on jersey number detection using llama.cpp server.
#
# This script automatically tests all models defined in llama-swap-config.yaml
# with the jersey detection test suite.
#
# Usage:
#   ./test_all_models.sh
#   ./test_all_models.sh /path/to/images
#   RESIZE=2048 ./test_all_models.sh
#   OUTPUT_FILE=custom_results.jsonl ./test_all_models.sh
# ==============================================================================

# Note: We don't use 'set -e' here because we have explicit error handling
# in the test loop and want to give the user the option to continue on failures

# ==============================================================================
# Configuration Variables
# ==============================================================================
# Every setting except the image directory can be overridden from the
# environment; the image directory comes from the first positional argument.

# Image directory containing test images
IMAGES_DIR="${1:-./test_images}"

# Prompt file to use for testing
PROMPT_FILE="${PROMPT_FILE:-jersey_prompt_with_confidence.txt}"

# Resize images to this max dimension (set to empty string to disable).
# Uses ${VAR-default} rather than ${VAR:-default} so that an explicitly empty
# RESIZE is honoured instead of being replaced by the default.
RESIZE="${RESIZE-1024}"

# Output file for results
OUTPUT_FILE="${OUTPUT_FILE:-jersey_detection_results.jsonl}"

# llama-swap configuration file
LLAMA_SWAP_CONFIG="${LLAMA_SWAP_CONFIG:-llama-swap-config.yaml}"

# Server URL
SERVER_URL="${SERVER_URL:-http://localhost:8080}"

# ==============================================================================
# Color codes for output
# ==============================================================================
# Stored as backslash escape text (not raw ESC bytes); the print_* helpers are
# responsible for expanding them when writing to the terminal.
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly BLUE='\033[0;34m'
readonly CYAN='\033[0;36m'
readonly NC='\033[0m' # No Color

# ==============================================================================
# Helper Functions
# ==============================================================================

# Print a three-line cyan banner: a rule, the title, and a second rule.
# Globals:   CYAN, NC (read)
# Arguments: $1 - title text
# Outputs:   the banner on stdout
print_header() {
  local -r rule='============================================================================'
  # %b expands the escape sequences stored in the color variables; %s prints
  # the title verbatim so backslashes in it are not treated as escapes.
  printf '%b%s%b\n' "$CYAN" "$rule" "$NC"
  printf '%b%s%b\n' "$CYAN" "${1-}" "$NC"
  printf '%b%s%b\n' "$CYAN" "$rule" "$NC"
}

# Print an informational message prefixed with a blue "[INFO]" tag.
# Globals:   BLUE, NC (read)
# Arguments: $1 - message text
# Outputs:   one line on stdout
print_info() {
  # Only the color codes go through %b; the message is printed verbatim so
  # backslashes in paths or commands are not interpreted as escapes.
  printf '%b[INFO]%b %s\n' "$BLUE" "$NC" "${1-}"
}

# Print a success message prefixed with a green "[SUCCESS]" tag.
# Globals:   GREEN, NC (read)
# Arguments: $1 - message text
# Outputs:   one line on stdout
print_success() {
  # The message is passed through %s so it is printed exactly as given.
  printf '%b[SUCCESS]%b %s\n' "$GREEN" "$NC" "${1-}"
}

# Print an error message prefixed with a red "[ERROR]" tag.
# Globals:   RED, NC (read)
# Arguments: $1 - message text
# Outputs:   one line on stderr, so errors stay visible when stdout is
#            redirected to a file or pipe
print_error() {
  # The message is passed through %s so it is printed exactly as given.
  printf '%b[ERROR]%b %s\n' "$RED" "$NC" "${1-}" >&2
}

# Print a warning message prefixed with a yellow "[WARNING]" tag.
# Globals:   YELLOW, NC (read)
# Arguments: $1 - message text
# Outputs:   one line on stdout
print_warning() {
  # The message is passed through %s so it is printed exactly as given.
  printf '%b[WARNING]%b %s\n' "$YELLOW" "$NC" "${1-}"
}

# ==============================================================================
# Validation
# ==============================================================================
# Fail fast, before any model is loaded, if a required input is missing.

print_header "Jersey Detection - Test All Models"

# Check if images directory exists
if [[ ! -d "$IMAGES_DIR" ]]; then
  print_error "Image directory not found: $IMAGES_DIR"
  # The directory argument is optional (it has a default), hence the brackets.
  echo "Usage: $0 [image_directory]" >&2
  exit 1
fi

# Check if prompt file exists
if [[ ! -f "$PROMPT_FILE" ]]; then
  print_error "Prompt file not found: $PROMPT_FILE"
  exit 1
fi

# Check if llama-swap config exists
if [[ ! -f "$LLAMA_SWAP_CONFIG" ]]; then
  print_error "llama-swap config not found: $LLAMA_SWAP_CONFIG"
  exit 1
fi

# Check if test script exists
if [[ ! -f "test_jersey_detection.py" ]]; then
  print_error "test_jersey_detection.py not found in current directory"
  exit 1
fi

# The test loop invokes "python"; report its absence once, up front, instead of
# failing separately for every model.
if ! command -v python > /dev/null 2>&1; then
  print_error "python not found in PATH"
  exit 1
fi

# Check if server is running
print_info "Checking if llama-swap server is running at $SERVER_URL..."

# --fail makes HTTP error responses count as failures (plain -s exits 0 on
# them); --max-time keeps an unresponsive server from hanging the script.
if ! curl --silent --fail --max-time 10 "$SERVER_URL/health" > /dev/null 2>&1; then
  print_error "Cannot connect to llama-swap at $SERVER_URL"

  # Derive host:port from SERVER_URL so the hint matches the configured address:
  # strip the scheme, then anything from the first slash onwards.
  LISTEN_ADDR="${SERVER_URL#*://}"
  LISTEN_ADDR="${LISTEN_ADDR%%/*}"

  echo ""
  echo "Please start llama-swap first:"
  echo " llama-swap --config $LLAMA_SWAP_CONFIG --listen $LISTEN_ADDR"
  echo ""
  exit 1
fi
print_success "Server is running"

# ==============================================================================
# Extract model tags from YAML
# ==============================================================================

# Print the keys that are direct children of the top-level "models:" mapping,
# one per line.
# Arguments: $1 - path to the llama-swap YAML config
# Outputs:   model IDs on stdout, surrounding quotes removed
# Note: this is a minimal line-based reader for block-style YAML, not a full
# YAML parser.
extract_model_tags() {
  awk -v sq="'" '
    # Skip blank lines and full-line comments.
    /^[ \t]*(#.*)?$/ { next }

    # An unindented line starts a new top-level section.
    /^[^ \t]/ {
      in_models = ($0 ~ /^models:[ \t]*(#.*)?$/)
      child_indent = -1
      next
    }

    in_models {
      match($0, /^[ \t]+/)
      # The first indented line under models: fixes the child indent level.
      if (child_indent < 0) child_indent = RLENGTH
      # Deeper lines belong to a model definition, not to the key list.
      if (RLENGTH != child_indent) next

      key = substr($0, RLENGTH + 1)
      # A mapping key ends with a colon followed by whitespace or end of line.
      if (key !~ /:([ \t]|$)/) next
      sub(/:([ \t].*)?$/, "", key)

      # Remove one pair of matching surrounding quotes, if present.
      first = substr(key, 1, 1)
      last = substr(key, length(key), 1)
      if (length(key) >= 2 && first == last && (first == "\"" || first == sq)) {
        key = substr(key, 2, length(key) - 2)
      }

      if (key != "") print key
    }
  ' "$1"
}

print_info "Extracting model tags from $LLAMA_SWAP_CONFIG..."

# Extract model IDs (keys under 'models:')
MODEL_TAGS=$(extract_model_tags "$LLAMA_SWAP_CONFIG")

if [[ -z "$MODEL_TAGS" ]]; then
  print_error "No model tags found in $LLAMA_SWAP_CONFIG"
  exit 1
fi

# Convert to array
readarray -t MODELS <<< "$MODEL_TAGS"

MODEL_COUNT=${#MODELS[@]}
print_success "Found $MODEL_COUNT models to test"

# ==============================================================================
# Display Configuration
# ==============================================================================
# Show the effective settings and the model list so the user can review them
# before confirming the run.

echo ""
print_info "Test Configuration:"
echo " Images directory: $IMAGES_DIR"
echo " Prompt file: $PROMPT_FILE"
# An empty RESIZE means resizing is turned off.
echo " Resize: ${RESIZE:-Disabled}"
echo " Output file: $OUTPUT_FILE"
echo " Server URL: $SERVER_URL"
echo " Models to test: $MODEL_COUNT"
echo ""

# List all models, numbered from 1
print_info "Models:"
for i in "${!MODELS[@]}"; do
  echo " $((i + 1)). ${MODELS[$i]}"
done
echo ""

# ==============================================================================
# Confirmation
# ==============================================================================
# Ask for a single-key confirmation; anything other than y/Y cancels.

# If read fails (for example stdin is closed), clear REPLY so the run is
# treated as declined rather than acting on a stale value.
if ! read -p "Continue with testing? (y/N) " -n 1 -r; then
  REPLY=""
fi
echo
if [[ ! "$REPLY" =~ ^[Yy]$ ]]; then
  print_warning "Testing cancelled"
  exit 0
fi

# ==============================================================================
# Run Tests
# ==============================================================================
# Run test_jersey_detection.py once per model, counting successes and failures.
# After a failure the user is asked whether to continue with the rest.

print_header "Starting Tests"

START_TIME=$(date +%s)
SUCCESSFUL=0
FAILED=0

for i in "${!MODELS[@]}"; do
  MODEL="${MODELS[$i]}"
  MODEL_NUM=$((i + 1))

  echo ""
  print_header "Testing Model $MODEL_NUM/$MODEL_COUNT: $MODEL"

  # Build the command as an argument array and run it directly. This keeps
  # values containing spaces, quotes or shell metacharacters (paths, model
  # names read from the config) as single literal arguments, with no eval.
  CMD=(
    python test_jersey_detection.py "$IMAGES_DIR" "$PROMPT_FILE"
    --model-tag "$MODEL"
    --output-file "$OUTPUT_FILE"
    --server-url "$SERVER_URL"
  )

  # Add resize if configured
  if [[ -n "$RESIZE" ]]; then
    CMD+=(--resize "$RESIZE")
  fi

  # %q renders each argument shell-quoted so the logged line can be pasted
  # back into a shell.
  printf -v CMD_DISPLAY '%q ' "${CMD[@]}"
  print_info "Running: ${CMD_DISPLAY% }"
  echo ""

  # Run the test
  if "${CMD[@]}"; then
    print_success "Model $MODEL completed successfully"
    SUCCESSFUL=$((SUCCESSFUL + 1))
  else
    print_error "Model $MODEL failed"
    FAILED=$((FAILED + 1))

    # Ask if user wants to continue; only an explicit n/N stops the run.
    echo ""
    if ! read -p "Continue with remaining models? (Y/n) " -n 1 -r; then
      REPLY=""
    fi
    echo
    if [[ "$REPLY" =~ ^[Nn]$ ]]; then
      print_warning "Testing stopped by user"
      break
    fi
  fi

  # Show progress (skipped after the last model; the summary follows)
  if ((MODEL_NUM < MODEL_COUNT)); then
    print_info "Progress: $MODEL_NUM/$MODEL_COUNT models completed"
  fi
done

# ==============================================================================
# Summary
# ==============================================================================
# Report totals and elapsed time, then exit non-zero if any model failed.

END_TIME=$(date +%s)
DURATION=$((END_TIME - START_TIME))
MINUTES=$((DURATION / 60))
# Not named SECONDS: that is a special Bash variable that keeps counting
# after assignment, so it must not be used as ordinary storage.
REMAINDER_SECONDS=$((DURATION % 60))

echo ""
print_header "Testing Complete"
echo ""
print_info "Summary:"
echo " Total models: $MODEL_COUNT"
echo " Successful: $SUCCESSFUL"
echo " Failed: $FAILED"
echo " Total time: ${MINUTES}m ${REMAINDER_SECONDS}s"
echo ""

# Only point at the results file when at least one model succeeded.
if ((SUCCESSFUL > 0)); then
  print_success "Results saved to: $OUTPUT_FILE"
  echo ""
  print_info "Analyze results with:"
  echo " python analyze_jersey_results.py $OUTPUT_FILE"
fi

echo ""

# Exit with error code if any tests failed
if ((FAILED > 0)); then
  exit 1
fi

exit 0