jersey_test/test_all_models.sh

#!/bin/bash
# ==============================================================================
# Test All Models Script for Jersey Detection
# ==============================================================================
# This script automatically tests all models defined in llama-swap-config.yaml
# with the jersey detection test suite.
#
# Usage:
#   ./test_all_models.sh
#   ./test_all_models.sh /path/to/images
#   RESIZE=2048 ./test_all_models.sh
#   OUTPUT_FILE=custom_results.jsonl ./test_all_models.sh
# ==============================================================================

# Note: We don't use 'set -e' here because we have explicit error handling
# in the test loop and want to give the user the option to continue on failures

# ==============================================================================
# Configuration Variables
# ==============================================================================

# Image directory containing test images (first positional argument)
IMAGES_DIR="${1:-./test_images}"

# Prompt file to use for testing
PROMPT_FILE="${PROMPT_FILE:-jersey_prompt_with_confidence.txt}"

# Resize images to this max dimension (set to empty string to disable).
# Uses `-` rather than `:-` on purpose: `:-` would also replace an explicitly
# empty RESIZE with 1024, which made it impossible to disable resizing.
RESIZE="${RESIZE-1024}"

# Output file for results
OUTPUT_FILE="${OUTPUT_FILE:-jersey_detection_results.jsonl}"

# llama-swap configuration file
LLAMA_SWAP_CONFIG="${LLAMA_SWAP_CONFIG:-llama-swap-config.yaml}"

# Server URL
SERVER_URL="${SERVER_URL:-http://localhost:8080}"

# ==============================================================================
# Color codes for output
# ==============================================================================
# ANSI colour sequences, stored as literal backslash escapes. They only become
# real control characters when a helper prints them through `echo -e`.
NC="\033[0m"          # No Color (reset)
RED="\033[0;31m"
GREEN="\033[0;32m"
YELLOW="\033[1;33m"
BLUE="\033[0;34m"
CYAN="\033[0;36m"

# ==============================================================================
# Helper Functions
# ==============================================================================

# Print a three-line banner on stdout: the title ($1) framed above and below
# by a rule of '=' characters, every line wrapped in ${CYAN}...${NC}.
# Backslash escapes in the colour codes and in the title are expanded.
print_header() {
    local rule='============================================================================'
    printf '%b\n' "${CYAN}${rule}${NC}"
    printf '%b\n' "${CYAN}$1${NC}"
    printf '%b\n' "${CYAN}${rule}${NC}"
}

# Print an informational message ($1) on stdout, prefixed with a blue [INFO]
# tag. Backslash escapes in the message are expanded.
print_info() {
    local message=$1
    printf '%b\n' "${BLUE}[INFO]${NC} ${message}"
}

# Print a success message ($1) on stdout, prefixed with a green [SUCCESS] tag.
# Backslash escapes in the message are expanded.
print_success() {
    local message=$1
    printf '%b\n' "${GREEN}[SUCCESS]${NC} ${message}"
}

# Print an error message ($1), prefixed with a red [ERROR] tag. Output goes to
# stdout, like the other helpers. Backslash escapes in the message are expanded.
print_error() {
    local message=$1
    printf '%b\n' "${RED}[ERROR]${NC} ${message}"
}

# Print a warning message ($1) on stdout, prefixed with a yellow [WARNING] tag.
# Backslash escapes in the message are expanded.
print_warning() {
    local message=$1
    printf '%b\n' "${YELLOW}[WARNING]${NC} ${message}"
}

# ==============================================================================
# Validation
# ==============================================================================

print_header "Jersey Detection - Test All Models"

# Fail fast: everything the test loop needs is verified up front. Each failed
# check prints an error and exits with status 1.

# Check if images directory exists
if [[ ! -d "$IMAGES_DIR" ]]; then
    print_error "Image directory not found: $IMAGES_DIR"
    echo "Usage: $0 <image_directory>"
    exit 1
fi

# Check if prompt file exists
if [[ ! -f "$PROMPT_FILE" ]]; then
    print_error "Prompt file not found: $PROMPT_FILE"
    exit 1
fi

# Check if llama-swap config exists
if [[ ! -f "$LLAMA_SWAP_CONFIG" ]]; then
    print_error "llama-swap config not found: $LLAMA_SWAP_CONFIG"
    exit 1
fi

# Check if test script exists
if [[ ! -f "test_jersey_detection.py" ]]; then
    print_error "test_jersey_detection.py not found in current directory"
    exit 1
fi

# curl is needed for the health probe below; without this check a missing
# binary would be misreported as "Cannot connect".
if ! command -v curl > /dev/null 2>&1; then
    print_error "curl is required but was not found in PATH"
    exit 1
fi

# Check if server is running.
#   -f                 treat HTTP 4xx/5xx as failure instead of "server is up"
#   --connect-timeout  give up quickly when nothing is listening
#   --max-time         bound the whole probe so a hung endpoint cannot stall us
print_info "Checking if llama-swap server is running at $SERVER_URL..."
if ! curl -sf --connect-timeout 5 --max-time 15 "$SERVER_URL/health" > /dev/null 2>&1; then
    # Derive host:port from SERVER_URL so the hint matches a customised URL
    # (strip the scheme, then anything from the first '/').
    LISTEN_ADDR="${SERVER_URL#*://}"
    LISTEN_ADDR="${LISTEN_ADDR%%/*}"

    print_error "Cannot connect to llama-swap at $SERVER_URL"
    echo ""
    echo "Please start llama-swap first:"
    echo "  llama-swap --config $LLAMA_SWAP_CONFIG --listen $LISTEN_ADDR"
    echo ""
    exit 1
fi
print_success "Server is running"

# ==============================================================================
# Extract model tags from YAML
# ==============================================================================

print_info "Extracting model tags from $LLAMA_SWAP_CONFIG..."

# Extract model IDs: the keys indented by exactly two spaces directly under the
# top-level 'models:' mapping. This is a deliberately small line-based parser,
# not a YAML implementation; it assumes two-space indentation for model keys.
# Restricting the scan to the 'models:' section keeps two-space keys from other
# top-level sections out of the list. IDs may start with any character and may
# be quoted (e.g. IDs that contain a colon).
MODEL_TAGS=$(awk '
    { sub(/\r$/, "") }                          # tolerate CRLF line endings
    /^[^ \t#]/ {                                # a top-level key starts a new section
        in_models = ($0 ~ /^models:[ \t]*(#.*)?$/)
        next
    }
    in_models && /^  [^ \t#]/ {                 # two-space key inside models:
        key = substr($0, 3)
        if (match(key, /^"[^"]+"/) || match(key, /^\047[^\047]+\047/)) {
            key = substr(key, 2, RLENGTH - 2)   # quoted ID: text between the quotes
        } else {
            sub(/[ \t]*:.*$/, "", key)          # plain ID: drop the colon and the rest
        }
        print key
    }
' "$LLAMA_SWAP_CONFIG")

if [[ -z "$MODEL_TAGS" ]]; then
    print_error "No model tags found in $LLAMA_SWAP_CONFIG"
    exit 1
fi

# Convert to array (one model ID per line)
readarray -t MODELS <<< "$MODEL_TAGS"

MODEL_COUNT=${#MODELS[@]}
print_success "Found $MODEL_COUNT models to test"

# ==============================================================================
# Display Configuration
# ==============================================================================

# Show the effective settings so the user can review them before confirming.
echo ""
print_info "Test Configuration:"
printf '  Images directory:  %s\n' "$IMAGES_DIR"
printf '  Prompt file:       %s\n' "$PROMPT_FILE"
printf '  Resize:            %s\n' "${RESIZE:-Disabled}"
printf '  Output file:       %s\n' "$OUTPUT_FILE"
printf '  Server URL:        %s\n' "$SERVER_URL"
printf '  Models to test:    %s\n' "$MODEL_COUNT"
echo ""

# Numbered (1-based) list of every model that will be tested
print_info "Models:"
for (( i = 0; i < ${#MODELS[@]}; i++ )); do
    printf '  %d. %s\n' "$((i + 1))" "${MODELS[$i]}"
done
echo ""

# ==============================================================================
# Confirmation
# ==============================================================================

# Ask for a single keypress; anything other than y/Y (including plain Enter)
# cancels the run with exit status 0.
read -p "Continue with testing? (y/N) " -n 1 -r
echo
case "$REPLY" in
    [Yy])
        ;;
    *)
        print_warning "Testing cancelled"
        exit 0
        ;;
esac

# ==============================================================================
# Run Tests
# ==============================================================================

print_header "Starting Tests"

START_TIME=$(date +%s)
SUCCESSFUL=0
FAILED=0

# Run the Python test suite once per model, counting successes and failures.
# After a failure the user chooses whether to continue with the next model.
for i in "${!MODELS[@]}"; do
    MODEL="${MODELS[$i]}"
    MODEL_NUM=$((i+1))

    echo ""
    print_header "Testing Model $MODEL_NUM/$MODEL_COUNT: $MODEL"

    # Build the command as an argument array rather than a string passed to
    # eval: every value stays exactly one argument, so paths or model IDs
    # containing spaces, quotes, '$' or backticks are never re-parsed by the
    # shell.
    CMD=(python test_jersey_detection.py "$IMAGES_DIR" "$PROMPT_FILE"
         --model-tag "$MODEL"
         --output-file "$OUTPUT_FILE"
         --server-url "$SERVER_URL")

    # Add resize if configured
    if [[ -n "$RESIZE" ]]; then
        CMD+=(--resize "$RESIZE")
    fi

    # %q renders each argument shell-quoted so the logged line can be pasted
    # back into a terminal.
    printf -v CMD_DISPLAY '%q ' "${CMD[@]}"
    print_info "Running: ${CMD_DISPLAY% }"
    echo ""

    # Run the test
    if "${CMD[@]}"; then
        print_success "Model $MODEL completed successfully"
        SUCCESSFUL=$((SUCCESSFUL + 1))
    else
        print_error "Model $MODEL failed"
        FAILED=$((FAILED + 1))

        # Ask if user wants to continue (default: yes; only n/N stops)
        echo ""
        read -p "Continue with remaining models? (Y/n) " -n 1 -r
        echo
        if [[ $REPLY =~ ^[Nn]$ ]]; then
            print_warning "Testing stopped by user"
            break
        fi
    fi

    # Show progress (skipped after the final model; the summary follows)
    if (( MODEL_NUM < MODEL_COUNT )); then
        print_info "Progress: $MODEL_NUM/$MODEL_COUNT models completed"
    fi
done

# ==============================================================================
# Summary
# ==============================================================================

# Wall-clock duration of the test loop, split into minutes and seconds.
END_TIME=$(date +%s)
DURATION=$((END_TIME - START_TIME))
MINUTES=$((DURATION / 60))
# Deliberately not named SECONDS: that is a Bash special variable which keeps
# counting up after assignment, so it would not hold a fixed remainder.
REMAINDER_SECS=$((DURATION % 60))

echo ""
print_header "Testing Complete"
echo ""
print_info "Summary:"
echo "  Total models:      $MODEL_COUNT"
echo "  Successful:        $SUCCESSFUL"
echo "  Failed:            $FAILED"
echo "  Total time:        ${MINUTES}m ${REMAINDER_SECS}s"
echo ""

# Point at the results file only if at least one model produced results
if (( SUCCESSFUL > 0 )); then
    print_success "Results saved to: $OUTPUT_FILE"
    echo ""
    print_info "Analyze results with:"
    echo "  python analyze_jersey_results.py $OUTPUT_FILE"
fi

echo ""

# Exit with error code if any tests failed
if (( FAILED > 0 )); then
    exit 1
fi

exit 0