"""Test scripts and utilities for evaluating vision-language models on jersey number detection using a llama.cpp server."""
import base64
import json
import os
from typing import List, Dict, Any, Optional, Union

import cv2
import numpy as np
import requests
|
|
|
|
|
|
class LlamaCppClient:
    """A Python client for interacting with a llama.cpp server.

    Wraps the server's health and model-listing endpoints, the
    OpenAI-compatible chat completion endpoint, and the native
    ``/completion`` endpoint, plus helpers for building multimodal
    (text + image) chat messages from file paths or OpenCV images.
    """

    def __init__(self, base_url: str = "http://192.168.1.34:8080", timeout: float = 120.0):
        """
        Initialize the client with the base URL of the llama.cpp server.

        Args:
            base_url: The base URL of the llama.cpp server (default: http://192.168.1.34:8080)
            timeout: Per-request timeout in seconds applied to every HTTP
                call (default: 120.0). Without a timeout a hung server
                would block the caller indefinitely.
        """
        self.base_url = base_url.rstrip('/')
        self.timeout = timeout

    def health_check(self) -> Dict[str, Any]:
        """
        Check the health status of the server.

        Returns:
            Health status response from the server

        Raises:
            requests.HTTPError: If the server returns a non-2xx status.
        """
        response = requests.get(f"{self.base_url}/health", timeout=self.timeout)
        response.raise_for_status()
        return response.json()

    def get_models(self) -> Dict[str, Any]:
        """
        Get information about loaded models.

        Returns:
            Model information from the server

        Raises:
            requests.HTTPError: If the server returns a non-2xx status.
        """
        response = requests.get(f"{self.base_url}/v1/models", timeout=self.timeout)
        response.raise_for_status()
        return response.json()

    def chat_completion(
        self,
        messages: List[Dict[str, Any]],
        temperature: float = 0.1,
        min_p: float = 0.15,
        repetition_penalty: float = 1.05,
        min_image_tokens: int = 64,
        max_image_tokens: int = 256,
        do_image_splitting: bool = True,
        max_tokens: int = -1,
        stream: bool = False,
        **kwargs
    ) -> Union[Dict[str, Any], "requests.Response"]:
        """
        Generate a chat completion using the OpenAI-compatible API.

        Args:
            messages: List of message dictionaries with role and content
            temperature: Sampling temperature (default: 0.1)
            min_p: Minimum probability for sampling (default: 0.15)
            repetition_penalty: Repetition penalty factor (default: 1.05)
            min_image_tokens: Minimum image tokens (default: 64)
            max_image_tokens: Maximum image tokens (default: 256)
            do_image_splitting: Whether to split images (default: True)
            max_tokens: Maximum tokens to generate (default: -1 for infinity)
            stream: Whether to stream the response (default: False)
            **kwargs: Additional parameters for the completion

        Returns:
            Parsed JSON completion response, or the raw streaming
            ``requests.Response`` when ``stream=True``.

        Raises:
            requests.HTTPError: If the server returns a non-2xx status.
        """
        # NOTE(review): `completion()` sends this value as "repeat_penalty"
        # (llama.cpp's native name) while this endpoint sends
        # "repetition_penalty" — confirm against the server version that the
        # OAI-compatible endpoint recognizes this key.
        payload = {
            "messages": messages,
            "temperature": temperature,
            "min_p": min_p,
            "repetition_penalty": repetition_penalty,
            "min_image_tokens": min_image_tokens,
            "max_image_tokens": max_image_tokens,
            "do_image_splitting": do_image_splitting,
            "max_tokens": max_tokens,
            # Reuse the cached prompt prefix across calls to cut latency.
            "cache_prompt": True,
            "stream": stream,
            **kwargs
        }

        # Debug aid for llama-swap setups: show which model was requested
        # (only when a model was passed via kwargs and the env flag is set).
        if payload.get('model') and os.environ.get('DEBUG_LLAMA_SWAP'):
            print(f"[DEBUG] Requesting model: {payload['model']}")

        response = requests.post(
            f"{self.base_url}/v1/chat/completions",
            headers={"Content-Type": "application/json"},
            json=payload,
            stream=stream,
            timeout=self.timeout
        )
        response.raise_for_status()

        if stream:
            # Caller is responsible for iterating/closing the stream.
            return response

        return response.json()

    def completion(
        self,
        prompt: Union[str, List[Union[str, int]]],
        temperature: float = 0.1,
        min_p: float = 0.15,
        repetition_penalty: float = 1.05,
        min_image_tokens: int = 64,
        max_image_tokens: int = 256,
        do_image_splitting: bool = True,
        max_tokens: int = -1,
        stream: bool = False,
        **kwargs
    ) -> Union[Dict[str, Any], "requests.Response"]:
        """
        Generate a completion using the non-OAI compatible API.

        Args:
            prompt: The prompt string or list of tokens
            temperature: Sampling temperature (default: 0.1)
            min_p: Minimum probability for sampling (default: 0.15)
            repetition_penalty: Repetition penalty factor (default: 1.05)
            min_image_tokens: Minimum image tokens (default: 64)
            max_image_tokens: Maximum image tokens (default: 256)
            do_image_splitting: Whether to split images (default: True)
            max_tokens: Maximum tokens to generate (default: -1 for infinity)
            stream: Whether to stream the response (default: False)
            **kwargs: Additional parameters for the completion

        Returns:
            Parsed JSON completion response, or the raw streaming
            ``requests.Response`` when ``stream=True``.

        Raises:
            requests.HTTPError: If the server returns a non-2xx status.
        """
        payload = {
            "prompt": prompt,
            "temperature": temperature,
            "min_p": min_p,
            # Native endpoint uses llama.cpp parameter names
            # ("repeat_penalty" / "n_predict") rather than the OAI ones.
            "repeat_penalty": repetition_penalty,
            "min_image_tokens": min_image_tokens,
            "max_image_tokens": max_image_tokens,
            "do_image_splitting": do_image_splitting,
            "cache_prompt": True,
            "n_predict": max_tokens,
            "stream": stream,
            **kwargs
        }

        response = requests.post(
            f"{self.base_url}/completion",
            headers={"Content-Type": "application/json"},
            json=payload,
            stream=stream,
            timeout=self.timeout
        )
        response.raise_for_status()

        if stream:
            # Caller is responsible for iterating/closing the stream.
            return response

        return response.json()

    @staticmethod
    def _encode_image_to_base64(image_path: str) -> str:
        """
        Encode an image file to base64 string.

        Args:
            image_path: Path to the image file

        Returns:
            Base64 encoded image string

        Raises:
            OSError: If the file cannot be opened or read.
        """
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')

    @staticmethod
    def _encode_cv2_image_to_base64(image: np.ndarray) -> str:
        """
        Encode an OpenCV image to base64 string (JPEG-compressed).

        Args:
            image: OpenCV image (numpy array)

        Returns:
            Base64 encoded image string

        Raises:
            ValueError: If JPEG encoding fails (e.g. empty or invalid image);
                previously the failure flag was ignored and a garbage buffer
                could be encoded silently.
        """
        success, buffer = cv2.imencode('.jpg', image)
        if not success:
            raise ValueError("cv2.imencode failed to encode image as JPEG")
        return base64.b64encode(buffer).decode('utf-8')

    def create_multimodal_message(
        self,
        role: str,
        content: str,
        images: Optional[List[Union[str, np.ndarray]]] = None
    ) -> Dict[str, Any]:
        """
        Create a multimodal message with text and images.

        Args:
            role: Role of the message (system, user, assistant)
            content: Text content of the message
            images: List of image paths or OpenCV images (numpy arrays)

        Returns:
            Formatted message dictionary: plain ``{"role", "content"}`` when
            no images are given; otherwise ``content`` is a list of one text
            part followed by one ``image_url`` part per image (images inlined
            as ``data:image/jpeg;base64,...`` URLs).
        """
        if not images:
            # Text-only message: keep the simple string form the server expects.
            return {"role": role, "content": content}

        # Encode each image: strings are treated as file paths, everything
        # else as an in-memory OpenCV image.
        image_data = []
        for img in images:
            if isinstance(img, str):
                encoded_image = self._encode_image_to_base64(img)
            else:
                encoded_image = self._encode_cv2_image_to_base64(img)
            image_data.append(encoded_image)

        # Text part first, then one image_url part per image.
        multimodal_content = [
            {"type": "text", "text": content}
        ]

        for img_data in image_data:
            multimodal_content.append({
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/jpeg;base64,{img_data}"
                }
            })

        return {"role": role, "content": multimodal_content}