import base64
import os
from typing import List, Dict, Any, Optional, Union

import cv2
import numpy as np
import requests


class LlamaCppClient:
    """A Python client for interacting with a llama.cpp server."""

    def __init__(self, base_url: str = "http://192.168.1.34:8080"):
        """
        Initialize the client with the base URL of the llama.cpp server.

        Args:
            base_url: The base URL of the llama.cpp server
                (default: http://192.168.1.34:8080)
        """
        self.base_url = base_url.rstrip('/')

    def health_check(self) -> Dict[str, Any]:
        """
        Check the health status of the server.

        Returns:
            Health status response from the server
        """
        response = requests.get(f"{self.base_url}/health")
        response.raise_for_status()
        return response.json()

    def get_models(self) -> Dict[str, Any]:
        """
        Get information about loaded models.

        Returns:
            Model information from the server
        """
        response = requests.get(f"{self.base_url}/v1/models")
        response.raise_for_status()
        return response.json()

    def chat_completion(
        self,
        messages: List[Dict[str, Any]],
        temperature: float = 0.1,
        min_p: float = 0.15,
        repetition_penalty: float = 1.05,
        min_image_tokens: int = 64,
        max_image_tokens: int = 256,
        do_image_splitting: bool = True,
        max_tokens: int = -1,
        stream: bool = False,
        **kwargs
    ) -> Union[Dict[str, Any], requests.Response]:
        """
        Generate a chat completion using the OpenAI-compatible API.

        Args:
            messages: List of message dictionaries with role and content
            temperature: Sampling temperature (default: 0.1)
            min_p: Minimum probability for sampling (default: 0.15)
            repetition_penalty: Repetition penalty factor (default: 1.05)
            min_image_tokens: Minimum image tokens (default: 64)
            max_image_tokens: Maximum image tokens (default: 256)
            do_image_splitting: Whether to split images (default: True)
            max_tokens: Maximum tokens to generate (default: -1 for unlimited)
            stream: Whether to stream the response (default: False)
            **kwargs: Additional parameters for the completion

        Returns:
            Completion response as a dict, or the raw streaming
            Response when stream=True
        """
        payload = {
            "messages": messages,
            "temperature": temperature,
            "min_p": min_p,
            "repetition_penalty": repetition_penalty,
            "min_image_tokens": min_image_tokens,
            "max_image_tokens": max_image_tokens,
            "do_image_splitting": do_image_splitting,
            "max_tokens": max_tokens,
            "cache_prompt": True,
            "stream": stream,
            **kwargs
        }

        # Debug: show the model parameter if present (useful when routing
        # requests through llama-swap).
        if payload.get('model') and os.environ.get('DEBUG_LLAMA_SWAP'):
            print(f"[DEBUG] Requesting model: {payload['model']}")

        response = requests.post(
            f"{self.base_url}/v1/chat/completions",
            headers={"Content-Type": "application/json"},
            json=payload,
            stream=stream
        )
        response.raise_for_status()

        if stream:
            return response
        return response.json()
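
    # Usage sketch for chat_completion, kept as a comment so the module
    # stays free of side effects. The URL is a placeholder, and the
    # OpenAI-style response shape (choices[0].message.content) is assumed:
    #
    #     client = LlamaCppClient("http://localhost:8080")
    #     resp = client.chat_completion(
    #         messages=[{"role": "user", "content": "Say hello."}],
    #         max_tokens=64,
    #     )
    #     print(resp["choices"][0]["message"]["content"])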

    def completion(
        self,
        prompt: Union[str, List[Union[str, int]]],
        temperature: float = 0.1,
        min_p: float = 0.15,
        repetition_penalty: float = 1.05,
        min_image_tokens: int = 64,
        max_image_tokens: int = 256,
        do_image_splitting: bool = True,
        max_tokens: int = -1,
        stream: bool = False,
        **kwargs
    ) -> Union[Dict[str, Any], requests.Response]:
        """
        Generate a completion using the native (non-OpenAI-compatible)
        /completion API.

        Args:
            prompt: The prompt string or list of tokens
            temperature: Sampling temperature (default: 0.1)
            min_p: Minimum probability for sampling (default: 0.15)
            repetition_penalty: Repetition penalty factor (default: 1.05);
                sent as llama.cpp's native "repeat_penalty" field
            min_image_tokens: Minimum image tokens (default: 64)
            max_image_tokens: Maximum image tokens (default: 256)
            do_image_splitting: Whether to split images (default: True)
            max_tokens: Maximum tokens to generate (default: -1 for
                unlimited); sent as llama.cpp's native "n_predict" field
            stream: Whether to stream the response (default: False)
            **kwargs: Additional parameters for the completion

        Returns:
            Completion response as a dict, or the raw streaming
            Response when stream=True
        """
        payload = {
            "prompt": prompt,
            "temperature": temperature,
            "min_p": min_p,
            "repeat_penalty": repetition_penalty,
            "min_image_tokens": min_image_tokens,
            "max_image_tokens": max_image_tokens,
            "do_image_splitting": do_image_splitting,
            "cache_prompt": True,
            "n_predict": max_tokens,
            "stream": stream,
            **kwargs
        }

        response = requests.post(
            f"{self.base_url}/completion",
            headers={"Content-Type": "application/json"},
            json=payload,
            stream=stream
        )
        response.raise_for_status()

        if stream:
            return response
        return response.json()

    @staticmethod
    def _encode_image_to_base64(image_path: str) -> str:
        """
        Encode an image file to a base64 string.

        Args:
            image_path: Path to the image file

        Returns:
            Base64-encoded image string
        """
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')

    @staticmethod
    def _encode_cv2_image_to_base64(image: np.ndarray) -> str:
        """
        Encode an OpenCV image to a base64 string.

        Args:
            image: OpenCV image (numpy array, BGR channel order)

        Returns:
            Base64-encoded JPEG image string
        """
        success, buffer = cv2.imencode('.jpg', image)
        if not success:
            raise ValueError("Failed to encode image as JPEG")
        return base64.b64encode(buffer).decode('utf-8')

    def create_multimodal_message(
        self,
        role: str,
        content: str,
        images: Optional[List[Union[str, np.ndarray]]] = None
    ) -> Dict[str, Any]:
        """
        Create a multimodal message with text and images.

        Args:
            role: Role of the message (system, user, assistant)
            content: Text content of the message
            images: List of image paths or OpenCV images (numpy arrays)

        Returns:
            Formatted message dictionary
        """
        if not images:
            return {"role": role, "content": content}

        # Encode each image: file paths are read from disk, numpy arrays
        # are JPEG-encoded in memory.
        image_data = []
        for img in images:
            if isinstance(img, str):
                # Image path
                encoded_image = self._encode_image_to_base64(img)
            else:
                # OpenCV image
                encoded_image = self._encode_cv2_image_to_base64(img)
            image_data.append(encoded_image)

        # Build OpenAI-style multimodal content: the text part first, then
        # one image_url part per image. Note that the data URI declares
        # image/jpeg, so path-based images are assumed to be JPEG files.
        multimodal_content = [
            {"type": "text", "text": content}
        ]
        for img_data in image_data:
            multimodal_content.append({
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/jpeg;base64,{img_data}"
                }
            })

        return {"role": role, "content": multimodal_content}
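

# A minimal usage sketch, not part of the client itself. It assumes a
# llama.cpp server is already running at the URL below (a placeholder) and
# that "cat.jpg" exists next to this script; adjust both for your setup.
if __name__ == "__main__":
    client = LlamaCppClient("http://localhost:8080")

    # Verify the server is up before sending any work.
    print(client.health_check())

    # Plain text chat via the OpenAI-compatible endpoint.
    reply = client.chat_completion(
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Name three uses for a paperclip."},
        ],
        max_tokens=128,
    )
    print(reply["choices"][0]["message"]["content"])

    # Multimodal chat: attach a local image to the user message.
    message = client.create_multimodal_message(
        role="user",
        content="Describe this image.",
        images=["cat.jpg"],
    )
    vision_reply = client.chat_completion(messages=[message], max_tokens=256)
    print(vision_reply["choices"][0]["message"]["content"])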