# inspiration from -> https://huggingface.co/spaces/sitammeur/Gemma-llamacpp
import os
from typing import List, Tuple

from llama_cpp import Llama
from llama_cpp_agent import LlamaCppAgent
from llama_cpp_agent.providers import LlamaCppPythonProvider
from llama_cpp_agent.chat_history import BasicChatHistory
from llama_cpp_agent.chat_history.messages import Roles
from llama_cpp_agent.messages_formatter import MessagesFormatter, PromptMarkers
from huggingface_hub import hf_hub_download
import gradio as gr

# Read the Hugging Face token from the environment
huggingface_token = os.getenv("HUGGINGFACE_TOKEN")

# Download the GGUF model file if it is not already present
if not os.path.exists("./models"):
    os.makedirs("./models")

hf_hub_download(
    repo_id="SRP-base-model-training/gemma_3_800M_sft_v2_translation-kazparc_latest",
    filename="gemma_3_800M_sft_v2_translation-kazparc_latest.gguf",
    local_dir="./models",
    token=huggingface_token,  # only needed if the model repo is gated
)

# Define the prompt markers for Gemma 3
gemma_3_prompt_markers = {
    Roles.system: PromptMarkers("<start_of_turn>system\n", "<end_of_turn>\n"),
    Roles.user: PromptMarkers("<start_of_turn>user\n", "<end_of_turn>\n"),
    # The assistant prefix doubles as the generation prompt, so it has no
    # trailing newline; generation is cut off by the stop sequences below.
    Roles.assistant: PromptMarkers("<start_of_turn>assistant", "<end_of_turn>"),
    Roles.tool: PromptMarkers("", ""),
}

gemma_3_formatter = MessagesFormatter(
    pre_prompt="",
    prompt_markers=gemma_3_prompt_markers,
    include_sys_prompt_in_first_user_message=True,
    default_stop_sequences=["<end_of_turn>", "<start_of_turn>"],
    strip_prompt=False,
    bos_token="<bos>",
    eos_token="<eos>",
)
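# For reference, with the markers above a single turn is rendered roughly as
# follows (the system prompt is folded into the first user message because of
# include_sys_prompt_in_first_user_message=True):
#
#   <bos><start_of_turn>user
#   <system prompt>
#   <user message><end_of_turn>
#   <start_of_turn>assistant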
# Map each translation direction to its system prompt and user-message prefix
direction_to_prompts = {
    "English to Kazakh": {
        "system": "You are a professional translator. Translate the following sentence into қазақ.",
        "prefix": "",
    },
    "Kazakh to English": {
        "system": "Сіз кәсіби аудармашысыз. Төмендегі сөйлемді English тіліне аударыңыз.",
        "prefix": "",
    },
    "Kazakh to Russian": {
        "system": "Сіз кәсіби аудармашысыз. Төмендегі сөйлемді орыс тіліне аударыңыз.",
        "prefix": "",
    },
    "Russian to Kazakh": {
        "system": "Вы профессиональный переводчик. Переведите следующее предложение на қазақ язык.",
        "prefix": "",
    },
}

llm = None
llm_model = None


def respond(
    message: str,
    history: List[Tuple[str, str]],
    model: str = "gemma_3_800M_sft_v2_translation-kazparc_latest.gguf",
    direction: str = "English to Kazakh",
    max_tokens: int = 64,
    temperature: float = 0.7,
    top_p: float = 0.95,
    top_k: int = 40,
    repeat_penalty: float = 1.1,
):
    """
    Respond to a message by translating it using the specified direction.

    Args:
        message (str): The text to translate.
        history (List[Tuple[str, str]]): The chat history.
        model (str): The model file to use.
        direction (str): The translation direction (e.g., "English to Kazakh").
        max_tokens (int): Maximum number of tokens to generate.
        temperature (float): Sampling temperature.
        top_p (float): Top-p sampling parameter.
        top_k (int): Top-k sampling parameter.
        repeat_penalty (float): Penalty for repetition.

    Yields:
        str: The translated text as it is generated.
    """
    global llm, llm_model

    # Load the model once, and reload only when a different model is selected
    if llm is None or llm_model != model:
        model_path = f"models/{model}"
        if not os.path.exists(model_path):
            yield f"Error: Model file not found at {model_path}."
            return
        llm = Llama(
            model_path=model_path,
            flash_attn=False,
            n_gpu_layers=0,
            n_batch=8,
            n_ctx=2048,
            n_threads=8,
            n_threads_batch=8,
        )
        llm_model = model

    provider = LlamaCppPythonProvider(llm)

    # Get the system prompt and user prefix for the selected direction
    # (the prefix is currently empty for every direction)
    prompts = direction_to_prompts[direction]
    system_message = prompts["system"]
    user_prefix = prompts["prefix"]

    agent = LlamaCppAgent(
        provider,
        system_prompt=system_message,
        custom_messages_formatter=gemma_3_formatter,
        debug_output=True,
    )

    settings = provider.get_provider_default_settings()
    settings.temperature = temperature
    settings.top_k = top_k
    settings.top_p = top_p
    settings.max_tokens = max_tokens
    settings.repeat_penalty = repeat_penalty
    settings.stream = True

    # Rebuild the chat history, applying the direction prefix to each user turn
    messages = BasicChatHistory()
    for user_msg, assistant_msg in history:
        full_user_msg = user_prefix + " " + user_msg
        messages.add_message({"role": Roles.user, "content": full_user_msg})
        messages.add_message({"role": Roles.assistant, "content": assistant_msg})

    full_message = user_prefix + " " + message

    stream = agent.get_chat_response(
        full_message,
        llm_sampling_settings=settings,
        chat_history=messages,
        returns_streaming_generator=True,
        print_output=False,
    )

    # Stream the accumulated translation back to the UI
    outputs = ""
    for output in stream:
        outputs += output
        yield outputs


demo = gr.ChatInterface(
    respond,
    examples=[["Hello"], ["Сәлем"], ["Привет"]],
    additional_inputs_accordion=gr.Accordion(
        label="⚙️ Parameters", open=False, render=False
    ),
    additional_inputs=[
        gr.Dropdown(
            choices=[
                "gemma_3_800M_sft_v2_translation-kazparc_latest.gguf",
            ],
            value="gemma_3_800M_sft_v2_translation-kazparc_latest.gguf",
            label="Model",
            info="Select the AI model to use for chat",
        ),
        gr.Dropdown(
            choices=[
                "English to Kazakh",
                "Kazakh to English",
                "Kazakh to Russian",
                "Russian to Kazakh",
            ],
            value="English to Kazakh",  # without a default, Gradio would pass None
            label="Translation Direction",
            info="Select the direction of translation",
        ),
        gr.Slider(
            minimum=512,
            maximum=2048,
            value=1024,
            step=1,
            label="Max Tokens",
            info="Maximum length of the translation",
        ),
        gr.Slider(
            minimum=0.1,
            maximum=2.0,
            value=0.7,
            step=0.1,
            label="Temperature",
            info="Controls randomness (higher = more creative)",
        ),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p",
            info="Nucleus sampling threshold",
        ),
        gr.Slider(
            minimum=1,
            maximum=100,
            value=40,
            step=1,
            label="Top-k",
            info="Limits vocabulary to top K tokens",
        ),
        gr.Slider(
            minimum=1.0,
            maximum=2.0,
            value=1.1,
            step=0.1,
            label="Repetition Penalty",
            info="Penalizes repeated words",
        ),
    ],
    theme="Ocean",
    submit_btn="Translate",
    stop_btn="Stop",
    title="Kazakh Translation Model",
    description="Translate text between Kazakh, English, and Russian using a specialized language model.",
    chatbot=gr.Chatbot(scale=1, show_copy_button=True),
    cache_examples=False,
)

if __name__ == "__main__":
    demo.launch(
        share=False,
        server_name="0.0.0.0",
        server_port=7860,
        show_api=False,
    )
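# For reference: `respond` can also be driven directly, without the UI. It is a
# generator that yields the accumulated translation as it streams, so the last
# yielded value is the complete output. A minimal sketch using the defaults above:
#
#   final = ""
#   for partial in respond("Hello, world!", history=[], direction="English to Kazakh"):
#       final = partial
#   print(final)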