import gradio as gr
from transformers import pipeline
import torch
import spaces

# Load the model pipeline
pipe = pipeline(
    "text-generation",
    model="google/vaultgemma-1b",
    device="cuda",
    torch_dtype=torch.float16,
)


# Define the chat function
@spaces.GPU(duration=120)
def chat(message, history):
    # Format the conversation history for the model
    prompt = ""
    for user_msg, bot_msg in history:
        prompt += f"User: {user_msg}\nAssistant: {bot_msg}\n"
    prompt += f"User: {message}\nAssistant:"

    # Generate a response; return_full_text=False makes the pipeline return
    # only the newly generated continuation instead of prompt + continuation,
    # which avoids fragile string surgery on the echoed prompt.
    response = pipe(
        prompt,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        return_full_text=False,
    )
    generated_text = response[0]["generated_text"]

    # Extract only the assistant's response: the model may keep generating
    # further dialogue turns, so truncate at the first "User:" marker.
    assistant_response = generated_text.split("User:")[0].strip()
    return assistant_response


# Create the Gradio chat interface
demo = gr.ChatInterface(
    fn=chat,
    title="VaultGemma-1B Chatbot",
    description="A chatbot powered by Google's VaultGemma-1B model.",
    theme="soft",
    examples=[
        "What is the capital of France?",
        "Tell me a joke.",
        "Explain quantum computing in simple terms.",
    ],
    concurrency_limit=1,
)

# Launch the app
demo.launch()
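# ---------------------------------------------------------------------------
# A minimal usage sketch, not part of the original snippet. Assumptions: this
# file runs as app.py in a Hugging Face ZeroGPU Space (where the `spaces`
# package is provided by the platform), and a requirements.txt alongside it
# lists at least:
#
#     gradio
#     transformers
#     torch
#     accelerate
#
# The chat function can also be smoke-tested without launching the UI, e.g.
# by calling it with an empty history before demo.launch():
#
#     print(chat("What is the capital of France?", []))
# ---------------------------------------------------------------------------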