import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# --- MODEL CONFIG ---
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.3"  # You can swap in Llama 3, Qwen, etc.
# For better speed on free GPUs, use a quantized build, e.g. a GGUF export served
# with llama.cpp (requires a different loader). For simplicity & HF compatibility,
# we'll use the HF checkpoint with 4-bit quantization via bitsandbytes.

# Load tokenizer and model with 4-bit quantization for low memory usage.
# (Requires a CUDA GPU and the bitsandbytes package.)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
    ),
)

# System prompt to guide behavior (like HuggingChat)
SYSTEM_PROMPT = """You are RunAshChat, a helpful, honest, and harmless AI assistant.
You are open-source, privacy-respecting, and do not store any user data.
Answer clearly, concisely, and thoughtfully. Avoid harmful, unethical, or biased content.
If you don't know something, say so."""


def format_prompt(message, history):
    # Mistral-Instruct format: [INST] prompt [/INST] answer</s>[INST] ... [/INST]
    # The system prompt is folded into the first user turn.
    full_prompt = f"[INST] {SYSTEM_PROMPT}\n\n"
    for user_msg, bot_msg in history:
        full_prompt += f"{user_msg} [/INST] {bot_msg}</s>[INST] "
    full_prompt += f"{message} [/INST]"
    return full_prompt


def respond(message, history):
    prompt = format_prompt(message, history)
    # device_map="auto" may shard the model, so move inputs to the model's
    # device rather than hard-coding "cuda".
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=512,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.1,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Extract only the assistant's reply (everything after the last [/INST]).
    response = response.split("[/INST]")[-1].strip()

    # gr.Chatbot expects the full updated history, not a bare string;
    # the "" clears the input textbox.
    return history + [(message, response)], ""


# --- GRADIO INTERFACE ---
with gr.Blocks(theme=gr.themes.Soft(), title="RunAshChat") as demo:
    gr.Markdown("""
    # 🚀 RunAshChat
    *Your open-source, privacy-first AI chat companion, inspired by HuggingChat.*
    """)

    chatbot = gr.Chatbot(
        height=600,
        bubble_full_width=False,
        avatar_images=(
            None,
            "https://huggingface.co/datasets/huggingface/branding/resolve/main/huggingface-logo.svg",
        ),
    )

    msg = gr.Textbox(
        placeholder="Ask me anything... (e.g., 'Explain quantum computing like I'm 10')",
        label="Your message",
        container=False,
    )

    with gr.Row():
        clear = gr.Button("🧹 Clear Chat")
        export = gr.Button("💾 Export Chat")

    # Define the export target in the layout instead of creating it inside the
    # .click() call, so it renders in a predictable place.
    export_box = gr.Textbox(label="Exported Chat", lines=15)

    def clear_chat():
        return None, ""

    def export_chat(chat_history):
        if not chat_history:
            return "No conversation to export."
        return "\n\n".join(
            f"👤 You: {q}\n🤖 RunAshChat: {a}" for q, a in chat_history
        )

    msg.submit(respond, [msg, chatbot], [chatbot, msg])
    clear.click(clear_chat, None, [chatbot, msg])
    export.click(export_chat, chatbot, export_box)

demo.launch()
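
# --- ALTERNATIVE: PROMPT BUILDING VIA CHAT TEMPLATE (a sketch, not wired in) ---
# Most HF chat checkpoints ship a chat template, so instead of hand-rolling
# [INST] tags you can let tokenizer.apply_chat_template build the prompt.
# Caveat (an assumption about this checkpoint): some Mistral templates reject a
# "system" role, so this sketch folds SYSTEM_PROMPT into the first user turn.
#
# def format_prompt_via_template(message, history):
#     messages = []
#     for user_msg, bot_msg in history:
#         messages.append({"role": "user", "content": user_msg})
#         messages.append({"role": "assistant", "content": bot_msg})
#     messages.append({"role": "user", "content": message})
#     # Fold the system prompt into the first user message.
#     messages[0]["content"] = f"{SYSTEM_PROMPT}\n\n{messages[0]['content']}"
#     return tokenizer.apply_chat_template(
#         messages, tokenize=False, add_generation_prompt=True
#     )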
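
# --- RUNNING THE APP (notes; exact package versions are not pinned here) ---
# pip install gradio transformers accelerate bitsandbytes torch
# python app.py
#
# The 4-bit path above assumes a CUDA GPU. On CPU-only machines, drop
# quantization_config when loading the model (and expect far slower generation
# plus a much larger memory footprint for the unquantized weights).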