import gradio as gr
import spaces
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
import torch
from threading import Thread

# Model configuration
MODEL_ID = "LiquidAI/LFM2-8B-A1B"

# NOTE: This model requires transformers from source.
# Add this to your requirements.txt:
# git+https://github.com/huggingface/transformers.git@0c9a72e4576fe4c84077f066e585129c97bfd4e6

# Load tokenizer globally (doesn't need GPU)
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=False)

# Load model globally (it will be moved to GPU by the ZeroGPU decorator)
print("Loading model...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="cpu",  # Load on CPU first; ZeroGPU moves it to GPU inside the decorated function
    torch_dtype=torch.bfloat16,
    trust_remote_code=False,
    # attn_implementation="flash_attention_2",  # Uncomment if you have a compatible GPU
)
print("Model loaded successfully!")


@spaces.GPU(duration=120)
def generate_response(
    message: str,
    history: list[dict[str, str]],
    system_message: str,
    max_new_tokens: int,
    temperature: float,
    min_p: float,
    repetition_penalty: float,
):
    """
    Generate a response using the LiquidAI LFM2-8B model.

    Args:
        message: The current user message.
        history: Chat history in the format [{"role": "user"/"assistant", "content": "..."}].
        system_message: System prompt to guide the model.
        max_new_tokens: Maximum number of tokens to generate.
        temperature: Sampling temperature.
        min_p: Minimum probability threshold for sampling.
        repetition_penalty: Penalty for repetition.

    Yields:
        The generated text accumulated so far (streaming).
    """
    if not message.strip():
        yield "Please enter a message."
        return

    # Move the model to GPU (allocated by ZeroGPU for the duration of this call)
    model.to("cuda")

    # Build the conversation history
    messages = []

    # Add the system message if provided
    if system_message.strip():
        messages.append({"role": "system", "content": system_message})

    # Add chat history
    for msg in history:
        messages.append(msg)

    # Add the current user message
    messages.append({"role": "user", "content": message})

    # Prepare model input
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        tokenize=True,
    ).to(model.device)

    # Generate the response with streaming
    streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True, skip_prompt=True)

    generation_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        do_sample=True,
        temperature=temperature,
        min_p=min_p,
        repetition_penalty=repetition_penalty,
        max_new_tokens=max_new_tokens,
    )

    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    # Stream the output as it is generated
    streamer_output = ""
    for new_text in streamer:
        streamer_output += new_text
        yield streamer_output

    thread.join()


# Create the Gradio ChatInterface
demo = gr.ChatInterface(
    generate_response,
    type="messages",
    title="🌊 LiquidAI LFM2-8B Chat",
    description="""
    Chat with the **LiquidAI LFM2-8B-A1B** model using ZeroGPU.
    This is a hybrid MoE model with 8.3B total parameters and 1.5B active parameters, optimized for edge AI deployment.

    💡 **Tip:** The first response may take a moment while the GPU is allocated.

    The model excels at:
    - Instruction following
    - Math and reasoning
    - Multi-turn conversations
    - Agentic tasks and data extraction

    ⚠️ **Note:** This model is best suited for narrow use cases. It may not perform well on knowledge-intensive tasks.
    """,
    theme=gr.themes.Soft(),
    examples=[
        ["What is C. elegans?"],
        ["Explain quantum entanglement in simple terms."],
        ["Write a short poem about artificial intelligence."],
        ["What are the main differences between Python and JavaScript?"],
        ["Solve this math problem: If a train travels 120 miles in 2 hours, what is its average speed?"],
        ["Help me plan a 3-day itinerary for visiting Paris."],
    ],
    additional_inputs=[
        gr.Textbox(
            value="You are a helpful assistant trained by Liquid AI.",
            label="System Message",
            info="Set the behavior and personality of the assistant",
        ),
        gr.Slider(
            minimum=64,
            maximum=2048,
            value=512,
            step=64,
            label="Max New Tokens",
            info="Maximum length of the generated response",
        ),
        gr.Slider(
            minimum=0.1,
            maximum=2.0,
            value=0.3,
            step=0.1,
            label="Temperature",
            info="Higher values make output more random (recommended: 0.3)",
        ),
        gr.Slider(
            minimum=0.0,
            maximum=1.0,
            value=0.15,
            step=0.05,
            label="Min P",
            info="Minimum probability threshold for sampling (recommended: 0.15)",
        ),
        gr.Slider(
            minimum=1.0,
            maximum=2.0,
            value=1.05,
            step=0.05,
            label="Repetition Penalty",
            info="Penalty for repeating tokens (recommended: 1.05)",
        ),
    ],
    cache_examples=False,
)

if __name__ == "__main__":
    demo.launch()