import os
import subprocess
from threading import Thread
from typing import Iterator

import gradio as gr
import spaces
import torch
from PIL import Image
from transformers import (
    AutoProcessor,
    Gemma3ForConditionalGeneration,
    TextIteratorStreamer,
)

# Install flash-attn at startup; the CUDA build step is skipped because no CUDA
# toolkit is available at install time. Extend the current environment rather
# than replacing it so pip keeps PATH and the other variables it needs.
subprocess.run(
    "pip install flash-attn --no-build-isolation",
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    shell=True,
)

DESCRIPTION = """\
# MamayLM-Gemma-3-12B-IT-v1.0 demo

[πŸͺͺ **Model card**](https://huggingface.co/INSAIT-Institute/MamayLM-Gemma-3-12B-IT-v1.0)
"""

MAX_MAX_NEW_TOKENS = 2048
DEFAULT_MAX_NEW_TOKENS = 2048
MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))

model_id = "INSAIT-Institute/MamayLM-Gemma-3-12B-IT-v1.0"
processor = AutoProcessor.from_pretrained(model_id)

# Fall back to eager attention when no GPU is available (flash-attn is CUDA-only).
attn_impl = "flash_attention_2" if torch.cuda.is_available() else "eager"
model = Gemma3ForConditionalGeneration.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    attn_implementation=attn_impl,
)
model.eval()

# Gemma-3 stop tokens: 1 is <eos>, 106 is <end_of_turn>.
eos_token_ids = [1, 106]

print("Model loaded successfully.")


@spaces.GPU(duration=120)  # Increased duration for the larger model
def generate(
    message: dict,
    chat_history: list[list],
    system_message: str,
    max_new_tokens: int = 1024,
    temperature: float = 0.6,
    top_p: float = 0.95,
    top_k: int = 50,
    repetition_penalty: float = 1.2,
) -> Iterator[str]:
    """
    Generate a streamed response from the model based on the user's message
    and the chat history. Designed for Gradio's multimodal ChatInterface.
    """
    conversation = []
    all_images = []

    # Add the system message, if provided, to guide the model's behavior.
    if system_message:
        conversation.append(
            {"role": "system", "content": [{"type": "text", "text": system_message}]}
        )

    # Rebuild past turns from Gradio's chat history.
    for user_turn, bot_turn in chat_history:
        # Reconstruct the user's turn, which may include an image.
        user_content = []
        if isinstance(user_turn, tuple):
            # User turn with an image. Gradio stores a file-only message as a
            # 1-tuple (path,) and a file+text message as (path, text).
            if len(user_turn) == 1:
                img_path, txt = user_turn[0], None
            else:
                img_path, txt = user_turn
            pil_img = Image.open(img_path).convert("RGB")
            all_images.append(pil_img)
            user_content.append({"type": "image"})
            if txt:
                user_content.append({"type": "text", "text": txt})
        elif user_turn:  # Text-only user turn
            user_content.append({"type": "text", "text": user_turn})
        if user_content:
            conversation.append({"role": "user", "content": user_content})

        # Reconstruct the assistant's turn.
        if bot_turn:
            conversation.append(
                {"role": "assistant", "content": [{"type": "text", "text": bot_turn}]}
            )

    # Add the current user message, which may include newly uploaded images.
    current_user_content = []
    text = message["text"]
    for img_path in message["files"]:
        pil_img = Image.open(img_path).convert("RGB")
        all_images.append(pil_img)
        current_user_content.append({"type": "image"})
    if text:
        current_user_content.append({"type": "text", "text": text})
    if current_user_content:
        conversation.append({"role": "user", "content": current_user_content})

    # Use the processor to build the prompt and preprocess any images.
    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
    if all_images:
        inputs = processor(text=prompt, images=all_images, return_tensors="pt").to(
            model.device
        )
    else:
        inputs = processor(text=prompt, return_tensors="pt").to(model.device)

    # Trim overly long text-only prompts to the input token budget (skipped when
    # images are present, since trimming would break image-token alignment).
    if not all_images and inputs["input_ids"].shape[1] > MAX_INPUT_TOKEN_LENGTH:
        inputs["input_ids"] = inputs["input_ids"][:, -MAX_INPUT_TOKEN_LENGTH:]
        inputs["attention_mask"] = inputs["attention_mask"][:, -MAX_INPUT_TOKEN_LENGTH:]
        gr.Warning(f"Trimmed input as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")

    # Set up the streamer for incremental text generation.
    streamer = TextIteratorStreamer(
        processor.tokenizer,
        timeout=20.0,
        skip_prompt=True,
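        # skip_prompt keeps the echoed prompt out of the streamed output;
        # skip_special_tokens strips markers such as <end_of_turn>.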
        skip_special_tokens=True,
    )
    generate_kwargs = dict(
        inputs,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        eos_token_id=eos_token_ids,
        temperature=temperature if temperature > 0 else 0.001,  # Temperature must be > 0
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
    )

    # Run generation in a background thread so the streamer can be consumed here.
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

    # Yield the accumulated text as chunks arrive.
    outputs = []
    for text_chunk in streamer:
        outputs.append(text_chunk)
        yield "".join(outputs)


chat_interface = gr.ChatInterface(
    multimodal=True,
    fn=generate,
    additional_inputs=[
        gr.Textbox(
            value="",
            label="System message",
            render=False,
        ),
        gr.Slider(
            label="Max new tokens",
            minimum=1,
            maximum=MAX_MAX_NEW_TOKENS,
            step=1,
            value=DEFAULT_MAX_NEW_TOKENS,
        ),
        gr.Slider(
            label="Temperature",
            minimum=0,
            maximum=4.0,
            step=0.1,
            value=0.1,
        ),
        gr.Slider(
            label="Top-p (nucleus sampling)",
            minimum=0.05,
            maximum=1.0,
            step=0.05,
            value=1,  # from https://huggingface.co/google/gemma-3-270m-it/blob/main/generation_config.json
        ),
        gr.Slider(
            label="Top-k",
            minimum=1,
            maximum=1000,
            step=1,
            value=25,  # from https://huggingface.co/google/gemma-3-270m-it/blob/main/generation_config.json
        ),
        gr.Slider(
            label="Repetition penalty",
            minimum=1.0,
            maximum=2.0,
            step=0.05,
            value=1.1,
        ),
    ],
    stop_btn="Stop Generation",
    examples=[
        ["ΠŸΡ€ΠΈΠ²Ρ–Ρ‚! Π―ΠΊ справи?"],
        [
            "Плюси Ρ‚Π° мінуси довгострокових стосунків. ΠœΠ°Ρ€ΠΊΠΎΠ²Π°Π½ΠΈΠΉ список Ρ–Π· максимум 3 ΠΏΠ΅Ρ€Π΅Π²Π°Π³Π°ΠΌΠΈ Ρ‚Π° 3 Π½Π΅Π΄ΠΎΠ»Ρ–ΠΊΠ°ΠΌΠΈ, стисло."
        ],
        ["Π‘ΠΊΡ–Π»ΡŒΠΊΠΈ Π³ΠΎΠ΄ΠΈΠ½ ΠΏΠΎΡ‚Ρ€Ρ–Π±Π½ΠΎ Π»ΡŽΠ΄ΠΈΠ½Ρ–, Ρ‰ΠΎΠ± Π·'їсти Π³Π΅Π»Ρ–ΠΊΠΎΠΏΡ‚Π΅Ρ€?"],
        ["Π―ΠΊ Π²Ρ–Π΄ΠΊΡ€ΠΈΡ‚ΠΈ Ρ„Π°ΠΉΠ» JSON Ρƒ Python?"],
        [
            "Π‘Ρ‚Π²ΠΎΡ€Ρ–Ρ‚ΡŒ ΠΌΠ°Ρ€ΠΊΠΎΠ²Π°Π½ΠΈΠΉ список ΠΏΠ΅Ρ€Π΅Π²Π°Π³ Ρ– Π½Π΅Π΄ΠΎΠ»Ρ–ΠΊΡ–Π² Тиття Π² Π‘Π°Π½-Ѐранциско. ΠœΠ°ΠΊΡΠΈΠΌΡƒΠΌ 2 ΠΏΠ΅Ρ€Π΅Π²Π°Π³ΠΈ Ρ‚Π° 2 Π½Π΅Π΄ΠΎΠ»Ρ–ΠΊΠΈ."
        ],
        ["ΠŸΡ€ΠΈΠ΄ΡƒΠΌΠ°ΠΉ ΠΊΠΎΡ€ΠΎΡ‚ΠΊΠ΅ оповідання Π· Ρ‚Π²Π°Ρ€ΠΈΠ½Π°ΠΌΠΈ ΠΏΡ€ΠΎ Ρ†Ρ–Π½Π½Ρ–ΡΡ‚ΡŒ Π΄Ρ€ΡƒΠΆΠ±ΠΈ."],
        ["Π§ΠΈ моТСш Ρ‚ΠΈ ΠΊΠΎΡ€ΠΎΡ‚ΠΊΠΎ пояснити, Ρ‰ΠΎ Ρ‚Π°ΠΊΠ΅ ΠΌΠΎΠ²Π° програмування Python?"],
        [
            "ΠΠ°ΠΏΠΈΡˆΡ–Ρ‚ΡŒ ΡΡ‚Π°Ρ‚Ρ‚ΡŽ Π½Π° 100 слів Π½Π° Ρ‚Π΅ΠΌΡƒ 'ΠŸΠ΅Ρ€Π΅Π²Π°Π³ΠΈ Π²Ρ–Π΄ΠΊΡ€ΠΈΡ‚ΠΎΠ³ΠΎ ΠΊΠΎΠ΄Ρƒ Π² дослідТСннях Π¨Π†'."
        ],
    ],
    cache_examples=False,
)

with gr.Blocks(css="style.css", fill_height=True, theme="soft") as demo:
    gr.Markdown(DESCRIPTION)
    chat_interface.render()

if __name__ == "__main__":
    demo.queue(max_size=20).launch()
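# A minimal non-Gradio smoke test, left here as a commented sketch (the prompt
# text below is only an illustrative example). It can be run by hand in a
# Python shell once the model has loaded:
#
#   msgs = [{"role": "user", "content": [{"type": "text", "text": "ΠŸΡ€ΠΈΠ²Ρ–Ρ‚! Π―ΠΊ справи?"}]}]
#   prompt = processor.apply_chat_template(msgs, add_generation_prompt=True)
#   inputs = processor(text=prompt, return_tensors="pt").to(model.device)
#   out = model.generate(**inputs, max_new_tokens=64, eos_token_id=eos_token_ids)
#   print(processor.tokenizer.decode(out[0], skip_special_tokens=True))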