Spaces: Runtime error
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer  # BitsAndBytesConfig removed, as we are not quantizing on CPU
from peft import PeftModel
import torch
import os  # For potential path joining if needed

# --- Configuration ---
base_model_id = "Qwen/Qwen-1_8B-Chat"
lora_adapter_id = "jinv2/qwen-1_8b-hemiplegia-lora"  # Your HF Model ID

# device = "cuda" if torch.cuda.is_available() else "cpu"  # Always resolves to "cpu" on the free tier
device = "cpu"  # Explicitly set to CPU for this configuration
print(f"Using device: {device}")
# --- Load Model and Tokenizer ---
print("Loading tokenizer...")
try:
    # Try loading the tokenizer from the LoRA repo first, as it may contain specific settings
    tokenizer = AutoTokenizer.from_pretrained(lora_adapter_id, trust_remote_code=True)
    print(f"Successfully loaded tokenizer from {lora_adapter_id}.")
except Exception as e_lora_tok:
    print(f"Could not load tokenizer from {lora_adapter_id} (Error: {e_lora_tok}), falling back to {base_model_id}.")
    tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)

# Set pad_token if not already set
if tokenizer.pad_token_id is None:
    if tokenizer.eos_token_id is not None:
        tokenizer.pad_token_id = tokenizer.eos_token_id
        tokenizer.pad_token = tokenizer.eos_token
        print(f"Set tokenizer.pad_token_id to eos_token_id: {tokenizer.pad_token_id}")
    else:
        # Fallback for Qwen; make sure this ID is correct for your Qwen version
        try:
            qwen_eos_id = tokenizer.convert_tokens_to_ids("<|endoftext|>")
            tokenizer.pad_token_id = qwen_eos_id
            tokenizer.pad_token = "<|endoftext|>"
            print(f"Set tokenizer.pad_token_id to ID of '<|endoftext|>': {tokenizer.pad_token_id}")
        except KeyError:
            tokenizer.pad_token_id = 0  # Absolute fallback, very risky
            tokenizer.pad_token = tokenizer.decode([0])
            print(f"CRITICAL WARNING: Could not set pad_token_id reliably. Set to 0 ('{tokenizer.pad_token}').")

tokenizer.padding_side = "left"  # Important for generation
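# Left padding keeps the final prompt tokens adjacent to the newly generated tokens, which is what
# decoder-only models expect when generate() is called on (possibly padded) batches.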
| print("Loading base model (NO QUANTIZATION as running on CPU)...") | |
| # IMPORTANT: For CPU, we cannot use bitsandbytes 4-bit quantization. | |
| # We load the model in its original precision (or try float16/bfloat16 if memory allows and CPU supports). | |
| # This will be much slower and more memory-intensive. | |
| try: | |
| base_model = AutoModelForCausalLM.from_pretrained( | |
| base_model_id, | |
| trust_remote_code=True, | |
| torch_dtype=torch.float32, # Use float32 for CPU for max compatibility, bfloat16 might work on some newer CPUs | |
| # device_map="auto" will likely map to CPU. Can be explicit: device_map="cpu" | |
| device_map={"":device} # Ensure model parts are on the correct device | |
| ) | |
| print("Base model loaded.") | |
| except Exception as e_load_model: | |
| print(f"Error loading base model: {e_load_model}") | |
| raise # Re-raise the exception to stop the app if model loading fails | |
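# (Optional sketch, not part of the original app, which keeps torch.float32 above.) On CPUs with bfloat16
# support, loading in torch.bfloat16 roughly halves the memory footprint; whether it fits the free Spaces
# CPU tier is an assumption to verify, not a guarantee:
# base_model = AutoModelForCausalLM.from_pretrained(
#     base_model_id,
#     trust_remote_code=True,
#     torch_dtype=torch.bfloat16,
#     device_map={"": device},
# )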
| print(f"Loading LoRA adapter: {lora_adapter_id}...") | |
| try: | |
| # For CPU, PEFT should still work. The model should be on the CPU before applying adapter. | |
| model = PeftModel.from_pretrained(base_model, lora_adapter_id) | |
| model.eval() # Set to evaluation mode | |
| model = model.to(device) # Ensure the final PEFT model is on the CPU | |
| print("LoRA adapter loaded and model is on CPU, ready for inference.") | |
| except Exception as e_load_adapter: | |
| print(f"Error loading LoRA adapter: {e_load_adapter}") | |
| raise | |
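# (Hedged optimization note, not in the original app.) For pure inference, merging the LoRA weights into
# the base model removes the adapter indirection and can speed up CPU generation slightly:
# model = model.merge_and_unload()
# The merge cannot be undone on the in-memory model, so only do this when no further training is planned.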
# --- Prediction Function ---
def get_response(user_query):
    # System prompt (Chinese): "You are a medical Q&A assistant specializing in hemiplegia,
    # cerebral thrombosis, and paralysis."
    system_prompt_content = "你是一个专注于偏瘫、脑血栓、半身不遂领域的医疗问答助手。"
    # Qwen-Chat uses the ChatML turn format
    prompt = f"<|im_start|>system\n{system_prompt_content}<|im_end|>\n<|im_start|>user\n{user_query}<|im_end|>\n<|im_start|>assistant\n"
    # Keep inputs on the same device as the model; cap the prompt so that prompt + 150 new tokens fit a 512-token budget
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512 - 150).to(model.device)
    eos_token_ids_list = []
    if isinstance(tokenizer.eos_token_id, int):
        eos_token_ids_list.append(tokenizer.eos_token_id)
    try:
        im_end_token_id = tokenizer.convert_tokens_to_ids("<|im_end|>")
        if im_end_token_id not in eos_token_ids_list:
            eos_token_ids_list.append(im_end_token_id)
    except KeyError:
        pass
    # Fallback if eos_token_ids_list is still empty
    if not eos_token_ids_list:
        if tokenizer.eos_token_id is not None:
            eos_token_ids_list = [tokenizer.eos_token_id]
        else:
            print("Warning: EOS token ID list is empty and eos_token_id is None. Generation might not stop correctly.")
            # Attempt to use a known Qwen EOS ID if possible; otherwise generation may be problematic.
            try:
                eos_token_ids_list = [tokenizer.convert_tokens_to_ids("<|endoftext|>")]
            except KeyError:
                eos_token_ids_list = [tokenizer.vocab_size - 1 if tokenizer.vocab_size else 0]  # Very risky fallback
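    # Note: Qwen's ChatML-style template ends each assistant turn with <|im_end|>, while <|endoftext|> is the
    # document-level EOS token; passing both IDs lets generate() stop cleanly at the turn boundary.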
| print(f"Generating response for query: '{user_query}' on device: {model.device}") | |
| with torch.no_grad(): # Inference doesn't need gradient calculation | |
| outputs = model.generate( | |
| **inputs, | |
| max_new_tokens=150, | |
| pad_token_id=tokenizer.pad_token_id, | |
| eos_token_id=eos_token_ids_list if eos_token_ids_list else None, | |
| temperature=0.7, | |
| top_p=0.9, | |
| do_sample=True, | |
| num_beams=1 | |
| ) | |
| response_text = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True) | |
| print(f"Raw response: '{response_text}'") | |
| return response_text.strip() | |
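# (Hedged usage sketch, not part of the original app; SMOKE_TEST is a hypothetical env var.)
# Quick local check of get_response before wiring it into Gradio:
# if os.environ.get("SMOKE_TEST") == "1":
#     print(get_response("什么是脑血栓？"))  # "What is cerebral thrombosis?"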
# --- Gradio Interface ---
iface = gr.Interface(
    fn=get_response,
    inputs=gr.Textbox(
        lines=3,
        placeholder="请输入您关于偏瘫、脑血栓或半身不遂的问题...",  # "Enter your question about hemiplegia, cerebral thrombosis, or paralysis..."
        label="您的问题 (Your Question)",
    ),
    outputs=gr.Textbox(lines=5, label="模型回答 (Model Response)"),
    title="偏瘫脑血栓问答助手 (CPU Version - Expect Slow Response)",  # Hemiplegia & Cerebral Thrombosis Q&A Assistant
    description=(
        "由 Qwen-1.8B-Chat LoRA 微调得到的模型 (jinv2/qwen-1_8b-hemiplegia-lora)。与天算AI相关。\n"
        # "A model obtained by LoRA fine-tuning Qwen-1.8B-Chat (jinv2/qwen-1_8b-hemiplegia-lora). Related to TianSuan AI."
        "**重要：此版本运行在 CPU 上，无量化，响应会非常慢。医疗建议请咨询专业医生。**"
        # "**Important: this version runs on CPU without quantization, so responses will be very slow.
        #  For medical advice, please consult a professional doctor.**"
    ),
    examples=[
        ["偏瘫患者的早期康复锻炼有哪些？"],  # What early rehabilitation exercises are there for hemiplegia patients?
        ["什么是脑血栓？"],  # What is cerebral thrombosis?
        ["中风后如何进行语言恢复训练？"],  # How is speech recovery training done after a stroke?
    ],
    allow_flagging="never",
)
if __name__ == "__main__":
    iface.launch()  # debug=True can be helpful for local testing but not for Spaces deployment
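# (Hedged suggestion, not in the original file.) Since CPU generation is slow, enabling Gradio's request
# queue before launching can help avoid request timeouts on Spaces, e.g.:
# iface.queue().launch()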