import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer # Removed BitsAndBytesConfig as we are not quantizing for CPU
from peft import PeftModel
import torch
import os # Ensure os is imported for potential path joining if needed
# --- Configuration ---
base_model_id = "Qwen/Qwen-1_8B-Chat"
lora_adapter_id = "jinv2/qwen-1_8b-hemiplegia-lora" # Your HF Model ID
# device = "cuda" if torch.cuda.is_available() else "cpu" # Will always be "cpu" on free tier
device = "cpu" # Explicitly set to CPU for this configuration
print(f"Using device: {device}")
# --- Load Model and Tokenizer ---
print("Loading tokenizer...")
try:
    # Try loading the tokenizer from your LoRA repo first, as it might contain specific settings
    tokenizer = AutoTokenizer.from_pretrained(lora_adapter_id, trust_remote_code=True)
    print(f"Successfully loaded tokenizer from {lora_adapter_id}.")
except Exception as e_lora_tok:
    print(f"Could not load tokenizer from {lora_adapter_id} (Error: {e_lora_tok}), falling back to {base_model_id}.")
    tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
# Set pad_token if not already set
if tokenizer.pad_token_id is None:
    if tokenizer.eos_token_id is not None:
        tokenizer.pad_token_id = tokenizer.eos_token_id
        tokenizer.pad_token = tokenizer.eos_token
        print(f"Set tokenizer.pad_token_id to eos_token_id: {tokenizer.pad_token_id}")
    else:
        # Fallback for Qwen; make sure this token exists in your Qwen version
        try:
            qwen_eos_id = tokenizer.convert_tokens_to_ids("<|endoftext|>")
            tokenizer.pad_token_id = qwen_eos_id
            tokenizer.pad_token = "<|endoftext|>"
            print(f"Set tokenizer.pad_token_id to ID of '<|endoftext|>': {tokenizer.pad_token_id}")
        except KeyError:
            tokenizer.pad_token_id = 0  # Absolute fallback, very risky
            tokenizer.pad_token = tokenizer.decode([0])
            print(f"CRITICAL WARNING: Could not set pad_token_id reliably. Set to 0 ('{tokenizer.pad_token}').")
tokenizer.padding_side = "left" # Important for generation
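# Left padding matters for decoder-only models: generation continues from the last input token,
# so pad tokens must sit on the left rather than between the prompt and the new tokens.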
print("Loading base model (NO QUANTIZATION as running on CPU)...")
# IMPORTANT: bitsandbytes 4-bit quantization cannot be used on CPU.
# The model is loaded in float32 for maximum compatibility (float16/bfloat16 might work if memory
# allows and the CPU supports them). This is much slower and more memory-intensive than a quantized GPU setup.
try:
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_id,
        trust_remote_code=True,
        torch_dtype=torch.float32,  # float32 for max CPU compatibility; bfloat16 might work on some newer CPUs
        device_map={"": device},    # explicitly place all model parts on the CPU (device_map="cpu" would also work)
    )
    print("Base model loaded.")
except Exception as e_load_model:
    print(f"Error loading base model: {e_load_model}")
    raise  # Re-raise the exception to stop the app if model loading fails
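# Possible memory saving (assumption, untested here): newer CPUs with bf16 support can load the
# weights in torch.bfloat16 instead, roughly halving RAM at some cost in numerical accuracy:
# base_model = AutoModelForCausalLM.from_pretrained(
#     base_model_id, trust_remote_code=True, torch_dtype=torch.bfloat16, device_map={"": device}
# )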
print(f"Loading LoRA adapter: {lora_adapter_id}...")
try:
    # PEFT also works on CPU; the base model must already be on the CPU before the adapter is applied
    model = PeftModel.from_pretrained(base_model, lora_adapter_id)
    model.eval()              # set to evaluation mode
    model = model.to(device)  # ensure the final PEFT model is on the CPU
    print("LoRA adapter loaded and model is on CPU, ready for inference.")
except Exception as e_load_adapter:
    print(f"Error loading LoRA adapter: {e_load_adapter}")
    raise
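# Optional speed-up sketch (not enabled here): for pure inference, the LoRA weights can be folded
# into the base model so each forward pass skips the adapter indirection:
# model = model.merge_and_unload()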
# --- Prediction Function ---
def get_response(user_query):
    system_prompt_content = "You are a medical question-answering assistant focused on hemiplegia, cerebral thrombosis, and post-stroke paralysis."
    prompt = f"<|im_start|>system\n{system_prompt_content}<|im_end|>\n<|im_start|>user\n{user_query}<|im_end|>\n<|im_start|>assistant\n"
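    # The ChatML prompt is assembled by hand above; if the tokenizer ships a chat template,
    # an equivalent (untested here) would be:
    # prompt = tokenizer.apply_chat_template(
    #     [{"role": "system", "content": system_prompt_content},
    #      {"role": "user", "content": user_query}],
    #     tokenize=False, add_generation_prompt=True,
    # )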
    # Ensure inputs are on the same device as the model; cap the prompt so that
    # prompt length + max_new_tokens (150) stays within a 512-token budget
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512 - 150).to(model.device)
    eos_token_ids_list = []
    if isinstance(tokenizer.eos_token_id, int):
        eos_token_ids_list.append(tokenizer.eos_token_id)
    try:
        im_end_token_id = tokenizer.convert_tokens_to_ids("<|im_end|>")
        if im_end_token_id not in eos_token_ids_list:
            eos_token_ids_list.append(im_end_token_id)
    except KeyError:
        pass
    # Fallback if eos_token_ids_list is still empty
    if not eos_token_ids_list:
        if tokenizer.eos_token_id is not None:
            eos_token_ids_list = [tokenizer.eos_token_id]
        else:
            print("Warning: EOS token ID list is empty and eos_token_id is None. Generation might not stop correctly.")
            # Attempt to use a known Qwen EOS ID if possible, otherwise generation might be problematic.
            try:
                eos_token_ids_list = [tokenizer.convert_tokens_to_ids("<|endoftext|>")]
            except KeyError:
                eos_token_ids_list = [tokenizer.vocab_size - 1 if tokenizer.vocab_size else 0]  # Very risky fallback
print(f"Generating response for query: '{user_query}' on device: {model.device}")
with torch.no_grad(): # Inference doesn't need gradient calculation
outputs = model.generate(
**inputs,
max_new_tokens=150,
pad_token_id=tokenizer.pad_token_id,
eos_token_id=eos_token_ids_list if eos_token_ids_list else None,
temperature=0.7,
top_p=0.9,
do_sample=True,
num_beams=1
)
response_text = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
print(f"Raw response: '{response_text}'")
return response_text.strip()
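
# Quick smoke test outside the Gradio UI (hypothetical query; uncomment for local debugging):
# print(get_response("What are some early rehabilitation exercises for hemiplegia patients?"))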
# --- Gradio Interface ---
iface = gr.Interface(
    fn=get_response,
    inputs=gr.Textbox(lines=3, placeholder="Enter your question about hemiplegia, cerebral thrombosis, or post-stroke paralysis...", label="Your Question"),
    outputs=gr.Textbox(lines=5, label="Model Response"),
    title="Hemiplegia & Cerebral Thrombosis Q&A Assistant (CPU Version - Expect Slow Responses)",
    description=(
        "A model LoRA-fine-tuned from Qwen-1.8B-Chat (jinv2/qwen-1_8b-hemiplegia-lora). Associated with 天算AI (TianSuanAI).\n"
        "**Important: this version runs on CPU without quantization, so responses will be very slow. For medical advice, please consult a qualified doctor.**"
    ),
    examples=[
        ["What early rehabilitation exercises are there for hemiplegia patients?"],
        ["What is cerebral thrombosis?"],
        ["How should speech recovery training be carried out after a stroke?"]
    ],
    allow_flagging="never"
)
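# Optional (not enabled here): with CPU generation this slow, enabling Gradio's request queue,
# e.g. iface.queue(max_size=8), can help avoid timeouts when several users submit at once.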
if __name__ == "__main__":
    iface.launch()  # debug=True can be helpful for local testing but not for Spaces deployment