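"""Gradio Space app: Qwen-1.8B-Chat with the jinv2/qwen-1_8b-hemiplegia-lora LoRA adapter,
serving hemiplegia / cerebral-thrombosis Q&A without quantization on a CPU-only Space."""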
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer # Removed BitsAndBytesConfig as we are not quantizing for CPU
from peft import PeftModel
import torch
import os # Ensure os is imported for potential path joining if needed
# --- Configuration ---
base_model_id = "Qwen/Qwen-1_8B-Chat"
lora_adapter_id = "jinv2/qwen-1_8b-hemiplegia-lora" # Your HF Model ID
# device = "cuda" if torch.cuda.is_available() else "cpu" # Will always be "cpu" on free tier
device = "cpu" # Explicitly set to CPU for this configuration
print(f"Using device: {device}")
# --- Load Model and Tokenizer ---
print("Loading tokenizer...")
try:
    # Try loading tokenizer from your LoRA repo first, as it might contain specific settings
    tokenizer = AutoTokenizer.from_pretrained(lora_adapter_id, trust_remote_code=True)
    print(f"Successfully loaded tokenizer from {lora_adapter_id}.")
except Exception as e_lora_tok:
    print(f"Could not load tokenizer from {lora_adapter_id} (Error: {e_lora_tok}), falling back to {base_model_id}.")
    tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
# Set pad_token if not already set
if tokenizer.pad_token_id is None:
    if tokenizer.eos_token_id is not None:
        tokenizer.pad_token_id = tokenizer.eos_token_id
        tokenizer.pad_token = tokenizer.eos_token
        print(f"Set tokenizer.pad_token_id to eos_token_id: {tokenizer.pad_token_id}")
    else:
        # Fallback for Qwen; make sure this token exists in your Qwen version
        try:
            qwen_eos_id = tokenizer.convert_tokens_to_ids("<|endoftext|>")
            tokenizer.pad_token_id = qwen_eos_id
            tokenizer.pad_token = "<|endoftext|>"
            print(f"Set tokenizer.pad_token_id to ID of '<|endoftext|>': {tokenizer.pad_token_id}")
        except KeyError:
            tokenizer.pad_token_id = 0  # Absolute fallback, very risky
            tokenizer.pad_token = tokenizer.decode([0])
            print(f"CRITICAL WARNING: Could not set pad_token_id reliably. Set to 0 ('{tokenizer.pad_token}').")
tokenizer.padding_side = "left"  # Important for generation with decoder-only models
print("Loading base model (NO QUANTIZATION as running on CPU)...")
# IMPORTANT: For CPU, we cannot use bitsandbytes 4-bit quantization.
# We load the model in its original precision (or try float16/bfloat16 if memory allows and the CPU supports it).
# This will be much slower and more memory-intensive.
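# For comparison only: on a GPU deployment, the same base model could be loaded in 4-bit via
# bitsandbytes roughly as sketched below. This is an illustrative sketch, not part of this CPU
# configuration, and is intentionally left commented out:
#
#   from transformers import BitsAndBytesConfig
#   bnb_config = BitsAndBytesConfig(
#       load_in_4bit=True,
#       bnb_4bit_quant_type="nf4",
#       bnb_4bit_compute_dtype=torch.float16,
#   )
#   base_model = AutoModelForCausalLM.from_pretrained(
#       base_model_id,
#       trust_remote_code=True,
#       quantization_config=bnb_config,
#       device_map="auto",
#   )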
try:
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_id,
        trust_remote_code=True,
        torch_dtype=torch.float32,  # Use float32 on CPU for max compatibility; bfloat16 might work on some newer CPUs
        device_map={"": device}  # Ensure model parts are on the correct device (equivalent to device_map="cpu")
    )
    print("Base model loaded.")
except Exception as e_load_model:
    print(f"Error loading base model: {e_load_model}")
    raise  # Re-raise the exception to stop the app if model loading fails
print(f"Loading LoRA adapter: {lora_adapter_id}...")
try:
    # PEFT also works on CPU; the base model should be on the CPU before applying the adapter
    model = PeftModel.from_pretrained(base_model, lora_adapter_id)
    model.eval()  # Set to evaluation mode
    model = model.to(device)  # Ensure the final PEFT model is on the CPU
    print("LoRA adapter loaded and model is on CPU, ready for inference.")
except Exception as e_load_adapter:
    print(f"Error loading LoRA adapter: {e_load_adapter}")
    raise
# --- Prediction Function ---
def get_response(user_query):
    # System prompt (Chinese): "You are a medical Q&A assistant specializing in hemiplegia,
    # cerebral thrombosis, and paralysis."
    system_prompt_content = "你是一个专注于偏瘫、脑血栓、半身不遂领域的医疗问答助手。"
    prompt = f"<|im_start|>system\n{system_prompt_content}<|im_end|>\n<|im_start|>user\n{user_query}<|im_end|>\n<|im_start|>assistant\n"
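    # For reference, the assembled prompt follows Qwen's ChatML turn layout (illustrative only,
    # no extra call is made here):
    #   <|im_start|>system
    #   ...system prompt...<|im_end|>
    #   <|im_start|>user
    #   ...user query...<|im_end|>
    #   <|im_start|>assistant
    # The model completes the assistant turn and should emit <|im_end|> / <|endoftext|> when done.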
    # Ensure inputs are on the same device as the model
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512 - 150).to(model.device)
    eos_token_ids_list = []
    if isinstance(tokenizer.eos_token_id, int):
        eos_token_ids_list.append(tokenizer.eos_token_id)
    try:
        im_end_token_id = tokenizer.convert_tokens_to_ids("<|im_end|>")
        if im_end_token_id not in eos_token_ids_list:
            eos_token_ids_list.append(im_end_token_id)
    except KeyError:
        pass
    # Fallback if eos_token_ids_list is still empty
    if not eos_token_ids_list:
        if tokenizer.eos_token_id is not None:
            eos_token_ids_list = [tokenizer.eos_token_id]
        else:
            print("Warning: EOS token ID list is empty and eos_token_id is None. Generation might not stop correctly.")
            # Attempt to use a known Qwen EOS token if possible, otherwise generation might be problematic.
            try:
                eos_token_ids_list = [tokenizer.convert_tokens_to_ids("<|endoftext|>")]
            except KeyError:
                eos_token_ids_list = [tokenizer.vocab_size - 1 if tokenizer.vocab_size else 0]  # Very risky fallback
    print(f"Generating response for query: '{user_query}' on device: {model.device}")
    with torch.no_grad():  # Inference doesn't need gradient calculation
        outputs = model.generate(
            **inputs,
            max_new_tokens=150,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=eos_token_ids_list if eos_token_ids_list else None,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            num_beams=1
        )
    response_text = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
    print(f"Raw response: '{response_text}'")
    return response_text.strip()
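# Optional local smoke test (illustrative only; the sample query and running this outside Spaces
# are assumptions, not part of the deployed app). Uncomment to exercise the full pipeline once
# before wiring up the UI:
# print(get_response("什么是脑血栓？"))  # "What is cerebral thrombosis?"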
# --- Gradio Interface ---
iface = gr.Interface(
    fn=get_response,
    inputs=gr.Textbox(
        lines=3,
        # Placeholder (Chinese): "Please enter your question about hemiplegia, cerebral thrombosis, or paralysis..."
        placeholder="请输入您关于偏瘫、脑血栓或半身不遂的问题...",
        label="您的问题 (Your Question)"
    ),
    outputs=gr.Textbox(lines=5, label="模型回答 (Model Response)"),
    # Title (Chinese): "Hemiplegia & Cerebral Thrombosis Q&A Assistant"
    title="偏瘫脑血栓问答助手 (CPU Version - Expect Slow Response)",
    description=(
        # EN: "A model fine-tuned from Qwen-1.8B-Chat with LoRA (jinv2/qwen-1_8b-hemiplegia-lora); related to 天算AI (TianSuan AI).
        # Important: this version runs on CPU without quantization, so responses will be very slow.
        # For medical advice, please consult a professional doctor."
        "由 Qwen-1.8B-Chat LoRA 微调得到的模型 (jinv2/qwen-1_8b-hemiplegia-lora)。与天算AI相关。\n"
        "**重要：此版本运行在 CPU 上（无量化），响应会非常慢。医疗建议请咨询专业医生。**"
    ),
    examples=[
        ["偏瘫患者的早期康复锻炼有哪些？"],  # "What early rehabilitation exercises are there for hemiplegia patients?"
        ["什么是脑血栓？"],  # "What is cerebral thrombosis?"
        ["中风后如何进行语言恢复训练？"]  # "How is speech recovery training done after a stroke?"
    ],
    allow_flagging="never"
)
if __name__ == "__main__":
    iface.launch()  # debug=True can be helpful for local testing but not for Spaces deployment