import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer  # BitsAndBytesConfig removed, as we are not quantizing on CPU
from peft import PeftModel
import torch
import os # Ensure os is imported for potential path joining if needed

# --- Configuration ---
base_model_id = "Qwen/Qwen-1_8B-Chat"
lora_adapter_id = "jinv2/qwen-1_8b-hemiplegia-lora" # Your HF Model ID
# device = "cuda" if torch.cuda.is_available() else "cpu" # Will always be "cpu" on free tier
device = "cpu" # Explicitly set to CPU for this configuration
print(f"Using device: {device}")

# --- Load Model and Tokenizer ---
print("Loading tokenizer...")
try:
    # Try loading tokenizer from your LoRA repo first, as it might contain specific settings
    tokenizer = AutoTokenizer.from_pretrained(lora_adapter_id, trust_remote_code=True)
    print(f"Successfully loaded tokenizer from {lora_adapter_id}.")
except Exception as e_lora_tok:
    print(f"Could not load tokenizer from {lora_adapter_id} (Error: {e_lora_tok}), falling back to {base_model_id}.")
    tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)

# Set pad_token if not already set
if tokenizer.pad_token_id is None:
    if tokenizer.eos_token_id is not None:
        tokenizer.pad_token_id = tokenizer.eos_token_id
        tokenizer.pad_token = tokenizer.eos_token
        print(f"Set tokenizer.pad_token_id to eos_token_id: {tokenizer.pad_token_id}")
    else:
        # Fallback for Qwen; make sure this token exists for the Qwen version in use
        try:
            qwen_eos_id = tokenizer.convert_tokens_to_ids("<|endoftext|>")
            tokenizer.pad_token_id = qwen_eos_id
            tokenizer.pad_token = "<|endoftext|>"
            print(f"Set tokenizer.pad_token_id to ID of '<|endoftext|>: {tokenizer.pad_token_id}")
        except KeyError:
            tokenizer.pad_token_id = 0 # Absolute fallback, very risky
            tokenizer.pad_token = tokenizer.decode([0])
            print(f"CRITICAL WARNING: Could not set pad_token_id reliably. Set to 0 ('{tokenizer.pad_token}').")

tokenizer.padding_side = "left" # Important for generation
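# (Decoder-only models append generated tokens on the right, so left padding keeps the
#  prompt tokens contiguous with the tokens being generated.)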

print("Loading base model (NO QUANTIZATION as running on CPU)...")
# IMPORTANT: bitsandbytes 4-bit quantization requires a GPU, so it cannot be used here on CPU.
# We therefore load the model in full float32 precision (bfloat16 may work on newer CPUs if memory allows).
# This is much slower and more memory-intensive than a quantized GPU setup.
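# If this Space were later moved to a GPU, a 4-bit load could look roughly like the
# sketch below (not exercised here; assumes bitsandbytes is installed in the image):
#
#   from transformers import BitsAndBytesConfig
#   bnb_config = BitsAndBytesConfig(
#       load_in_4bit=True,
#       bnb_4bit_quant_type="nf4",
#       bnb_4bit_compute_dtype=torch.float16,
#   )
#   base_model = AutoModelForCausalLM.from_pretrained(
#       base_model_id,
#       trust_remote_code=True,
#       quantization_config=bnb_config,
#       device_map="auto",
#   )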
try:
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_id,
        trust_remote_code=True,
        torch_dtype=torch.float32,  # Use float32 for CPU for max compatibility, bfloat16 might work on some newer CPUs
        # device_map="auto" will likely map to CPU. Can be explicit: device_map="cpu"
        device_map={"":device} # Ensure model parts are on the correct device
    )
    print("Base model loaded.")
except Exception as e_load_model:
    print(f"Error loading base model: {e_load_model}")
    raise # Re-raise the exception to stop the app if model loading fails

print(f"Loading LoRA adapter: {lora_adapter_id}...")
try:
    # PEFT works on CPU as well; the base model should already be on the CPU before the adapter is applied.
    model = PeftModel.from_pretrained(base_model, lora_adapter_id)
    model.eval() # Set to evaluation mode
    model = model.to(device) # Ensure the final PEFT model is on the CPU
    print("LoRA adapter loaded and model is on CPU, ready for inference.")
except Exception as e_load_adapter:
    print(f"Error loading LoRA adapter: {e_load_adapter}")
    raise


# --- Prediction Function ---
def get_response(user_query):
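    """Generate a reply to a single user question.

    Builds a ChatML-style prompt (the <|im_start|>/<|im_end|> markers used by
    Qwen-Chat) around a fixed Chinese system prompt, roughly: "You are a medical
    Q&A assistant focused on hemiplegia, cerebral thrombosis, and post-stroke
    paralysis." The response is then sampled from the CPU-resident model.
    """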
    system_prompt_content = "ไฝ ๆ˜ฏไธ€ไธชไธ“ๆณจไบŽๅ็˜ซใ€่„‘่ก€ๆ “ใ€ๅŠ่บซไธ้‚้ข†ๅŸŸ็š„ๅŒป็–—้—ฎ็ญ”ๅŠฉๆ‰‹ใ€‚"
    
    prompt = f"<|im_start|>system\n{system_prompt_content}<|im_end|>\n<|im_start|>user\n{user_query}<|im_end|>\n<|im_start|>assistant\n"
    
    # Ensure inputs are on the same device as the model
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512-150).to(model.device)
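    # (max_length of 512-150 keeps the prompt short enough that the prompt plus up to
    #  150 newly generated tokens fits within a 512-token budget.)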

    eos_token_ids_list = []
    if isinstance(tokenizer.eos_token_id, int):
        eos_token_ids_list.append(tokenizer.eos_token_id)
    try:
        im_end_token_id = tokenizer.convert_tokens_to_ids("<|im_end|>")
        if im_end_token_id not in eos_token_ids_list:
            eos_token_ids_list.append(im_end_token_id)
    except KeyError:
        pass
    
    # Fallback if eos_token_ids_list is still empty
    if not eos_token_ids_list:
        if tokenizer.eos_token_id is not None:
             eos_token_ids_list = [tokenizer.eos_token_id]
        else:
            print("Warning: EOS token ID list is empty and eos_token_id is None. Generation might not stop correctly.")
            # Attempt to use a known Qwen EOS ID if possible, otherwise generation might be problematic.
            try:
                eos_token_ids_list = [tokenizer.convert_tokens_to_ids("<|endoftext|>")]
            except KeyError:
                eos_token_ids_list = [tokenizer.vocab_size - 1 if tokenizer.vocab_size else 0] # Very risky fallback


    print(f"Generating response for query: '{user_query}' on device: {model.device}")
    with torch.no_grad(): # Inference doesn't need gradient calculation
        outputs = model.generate(
            **inputs,
            max_new_tokens=150,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=eos_token_ids_list if eos_token_ids_list else None,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            num_beams=1
        )
    
    response_text = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
    print(f"Raw response: '{response_text}'")
    return response_text.strip()

# --- Gradio Interface ---
iface = gr.Interface(
    fn=get_response,
    inputs=gr.Textbox(lines=3, placeholder="่ฏท่พ“ๅ…ฅๆ‚จๅ…ณไบŽๅ็˜ซใ€่„‘่ก€ๆ “ๆˆ–ๅŠ่บซไธ้‚็š„้—ฎ้ข˜...", label="ๆ‚จ็š„้—ฎ้ข˜ (Your Question)"),
    outputs=gr.Textbox(lines=5, label="ๆจกๅž‹ๅ›ž็ญ” (Model Response)"),
    title="ๅ็˜ซ่„‘่ก€ๆ “้—ฎ็ญ”ๅŠฉๆ‰‹ (CPU Version - Expect Slow Response)",
    description=(
        "็”ฑ Qwen-1.8B-Chat LoRA ๅพฎ่ฐƒๅพ—ๅˆฐ็š„ๆจกๅž‹ (jinv2/qwen-1_8b-hemiplegia-lora)ใ€‚ไธŽๅคฉ็ฎ—AI็›ธๅ…ณใ€‚\n"
        "**้‡่ฆ๏ผšๆญค็‰ˆๆœฌ่ฟ่กŒๅœจ CPU ไธŠ๏ผŒๆ— ้‡ๅŒ–๏ผŒๅ“ๅบ”ไผš้žๅธธๆ…ขใ€‚ๅŒป็–—ๅปบ่ฎฎ่ฏทๅ’จ่ฏขไธ“ไธšๅŒป็”Ÿใ€‚**"
    ),
    examples=[
        ["ๅ็˜ซๆ‚ฃ่€…็š„ๆ—ฉๆœŸๅบทๅค้”ป็‚ผๆœ‰ๅ“ชไบ›๏ผŸ"],
        ["ไป€ไนˆๆ˜ฏ่„‘่ก€ๆ “๏ผŸ"],
        ["ไธญ้ฃŽๅŽๅฆ‚ไฝ•่ฟ›่กŒ่ฏญ่จ€ๆขๅค่ฎญ็ปƒ๏ผŸ"]
    ],
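    # The example questions above ask about early rehabilitation exercises for
    # hemiplegia patients, what cerebral thrombosis is, and speech-recovery
    # training after a stroke.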
    allow_flagging="never"
)
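# Note: newer Gradio releases deprecate allow_flagging in favor of flagging_mode;
# adjust the argument above if the Space's Gradio version warns about it.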

if __name__ == "__main__":
    iface.launch() # debug=True can be helpful for local testing but not for Spaces deployment
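    # Optional local sanity check before deploying (CPU inference is slow), e.g.:
    # print(get_response("ไป€ไนˆๆ˜ฏ่„‘่ก€ๆ “๏ผŸ"))  # "What is cerebral thrombosis?"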