Spaces: Runtime error

ffreemt committed · Commit 3cae1b6 · 1 Parent(s): f0fb4eb

Update buff enabled

Files changed:
- .gitignore +1 -0
- app.py +39 -26
- run-app.sh +1 -0
.gitignore
CHANGED
@@ -9,3 +9,4 @@ models
 .ruff_cache
 run-nodemon.sh
 app-.py
+nodemon.json
app.py
CHANGED
@@ -3,7 +3,7 @@
 # ruff: noqa: E501
 import os
 import time
-from dataclasses import asdict, dataclass
+from dataclasses import asdict, dataclass
 from pathlib import Path
 from types import SimpleNamespace
 
@@ -39,9 +39,9 @@ URL = "https://huggingface.co/TheBloke/Wizard-Vicuna-7B-Uncensored-GGML/raw/main
 
 url = "https://huggingface.co/savvamadar/ggml-gpt4all-j-v1.3-groovy/blob/main/ggml-gpt4all-j-v1.3-groovy.bin"
 url = "https://huggingface.co/TheBloke/Llama-2-13B-GGML/blob/main/llama-2-13b.ggmlv3.q4_K_S.bin" # 7.37G
+# url = "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/blob/main/llama-2-13b-chat.ggmlv3.q3_K_L.binhttps://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/blob/main/llama-2-13b-chat.ggmlv3.q3_K_L.bin" # 6.93G
+# url = "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/blob/main/llama-2-13b-chat.ggmlv3.q3_K_L.binhttps://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/blob/main/llama-2-13b-chat.ggmlv3.q4_K_M.bin" # 7.87G
 url = "https://huggingface.co/localmodels/Llama-2-13B-Chat-ggml/blob/main/llama-2-13b-chat.ggmlv3.q4_K_S.bin" # 7.37G
-url = "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/blob/main/llama-2-13b-chat.ggmlv3.q3_K_L.binhttps://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/blob/main/llama-2-13b-chat.ggmlv3.q3_K_L.bin" # 6.93G
-url = "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/blob/main/llama-2-13b-chat.ggmlv3.q3_K_L.binhttps://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/blob/main/llama-2-13b-chat.ggmlv3.q4_K_M.bin" #
 
 prompt_template="""Below is an instruction that describes a task. Write a response that appropriately completes the request.
 
@@ -50,9 +50,6 @@ prompt_template="""Below is an instruction that describes a task. Write a respon
 ### Response:
 """
 
-prompt_template_qa = """Question: {question}
-Answer: Let's work this out in a step by step way to be sure we have the right answer."""
-
 prompt_template = """System: You are a helpful,
 respectful and honest assistant. Always answer as
 helpfully as possible, while being safe. Your answers
@@ -67,9 +64,17 @@ information.
 User: {prompt}
 Assistant: """
 
-
+prompt_template = """Question: {question}
+Answer: Let's work this out in a step by step way to be sure we have the right answer."""
 
-
+_ = [elm for elm in prompt_template.splitlines() if elm.strip()]
+stop_string = [elm.split(":")[0] + ":" for elm in _][-2]
+
+try:
+    model_loc, file_size = dl_hf_model(url)
+except Exception as exc_:
+    logger.error(exc_)
+    raise SystemExit(1) from exc_
 
 logger.debug(f"{model_loc} {file_size}GB")
 
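The two list comprehensions added above derive a stop string from whatever prompt_template currently holds. As a worked illustration (not part of the commit), for the Question/Answer template they evaluate to "Question:", the marker of the next user turn; note that the stop field of GenerationConfig is commented out further down in this commit, so the value is computed but not visibly consumed in this diff.

```python
# Worked example of the stop-string derivation added above (illustration only).
prompt_template = """Question: {question}
Answer: Let's work this out in a step by step way to be sure we have the right answer."""

_ = [elm for elm in prompt_template.splitlines() if elm.strip()]
# -> ['Question: {question}',
#     "Answer: Let's work this out in a step by step way to be sure we have the right answer."]

stop_string = [elm.split(":")[0] + ":" for elm in _][-2]
# ['Question:', 'Answer:'][-2] -> 'Question:'
assert stop_string == "Question:"
```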
@@ -85,7 +90,7 @@ logger.debug(f"model_file: {_}, exists: {Path(_).exists()}")
 LLM = None
 LLM = AutoModelForCausalLM.from_pretrained(
     model_loc,
-    model_type="llama",
+    model_type="llama",
     threads=cpu_count,
 )
 
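A from_pretrained call that accepts model_type= and threads= and loads a local GGML .bin points at the ctransformers AutoModelForCausalLM rather than the transformers class of the same name; the import sits outside this hunk, so treat the following as a sketch under that assumption, with a placeholder model path.

```python
# Sketch only: assumes ctransformers provides AutoModelForCausalLM here.
from ctransformers import AutoModelForCausalLM

llm = AutoModelForCausalLM.from_pretrained(
    "llama-2-13b-chat.ggmlv3.q4_K_S.bin",  # placeholder local GGML file
    model_type="llama",  # architecture hint for the GGML backend
    threads=4,           # CPU threads to use for inference
)
print(llm("User: Hi\nAssistant:", max_new_tokens=16))
```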
@@ -100,7 +105,7 @@ except Exception:
 
 ns = SimpleNamespace(
     response="",
-    generator=[],
+    generator=(_ for _ in []),
 )
 
 
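Swapping the placeholder from [] to (_ for _ in []) keeps ns.generator the same kind of object that generate() returns when streaming, so consumers can always iterate it; a minimal check:

```python
from types import SimpleNamespace

ns = SimpleNamespace(response="", generator=(_ for _ in []))

# Iterating the placeholder is a harmless no-op, like an exhausted stream.
assert list(ns.generator) == []
```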
@@ -115,17 +120,17 @@ class GenerationConfig:
     reset: bool = False
     stream: bool = True
     threads: int = cpu_count
-    stop: list[str] = field(default_factory=lambda: [stop_string])
+    # stop: list[str] = field(default_factory=lambda: [stop_string])
 
 
 def generate(
-
-    llm
+    question: str,
+    llm=LLM,
     generation_config: GenerationConfig = GenerationConfig(),
 ):
     """Run model inference, will return a Generator if streaming is true."""
     # if not user_prompt.strip():
-    _ = prompt_template.format(
+    _ = prompt_template.format(question=question)
     print(_)
     return llm(
         _,
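With the new signature, callers pass the question positionally and may override the model handle or the decoding settings. The rest of the llm(...) call is truncated in this view, so the sketch below only uses GenerationConfig fields visible elsewhere in the diff (max_new_tokens, seed, stream, threads) and should be read as an assumption about how the function is driven, not a quote from app.py.

```python
# Hedged usage sketch of the new generate() signature (not code from the commit).
config = GenerationConfig(max_new_tokens=256, seed=42)  # stream=True by default
generator = generate("What is the capital of France?", llm=LLM, generation_config=config)

# With stream=True the return value is a token generator, as predict() assumes below.
for word in generator:
    print(word, end="", flush=True)
```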
@@ -210,13 +215,13 @@ def predict(prompt, bot):
         for word in generator:
             # record first response time
             if flag:
-
+                fisrt_arr = f"{time.time() - then:.1f}s"
+                logger.debug(f"\t 1st arrival: {fisrt_arr}")
                 flag = 0
-
-            print(word, flush=True) # vertical stream
+            print(word, end="", flush=True)
+            # print(word, flush=True) # vertical stream
             response += word
-            ns.response = response
-            buff.update(value=response)
+            ns.response = f"({fisrt_arr}){response}"
         print("")
         logger.debug(f"{response=}")
     except Exception as exc:
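The dropped buff.update(value=response) call could never reach the browser: in the Gradio 3.x API used here, Component.update(...) only builds an update payload that has to be returned from an event handler, so invoking it inside the loop was a no-op. The replacement writes the partial text (prefixed with the time-to-first-token) into the module-level ns namespace, and a polling callback wired at the bottom of the file copies it into the buff Textbox. A stripped-down sketch of the producer side, with names loosely mirroring app.py:

```python
# Minimal sketch of the streaming producer pattern used in predict() (illustration only).
import time

def stream_into_namespace(generator, ns):
    then = time.time()
    response = ""
    flag = 1
    for word in generator:
        if flag:
            first_arr = f"{time.time() - then:.1f}s"  # time to first token
            flag = 0
        response += word
        ns.response = f"({first_arr}){response}"  # picked up by the polled Textbox
    return response
```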
@@ -229,7 +234,7 @@ def predict(prompt, bot):
         f"{atime.duration/(len(prompt) + len(response)):.1f}s/char)"  # type: ignore
     )
 
-    bot.append([prompt, f"{response} {_}"])
+    bot.append([prompt, f"{response} \n{_}"])
 
     return prompt, bot
 
@@ -247,9 +252,9 @@ def predict_api(prompt):
         max_new_tokens=512,  # adjust as needed
         seed=42,
         reset=False,  # reset history (cache)
-        stream=True,
+        stream=True,
         threads=cpu_count,
-        stop=prompt_prefix[1:2],
+        # stop=prompt_prefix[1:2],
     )
 
     generator = generate(
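predict_api is exposed later in the file with api_name="api", so once the Space runs it can be driven from Python via gradio_client; the Space id below is a placeholder, not something stated in the commit.

```python
# Hedged sketch: calling the api endpoint of the running Space from a client.
from gradio_client import Client

client = Client("owner/space-name")  # placeholder Space id
result = client.predict(
    "What is the capital of France?",  # prompt
    api_name="/api",
)
print(result)
```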
@@ -274,6 +279,10 @@ def predict_api(prompt):
     return response
 
 
+def update_buff():
+    return ns.response
+
+
 css = """
 .importantButton {
     background: linear-gradient(45deg, #7e0570,#5d1c99, #6e00ff) !important;
@@ -320,8 +329,9 @@ examples = [
     ["Erkläre die Handlung von Cinderella in einem Satz. Auf Deutsch"],
 ]
 
+logger.info("start block")
+
 with gr.Blocks(
-    # title="mpt-30b-chat-ggml",
     title=f"{Path(model_loc).name}",
     theme=gr.themes.Soft(text_size="sm", spacing_size="sm"),
     css=css,
@@ -343,7 +353,7 @@ with gr.Blocks(
 
     # chatbot = gr.Chatbot().style(height=700)  # 500
     chatbot = gr.Chatbot(height=500)
-    buff = gr.Textbox(show_label=False, visible=
+    buff = gr.Textbox(show_label=False, visible=True)
     with gr.Row():
         with gr.Column(scale=5):
             msg = gr.Textbox(
@@ -359,12 +369,13 @@ with gr.Blocks(
     with gr.Row(visible=False):
         with gr.Accordion("Advanced Options:", open=False):
             with gr.Row():
-                with gr.Column(scale=2):
+                with gr.Column(scale=2, container=False):
                     system = gr.Textbox(
                         label="System Prompt",
                         value=prompt_template,
                         show_label=False,
-                    ).style(container=False)
+                    # ).style(container=False)
+                    )
                 with gr.Column():
                     with gr.Row():
                         change = gr.Button("Change System Prompt")
@@ -445,6 +456,8 @@ with gr.Blocks(
         api_name="api",
     )
 
+    block.load(update_buff, [], buff, every=1)
+
     # concurrency_count=5, max_size=20
     # max_size=36, concurrency_count=14
     block.queue(concurrency_count=5, max_size=20).launch(debug=True)
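This last hunk is what the commit title refers to: block.load(update_buff, [], buff, every=1) re-runs update_buff once per second (every= needs the queue enabled, which block.queue(...) provides) and pushes whatever ns.response currently holds into the now-visible buff Textbox, so partial generations appear while predict() is still streaming. A self-contained miniature of the same pattern, assuming the Gradio 3.x-era API that app.py itself uses:

```python
# Self-contained miniature of the ns.response polling pattern (Gradio 3.x-era API).
import time
import threading
from types import SimpleNamespace

import gradio as gr

ns = SimpleNamespace(response="")

def slow_producer():
    # Stand-in for the streaming loop in predict(); appends a token every 0.5s.
    for i in range(30):
        ns.response += f"token{i} "
        time.sleep(0.5)

def update_buff():
    return ns.response

with gr.Blocks() as block:
    buff = gr.Textbox(show_label=False, visible=True)
    start = gr.Button("Start")
    start.click(lambda: threading.Thread(target=slow_producer, daemon=True).start(), [], [])
    block.load(update_buff, [], buff, every=1)  # poll ns.response once per second

block.queue(concurrency_count=5, max_size=20).launch()
```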
run-app.sh
ADDED
@@ -0,0 +1 @@
+nodemon -w . -x python app.py
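For context on the new script: nodemon's -w flag sets the path to watch and -x replaces the default node invocation with an arbitrary command, so this restarts `python app.py` whenever a watched file changes. The nodemon.json added to .gitignore above is presumably local watcher configuration (for example, which file extensions to monitor), kept out of the repo.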