Spaces: Runtime error

ffreemt committed · Commit 3cae1b6 · 1 Parent(s): f0fb4eb

Update buff enabled

Files changed:
- .gitignore +1 -0
- app.py +39 -26
- run-app.sh +1 -0
.gitignore
CHANGED
@@ -9,3 +9,4 @@ models
 .ruff_cache
 run-nodemon.sh
 app-.py
+nodemon.json
app.py
CHANGED
@@ -3,7 +3,7 @@
 # ruff: noqa: E501
 import os
 import time
-from dataclasses import asdict, dataclass
+from dataclasses import asdict, dataclass
 from pathlib import Path
 from types import SimpleNamespace
 
@@ -39,9 +39,9 @@ URL = "https://huggingface.co/TheBloke/Wizard-Vicuna-7B-Uncensored-GGML/raw/main
 
 url = "https://huggingface.co/savvamadar/ggml-gpt4all-j-v1.3-groovy/blob/main/ggml-gpt4all-j-v1.3-groovy.bin"
 url = "https://huggingface.co/TheBloke/Llama-2-13B-GGML/blob/main/llama-2-13b.ggmlv3.q4_K_S.bin" # 7.37G
+# url = "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/blob/main/llama-2-13b-chat.ggmlv3.q3_K_L.binhttps://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/blob/main/llama-2-13b-chat.ggmlv3.q3_K_L.bin" # 6.93G
+# url = "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/blob/main/llama-2-13b-chat.ggmlv3.q3_K_L.binhttps://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/blob/main/llama-2-13b-chat.ggmlv3.q4_K_M.bin" # 7.87G
 url = "https://huggingface.co/localmodels/Llama-2-13B-Chat-ggml/blob/main/llama-2-13b-chat.ggmlv3.q4_K_S.bin" # 7.37G
-url = "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/blob/main/llama-2-13b-chat.ggmlv3.q3_K_L.binhttps://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/blob/main/llama-2-13b-chat.ggmlv3.q3_K_L.bin" # 6.93G
-url = "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/blob/main/llama-2-13b-chat.ggmlv3.q3_K_L.binhttps://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/blob/main/llama-2-13b-chat.ggmlv3.q4_K_M.bin" #
 
 prompt_template="""Below is an instruction that describes a task. Write a response that appropriately completes the request.
 
@@ -50,9 +50,6 @@ prompt_template="""Below is an instruction that describes a task. Write a respon
 ### Response:
 """
 
-prompt_template_qa = """Question: {question}
-Answer: Let's work this out in a step by step way to be sure we have the right answer."""
-
 prompt_template = """System: You are a helpful,
 respectful and honest assistant. Always answer as
 helpfully as possible, while being safe. Your answers
@@ -67,9 +64,17 @@ information.
 User: {prompt}
 Assistant: """
 
-
+prompt_template = """Question: {question}
+Answer: Let's work this out in a step by step way to be sure we have the right answer."""
 
-
+_ = [elm for elm in prompt_template.splitlines() if elm.strip()]
+stop_string = [elm.split(":")[0] + ":" for elm in _][-2]
+
+try:
+    model_loc, file_size = dl_hf_model(url)
+except Exception as exc_:
+    logger.error(exc_)
+    raise SystemExit(1) from exc_
 
 logger.debug(f"{model_loc} {file_size}GB")
 
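The two list comprehensions added above derive a stop string from whatever prompt_template currently holds. As a worked illustration (not part of the commit), for the Question/Answer template they evaluate to "Question:", the marker of the next user turn; note that the stop field of GenerationConfig is commented out further down in this commit, so the value is computed but not visibly consumed in this diff.

```python
# Worked example of the stop-string derivation added above (illustration only).
prompt_template = """Question: {question}
Answer: Let's work this out in a step by step way to be sure we have the right answer."""

_ = [elm for elm in prompt_template.splitlines() if elm.strip()]
# -> ['Question: {question}',
#     "Answer: Let's work this out in a step by step way to be sure we have the right answer."]

stop_string = [elm.split(":")[0] + ":" for elm in _][-2]
# ['Question:', 'Answer:'][-2] -> 'Question:'
assert stop_string == "Question:"
```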
@@ -85,7 +90,7 @@ logger.debug(f"model_file: {_}, exists: {Path(_).exists()}")
 LLM = None
 LLM = AutoModelForCausalLM.from_pretrained(
     model_loc,
-    model_type="llama",
+    model_type="llama",
     threads=cpu_count,
 )
 
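A from_pretrained call that accepts model_type= and threads= and loads a local GGML .bin points at the ctransformers AutoModelForCausalLM rather than the transformers class of the same name; the import sits outside this hunk, so treat the following as a sketch under that assumption, with a placeholder model path.

```python
# Sketch only: assumes ctransformers provides AutoModelForCausalLM here.
from ctransformers import AutoModelForCausalLM

llm = AutoModelForCausalLM.from_pretrained(
    "llama-2-13b-chat.ggmlv3.q4_K_S.bin",  # placeholder local GGML file
    model_type="llama",  # architecture hint for the GGML backend
    threads=4,           # CPU threads to use for inference
)
print(llm("User: Hi\nAssistant:", max_new_tokens=16))
```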
@@ -100,7 +105,7 @@ except Exception:
 
 ns = SimpleNamespace(
     response="",
-    generator=[],
+    generator=(_ for _ in []),
 )
 
 
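Swapping the placeholder from [] to (_ for _ in []) keeps ns.generator the same kind of object that generate() returns when streaming, so consumers can always iterate it; a minimal check:

```python
from types import SimpleNamespace

ns = SimpleNamespace(response="", generator=(_ for _ in []))

# Iterating the placeholder is a harmless no-op, like an exhausted stream.
assert list(ns.generator) == []
```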
@@ -115,17 +120,17 @@ class GenerationConfig:
     reset: bool = False
     stream: bool = True
     threads: int = cpu_count
-    stop: list[str] = field(default_factory=lambda: [stop_string])
+    # stop: list[str] = field(default_factory=lambda: [stop_string])
 
 
 def generate(
-
-    llm
+    question: str,
+    llm=LLM,
     generation_config: GenerationConfig = GenerationConfig(),
 ):
     """Run model inference, will return a Generator if streaming is true."""
     # if not user_prompt.strip():
-    _ = prompt_template.format(
+    _ = prompt_template.format(question=question)
     print(_)
     return llm(
         _,
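With the new signature, callers pass the question positionally and may override the model handle or the decoding settings. The rest of the llm(...) call is truncated in this view, so the sketch below only uses GenerationConfig fields visible elsewhere in the diff (max_new_tokens, seed, stream, threads) and should be read as an assumption about how the function is driven, not a quote from app.py.

```python
# Hedged usage sketch of the new generate() signature (not code from the commit).
config = GenerationConfig(max_new_tokens=256, seed=42)  # stream=True by default
generator = generate("What is the capital of France?", llm=LLM, generation_config=config)

# With stream=True the return value is a token generator, as predict() assumes below.
for word in generator:
    print(word, end="", flush=True)
```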
@@ -210,13 +215,13 @@ def predict(prompt, bot):
         for word in generator:
             # record first response time
             if flag:
-
+                fisrt_arr = f"{time.time() - then:.1f}s"
+                logger.debug(f"\t 1st arrival: {fisrt_arr}")
                 flag = 0
-
-            print(word, flush=True) # vertical stream
+            print(word, end="", flush=True)
+            # print(word, flush=True) # vertical stream
             response += word
-            ns.response = response
-            buff.update(value=response)
+            ns.response = f"({fisrt_arr}){response}"
         print("")
         logger.debug(f"{response=}")
     except Exception as exc:
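The dropped buff.update(value=response) call could never reach the browser: in the Gradio 3.x API used here, Component.update(...) only builds an update payload that has to be returned from an event handler, so invoking it inside the loop was a no-op. The replacement writes the partial text (prefixed with the time-to-first-token) into the module-level ns namespace, and a polling callback wired at the bottom of the file copies it into the buff Textbox. A stripped-down sketch of the producer side, with names loosely mirroring app.py:

```python
# Minimal sketch of the streaming producer pattern used in predict() (illustration only).
import time

def stream_into_namespace(generator, ns):
    then = time.time()
    response = ""
    flag = 1
    for word in generator:
        if flag:
            first_arr = f"{time.time() - then:.1f}s"  # time to first token
            flag = 0
        response += word
        ns.response = f"({first_arr}){response}"  # picked up by the polled Textbox
    return response
```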
@@ -229,7 +234,7 @@ def predict(prompt, bot):
         f"{atime.duration/(len(prompt) + len(response)):.1f}s/char)"  # type: ignore
     )
 
-    bot.append([prompt, f"{response} {_}"])
+    bot.append([prompt, f"{response} \n{_}"])
 
     return prompt, bot
 
@@ -247,9 +252,9 @@ def predict_api(prompt):
         max_new_tokens=512,  # adjust as needed
         seed=42,
         reset=False,  # reset history (cache)
-        stream=True,
+        stream=True,
         threads=cpu_count,
-        stop=prompt_prefix[1:2],
+        # stop=prompt_prefix[1:2],
     )
 
     generator = generate(
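predict_api is exposed later in the file with api_name="api", so once the Space runs it can be driven from Python via gradio_client; the Space id below is a placeholder, not something stated in the commit.

```python
# Hedged sketch: calling the api endpoint of the running Space from a client.
from gradio_client import Client

client = Client("owner/space-name")  # placeholder Space id
result = client.predict(
    "What is the capital of France?",  # prompt
    api_name="/api",
)
print(result)
```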
@@ -274,6 +279,10 @@ def predict_api(prompt):
     return response
 
 
+def update_buff():
+    return ns.response
+
+
 css = """
 .importantButton {
     background: linear-gradient(45deg, #7e0570,#5d1c99, #6e00ff) !important;
@@ -320,8 +329,9 @@ examples = [
     ["Erkläre die Handlung von Cinderella in einem Satz. Auf Deutsch"],
 ]
 
+logger.info("start block")
+
 with gr.Blocks(
-    # title="mpt-30b-chat-ggml",
     title=f"{Path(model_loc).name}",
     theme=gr.themes.Soft(text_size="sm", spacing_size="sm"),
     css=css,
@@ -343,7 +353,7 @@ with gr.Blocks(
 
     # chatbot = gr.Chatbot().style(height=700)  # 500
     chatbot = gr.Chatbot(height=500)
-    buff = gr.Textbox(show_label=False, visible=
+    buff = gr.Textbox(show_label=False, visible=True)
     with gr.Row():
         with gr.Column(scale=5):
             msg = gr.Textbox(
@@ -359,12 +369,13 @@ with gr.Blocks(
     with gr.Row(visible=False):
         with gr.Accordion("Advanced Options:", open=False):
             with gr.Row():
-                with gr.Column(scale=2):
+                with gr.Column(scale=2, container=False):
                     system = gr.Textbox(
                         label="System Prompt",
                         value=prompt_template,
                         show_label=False,
-                    ).style(container=False)
+                    # ).style(container=False)
+                    )
                 with gr.Column():
                     with gr.Row():
                         change = gr.Button("Change System Prompt")
@@ -445,6 +456,8 @@ with gr.Blocks(
         api_name="api",
     )
 
+    block.load(update_buff, [], buff, every=1)
+
     # concurrency_count=5, max_size=20
     # max_size=36, concurrency_count=14
     block.queue(concurrency_count=5, max_size=20).launch(debug=True)
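This last hunk is what the commit title refers to: block.load(update_buff, [], buff, every=1) re-runs update_buff once per second (every= needs the queue enabled, which block.queue(...) provides) and pushes whatever ns.response currently holds into the now-visible buff Textbox, so partial generations appear while predict() is still streaming. A self-contained miniature of the same pattern, assuming the Gradio 3.x-era API that app.py itself uses:

```python
# Self-contained miniature of the ns.response polling pattern (Gradio 3.x-era API).
import time
import threading
from types import SimpleNamespace

import gradio as gr

ns = SimpleNamespace(response="")

def slow_producer():
    # Stand-in for the streaming loop in predict(); appends a token every 0.5s.
    for i in range(30):
        ns.response += f"token{i} "
        time.sleep(0.5)

def update_buff():
    return ns.response

with gr.Blocks() as block:
    buff = gr.Textbox(show_label=False, visible=True)
    start = gr.Button("Start")
    start.click(lambda: threading.Thread(target=slow_producer, daemon=True).start(), [], [])
    block.load(update_buff, [], buff, every=1)  # poll ns.response once per second

block.queue(concurrency_count=5, max_size=20).launch()
```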
run-app.sh
ADDED
@@ -0,0 +1 @@
+nodemon -w . -x python app.py
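For context on the new script: nodemon's -w flag sets the path to watch and -x replaces the default node invocation with an arbitrary command, so this restarts `python app.py` whenever a watched file changes. The nodemon.json added to .gitignore above is presumably local watcher configuration (for example, which file extensions to monitor), kept out of the repo.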