Spaces · Sleeping

ffreemt committed
Commit 9314f7d · 1 Parent(s): 896d8df

Update refactor

Files changed:
- .flake8 +21 -0
- .gitignore +1 -0
- app.py +125 -29
.flake8 ADDED

```diff
@@ -0,0 +1,21 @@
+[flake8]
+ignore =
+    # E203 whitespace before ':'
+    E203
+    D203
+    # line too long
+    E501
+per-file-ignores =
+    # imported but unused
+    # __init__.py: F401
+    test_*.py: F401
+exclude =
+    .git
+    __pycache__
+    docs/source/conf.py
+    old
+    build
+    dist
+    .venv
+    pad*.py app-.py
+max-complexity = 25
```
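As an aside (not part of the commit): with this config, E203 (whitespace before `:`), D203, and E501 (line too long) are ignored everywhere, F401 (imported but unused) only in files matching `test_*.py`, and McCabe complexity is capped at 25. A hypothetical snippet showing two checks the config silences:

```python
# hypothetical.py -- illustrative only, not a file from this repo.
items = [1, 2, 3, 4]
tail = items[1 :]  # E203 (whitespace before ':') -- ignored globally by .flake8

# E501 (line too long) is also ignored, so over-length lines like the pylint
# disable comment added to app.py below pass without complaint.
```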
.gitignore ADDED

```diff
@@ -0,0 +1 @@
+call-activate.bat
```
app.py CHANGED

```diff
@@ -1,14 +1,19 @@
 """Run codes"""
+# pylint: disable=line-too-long, broad-exception-caught, invalid-name, missing-function-docstring, too-many-instance-attributes, missing-class-docstring
 # import gradio
 
 # gradio.load("models/WizardLM/WizardCoder-15B-V1.0").launch()
+
 import os
 import time
+from dataclasses import asdict, dataclass
 from types import SimpleNamespace
 
 import gradio as gr
 from about_time import about_time
-
+
+# from ctransformers import AutoConfig, AutoModelForCausalLM
+from ctransformers import AutoModelForCausalLM
 from huggingface_hub import hf_hub_download
 from loguru import logger
 
```
```diff
@@ -24,6 +29,11 @@ ns = SimpleNamespace(
     generator=[],
 )
 
+default_system_prompt = "A conversation between a user and an LLM-based AI assistant named Local Assistant. Local Assistant gives helpful and honest answers."
+
+user_prefix = "[user]: "
+assistant_prefix = "[assistant]: "
+
 
 def predict(prompt, bot):
     # logger.debug(f"{prompt=}, {bot=}, {timeout=}")
```
```diff
@@ -33,7 +43,12 @@ def predict(prompt, bot):
     with about_time() as atime:  # type: ignore
         try:
             # user_prompt = prompt
-            generator = generate(
+            generator = generate(
+                LLM,
+                GENERATION_CONFIG,
+                system_prompt=default_system_prompt,
+                user_prompt=prompt.strip(),
+            )
             print(assistant_prefix, end=" ", flush=True)
 
             response = ""
```
```diff
@@ -67,7 +82,21 @@ def predict_api(prompt):
     ns.response = ""
     try:
         # user_prompt = prompt
-
+        _ = GenerationConfig(
+            temperature=0.2,
+            top_k=0,
+            top_p=0.9,
+            repetition_penalty=1.0,
+            max_new_tokens=512,  # adjust as needed
+            seed=42,
+            reset=False,  # reset history (cache)
+            stream=False,  # streaming per word/token
+            threads=os.cpu_count() // 2,  # type: ignore # adjust for your CPU
+            stop=["<|im_end|>", "|<"],
+        )
+        generator = generate(
+            LLM, _, system_prompt=default_system_prompt, user_prompt=prompt.strip()
+        )
         print(assistant_prefix, end=" ", flush=True)
 
         response = ""
```
```diff
@@ -98,6 +127,50 @@ def download_quant(destination_folder: str, repo_id: str, model_filename: str):
     )
 
 
+@dataclass
+class GenerationConfig:
+    temperature: float
+    top_k: int
+    top_p: float
+    repetition_penalty: float
+    max_new_tokens: int
+    seed: int
+    reset: bool
+    stream: bool
+    threads: int
+    stop: list[str]
+
+
+def format_prompt(system_prompt: str, user_prompt: str):
+    """Format prompt based on: https://huggingface.co/spaces/mosaicml/mpt-30b-chat/blob/main/app.py.
+
+    May need to be modified for WizardCoder: TODO
+    """
+
+    system_prompt = f"<|im_start|>system\n{system_prompt}<|im_end|>\n"
+    user_prompt = f"<|im_start|>user\n{user_prompt}<|im_end|>\n"
+    assistant_prompt = "<|im_start|>assistant\n"
+
+    return f"{system_prompt}{user_prompt}{assistant_prompt}"
+
+
+def generate(
+    llm: AutoModelForCausalLM,
+    generation_config: GenerationConfig,
+    system_prompt: str = default_system_prompt,
+    user_prompt: str = "",
+):
+    """Run model inference, will return a Generator if streaming is true"""
+    # if not user_prompt.strip():
+    return llm(
+        format_prompt(
+            system_prompt,
+            user_prompt,
+        ),
+        **asdict(generation_config),
+    )
+
+
 logger.info("start dl")
 _ = """full url: https://huggingface.co/TheBloke/mpt-30B-chat-GGML/blob/main/mpt-30b-chat.ggmlv0.q4_1.bin"""
 
```
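The template added above is ChatML-style, using the same `<|im_start|>`/`<|im_end|>` markers that the `stop` list targets. A quick illustration (not part of the commit) of the string `format_prompt` assembles, reproduced standalone:

```python
# Illustrative only: the prompt format_prompt() builds for a sample exchange.
system_prompt = "<|im_start|>system\nYou are helpful.<|im_end|>\n"
user_prompt = "<|im_start|>user\nhi<|im_end|>\n"
assistant_prompt = "<|im_start|>assistant\n"
print(f"{system_prompt}{user_prompt}{assistant_prompt}")
# <|im_start|>system
# You are helpful.<|im_end|>
# <|im_start|>user
# hi<|im_end|>
# <|im_start|>assistant
```

`generate` then hands this string to the ctransformers model, expanding `GenerationConfig` into keyword arguments via `asdict`.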
```diff
@@ -109,16 +182,16 @@ mpt-30b-chat.ggmlv0.q5_0.bin q5_0 5 20.60 GB 23.10 GB
 mpt-30b-chat.ggmlv0.q5_1.bin q5_1 5 22.47 GB 24.97 GB
 mpt-30b-chat.ggmlv0.q8_0.bin q8_0 8 31.83 GB 34.33 GB
 """
-
-
-
-
+MODEL_FILENAME = "mpt-30b-chat.ggmlv0.q4_1.bin"
+MODEL_FILENAME = "WizardCoder-15B-1.0.ggmlv3.q4_0.bin"  # 10.7G
+MODEL_FILENAME = "WizardCoder-15B-1.0.ggmlv3.q4_1.bin"  # 11.9G
+DESTINATION_FOLDER = "models"
+
+REPO_ID = "TheBloke/mpt-30B-chat-GGML"
+if "WizardCoder" in MODEL_FILENAME:
+    REPO_ID = "TheBloke/WizardCoder-15B-1.0-GGML"
 
-
-if "WizardCoder" in model_filename:
-    repo_id = "TheBloke/WizardCoder-15B-1.0-GGML"
-
-download_quant(destination_folder, repo_id, model_filename)
+download_quant(DESTINATION_FOLDER, REPO_ID, MODEL_FILENAME)
 
 logger.info("done dl")
 
```
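Only the last of the three successive `MODEL_FILENAME` assignments takes effect, so the q4_1 WizardCoder file is what gets fetched. `download_quant`'s body sits outside this diff's hunks; given the `hf_hub_download` import, it plausibly reduces to something like the sketch below (the exact arguments are an assumption, not confirmed by the commit):

```python
from huggingface_hub import hf_hub_download


def download_quant(destination_folder: str, repo_id: str, model_filename: str):
    # Assumed implementation: fetch one quantized GGML file into a local folder.
    # hf_hub_download downloads (or reuses a cached copy) and returns the path.
    return hf_hub_download(
        repo_id=repo_id,          # e.g. "TheBloke/WizardCoder-15B-1.0-GGML"
        filename=model_filename,  # e.g. "WizardCoder-15B-1.0.ggmlv3.q4_1.bin"
        local_dir=destination_folder,
    )
```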
```diff
@@ -131,16 +204,40 @@ logger.info("done dl")
 # )
 
 # https://huggingface.co/spaces/matthoffner/wizardcoder-ggml/blob/main/main.py
-
-
-
-
-
-
-
+_ = """
+llm = AutoModelForCausalLM.from_pretrained(
+    "TheBloke/WizardCoder-15B-1.0-GGML",
+    model_file="",
+    model_type="starcoder",
+    threads=8
+)
+# """
+if "WizardCoder" in MODEL_FILENAME:
+    LLM = AutoModelForCausalLM.from_pretrained(
+        "TheBloke/WizardCoder-15B-1.0-GGML",
+        model_file=MODEL_FILENAME,
+        model_type="starcoder",
+        threads=os.cpu_count() // 2,  # type: ignore
+    )
+LLM = AutoModelForCausalLM.from_pretrained(
+    "TheBloke/WizardCoder-15B-1.0-GGML",
+    model_file="",
+    model_type="starcoder",
+    threads=os.cpu_count() // 2  # type: ignore
+)
 
-
-
+GENERATION_CONFIG = GenerationConfig(
+    temperature=0.2,
+    top_k=0,
+    top_p=0.9,
+    repetition_penalty=1.0,
+    max_new_tokens=512,  # adjust as needed
+    seed=42,
+    reset=False,  # reset history (cache)
+    stream=True,  # streaming per word/token
+    threads=os.cpu_count() // 2,  # type: ignore # adjust for your CPU
+    stop=["<|im_end|>", "|<"],
+)
 
 css = """
 .importantButton {
```
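With `stream=True` in `GENERATION_CONFIG`, the `llm(...)` call inside `generate` yields the reply piece by piece rather than returning a single string. A sketch of how such a generator is consumed (variable names illustrative; the real loop lives in `predict`, outside this diff's hunks):

```python
# Illustrative only: consuming a streaming generator from generate().
generator = generate(LLM, GENERATION_CONFIG, user_prompt="js fizzbuzz")

response = ""
for chunk in generator:               # ctransformers yields text fragments
    print(chunk, end="", flush=True)  # echo tokens as they arrive
    response += chunk                 # accumulate the full reply
```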
```diff
@@ -157,7 +254,7 @@ css = """
 
 with gr.Blocks(
     # title="mpt-30b-chat-ggml",
-    title=f"{
+    title=f"{MODEL_FILENAME}",
     theme=gr.themes.Soft(text_size="sm", spacing_size="sm"),
     css=css,
 ) as block:
```
```diff
@@ -166,7 +263,7 @@ with gr.Blocks(
     # """<center><a href="https://huggingface.co/spaces/mikeee/mpt-30b-chat?duplicate=true"><img src="https://bit.ly/3gLdBN6" alt="Duplicate"></a> and spin a CPU UPGRADE to avoid the queue</center>"""
     # )
     gr.Markdown(
-        f"""<h4><center>{
+        f"""<h4><center>{MODEL_FILENAME}</center></h4>
 
     Most examples are meant for another model. You probably should try
     some coder-related prompts.
```
```diff
@@ -177,17 +274,17 @@ with gr.Blocks(
         """,
         elem_classes="xsmall",
     )
-
+
     chatbot = gr.Chatbot(scroll_to_output=True).style(height=700)  # 500
     buff = gr.Textbox(show_label=False)
     with gr.Row():
-        with gr.Column(scale=
+        with gr.Column(scale=4):
             msg = gr.Textbox(
                 label="Chat Message Box",
                 placeholder="Ask me anything (press Enter or click Submit to send)",
                 show_label=False,
             ).style(container=False)
-        with gr.Column(scale=
+        with gr.Column(scale=1):
             with gr.Row():
                 submit = gr.Button("Submit", elem_classes="xsmall")
                 stop = gr.Button("Stop", visible=False)
```
```diff
@@ -212,7 +309,7 @@ with gr.Blocks(
         examples=[
             ["js 判断一个数是不是质数"],
             ["js 实现python 的 range(10)"],
-            ["js 实现python 的 [*(range(10)]"],
+            ["js 实现python 的 [*(range(10)]"],
             ["Explain the plot of Cinderella in a sentence."],
             [
                 "How long does it take to become proficient in French, and what are the best methods for retaining information?"
```
```diff
@@ -244,7 +341,7 @@ with gr.Blocks(
 
     # with gr.Row():
     with gr.Accordion("Disclaimer", open=False):
-        _ = "-".join(
+        _ = "-".join(MODEL_FILENAME.split("-")[:2])
         gr.Markdown(
             f"Disclaimer: {_} can produce factually incorrect output, and should not be relied on to produce "
             "factually accurate information. {_} was trained on various public datasets; while great efforts "
```
```diff
@@ -292,4 +389,3 @@ with gr.Blocks(
 # concurrency_count=5, max_size=20
 # max_size=36, concurrency_count=14
 block.queue(concurrency_count=5, max_size=20).launch(debug=True)
-
```