Spaces: Running on Zero
import re
import threading

import gradio as gr
import spaces  # enables ZeroGPU support when running on Hugging Face Spaces
import transformers
from transformers import pipeline

# loading model and tokenizer
model_name = "Qwen/Qwen2-1.5B-Instruct"

# gr.NO_RELOAD: don't re-run the model loading when Gradio hot-reloads the script
if gr.NO_RELOAD:
    pipe = pipeline(
        "text-generation",
        model=model_name,
        device_map="auto",
        torch_dtype="auto",
    )

# the answer marker to detect the final answer
ANSWER_MARKER = "**ANSWER**"
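# the marker is stripped from user input (see user_input below); when a reasoning
# prefix contains it, the bot switches from "thinking" to the final answer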

# the sentences that start each forced reasoning step; the last one reinserts
# the question and asks for the final answer after ANSWER_MARKER
rethink_prepends = [
    "OK, I need to figure out ",
    "I think ",
    "Wait, I think ",
    "Let me check if ",
    "I should also remember that ",
    "Another thing to note is that ",
    "I also recall that ",
    "I think I have a good grasp ",
    "Now, using all the above information, I can answer the question using the original language used for the question:"
    "\n{question}\n"
    f"\n{ANSWER_MARKER}\n",
]

# to fix some problems with math display
latex_delimiters = [
    {"left": "$$", "right": "$$", "display": True},
    {"left": "$", "right": "$", "display": False},
]
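# these delimiters are passed to gr.Chatbot below so that $$...$$ and $...$ render with KaTeX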


def reformat_math(text):
    """Fix MathJax delimiters to use the Gradio syntax (KaTeX).

    This is a workaround to display math formulas in Gradio. For now, I haven't found a way
    to make it work as expected with other latex_delimiters...
    """
    text = re.sub(r"\\\[\s*(.*?)\s*\\\]", r"$$\1$$", text, flags=re.DOTALL)
    text = re.sub(r"\\\(\s*(.*?)\s*\\\)", r"$\1$", text, flags=re.DOTALL)
    return text
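# for example: reformat_math(r"\[ x^2 \] and \( y \)") returns "$$x^2$$ and $y$"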


def user_input(message, history: list):
    """Append the user input to the history and clear the input textbox"""
    return "", history + [
        gr.ChatMessage(role="user", content=message.replace(ANSWER_MARKER, ""))
    ]


def rebuild_messages(history: list):
    """Rebuild the messages from the history to be used by the model, without the intermediate thoughts"""
    messages = []
    for h in history:
        if isinstance(h, dict) and not h.get("metadata", {}).get("title", False):
            messages.append(h)
        elif (
            isinstance(h, gr.ChatMessage)
            and h.metadata.get("title")
            and isinstance(h.content, str)
        ):
            messages.append({"role": h.role, "content": h.content})
    return messages


def bot(
    history: list,
    max_num_tokens: int,
    final_num_tokens: int,
    do_sample: bool,
    temperature: float,
):
    """Make the model answer the question"""

    # to get tokens as a stream, later in a thread
    streamer = transformers.TextIteratorStreamer(
        pipe.tokenizer,  # pyright: ignore
        skip_special_tokens=True,
        skip_prompt=True,
    )

    # to reinsert the question in the reasoning if needed
    question = history[-1]["content"]

    # prepare the assistant message
    history.append(
        gr.ChatMessage(
            role="assistant",
            content="",
            metadata={"title": "🧠 Thinking...", "status": "pending"},
        )
    )

    # for now, the reasoning is displayed in the chat
    messages = rebuild_messages(history)
    for i, prepend in enumerate(rethink_prepends):
        if i > 0:
            messages[-1]["content"] += "\n\n"
        messages[-1]["content"] += prepend.format(question=question)
        num_tokens = int(
            max_num_tokens if ANSWER_MARKER not in prepend else final_num_tokens
        )
        t = threading.Thread(
            target=pipe,
            args=(messages,),
            kwargs=dict(
                max_new_tokens=num_tokens,
                streamer=streamer,
                do_sample=do_sample,
                temperature=temperature,
            ),
        )
        t.start()

        # rebuild the history with the new content
        history[-1].content += prepend.format(question=question)
        if ANSWER_MARKER in prepend:
            history[-1].metadata = {"title": "💭 Thoughts", "status": "done"}
            # stop thinking, this is the answer now (no metadata for intermediate steps)
            history.append(gr.ChatMessage(role="assistant", content=""))
        for token in streamer:
            history[-1].content += token
            history[-1].content = reformat_math(history[-1].content)
            yield history
        t.join()

    yield history


with gr.Blocks(fill_height=True, title="Making any LLM model reasoning") as demo:
    with gr.Row(scale=1):
        with gr.Column(scale=5):
            gr.Markdown(f"""
            # Force reasoning for any LLM

            This is a simple proof of concept to get any LLM (Large Language Model) to reason ahead of its response.

            This interface uses the *{model_name}* model, **which is not a reasoning model**. The method used here
            only forces some "reasoning" steps with prefixes to help the model improve its answer.

            See my related article here: [Make any model reasoning](https://huggingface.co/blog/Metal3d/making-any-model-reasoning)
            """)
            chatbot = gr.Chatbot(
                scale=1,
                type="messages",
                latex_delimiters=latex_delimiters,
            )
            msg = gr.Textbox(
                submit_btn=True,
                label="",
                show_label=False,
                placeholder="Type your question here.",
                autofocus=True,
            )
        with gr.Column(scale=1):
            gr.Markdown("""## Tweaking""")
            num_tokens = gr.Slider(
                50,
                1024,
                100,
                step=1,
                label="Max tokens per reasoning step",
                interactive=True,
            )
            final_num_tokens = gr.Slider(
                50,
                1024,
                512,
                step=1,
                label="Max tokens for the final answer",
                interactive=True,
            )
            do_sample = gr.Checkbox(True, label="Do sample")
            temperature = gr.Slider(0.1, 1.0, 0.7, step=0.1, label="Temperature")
            gr.Markdown("""
            Using a smaller number of tokens for the reasoning steps makes the model
            answer faster, but it may not go deep enough in its reasoning.
            A good value is 100 to 512.

            Using a smaller number of tokens for the final answer makes the model
            less verbose, but it may not be able to give a complete answer.
            A good value is 512 to 1024.

            **Do sample** uses another strategy to select the next token when completing the
            answer. It's usually better to leave it checked.

            **Temperature** indicates how "creative" the model can be. 0.7 is a common value.
            If you set it too high (like 1.0), the model may become incoherent. With a low value
            (like 0.3), the model will produce very predictable answers.
            """)
            gr.Markdown("""
            This interface can run on a personal computer with 6 GB of VRAM (e.g. an NVIDIA 3050/3060 laptop GPU).

            Feel free to fork the application and try other instruct models.
            """)

    # when the user submits a message, the bot will answer
    msg.submit(
        user_input,
        [msg, chatbot],  # inputs
        [msg, chatbot],  # outputs
    ).then(
        bot,
        [
            chatbot,
            num_tokens,
            final_num_tokens,
            do_sample,
            temperature,
        ],  # actually, the "history" input
        chatbot,  # to store the new history from the output
    )

if __name__ == "__main__":
    demo.queue().launch()