Spaces: Sleeping
chrisx599 committed
Commit 214a439
Parent(s): ae3cc24
first commit
Browse files
- .gitattributes +1 -0
- .gitignore +3 -0
- app.py +348 -0
- requirements.txt +83 -0
- serve/__init__.py +0 -0
- serve/__pycache__/__init__.cpython-310.pyc +0 -0
- serve/__pycache__/chat_utils.cpython-310.pyc +0 -0
- serve/__pycache__/examples.cpython-310.pyc +0 -0
- serve/__pycache__/frontend.cpython-310.pyc +0 -0
- serve/__pycache__/gradio_utils.cpython-310.pyc +0 -0
- serve/__pycache__/utils.cpython-310.pyc +0 -0
- serve/assets/Kelpy-Codos.js +100 -0
- serve/assets/avatar.png +0 -0
- serve/assets/custom.css +355 -0
- serve/assets/custom.js +22 -0
- serve/assets/favicon.ico +0 -0
- serve/chat_utils.py +497 -0
- serve/examples.py +73 -0
- serve/frontend.py +126 -0
- serve/gradio_utils.py +95 -0
- serve/inference.py +268 -0
- serve/utils.py +290 -0
- videos/demo1.mp4 +3 -0
- videos/demo2.mp4 +3 -0
.gitattributes
ADDED
@@ -0,0 +1 @@
videos/*.mp4 filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,3 @@
logs
effi.py
wo_effi.py
app.py
ADDED
@@ -0,0 +1,348 @@
import argparse
import gradio as gr
import os
from PIL import Image
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

from serve.frontend import reload_javascript
from serve.utils import (
    configure_logger,
)
from serve.gradio_utils import (
    cancel_outputing,
    delete_last_conversation,
    reset_state,
    reset_textbox,
    transfer_input,
    wrap_gen_fn,
)
from serve.chat_utils import compress_video_to_base64
from serve.examples import get_examples

import logging

TITLE = """<h1 align="left" style="min-width:200px; margin-top:0;">Chat with Video-XL-2 </h1>"""
DESCRIPTION_TOP = """<a href="https://unabletousegit.github.io/video-xl2.github.io" target="_blank">Video-XL-2</a>, a better, faster, and high-frame-count model for long video understanding."""
DESCRIPTION = """"""
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
DEPLOY_MODELS = dict()
logger = configure_logger()
DEFAULT_IMAGE_TOKEN = "<image>"


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, default="Video-XL-2")
    parser.add_argument(
        "--local-path",
        type=str,
        default="/share/project/minghao/Share_1/Models/Video-XL-2",
        help="huggingface ckpt, optional",
    )
    parser.add_argument("--ip", type=str, default="0.0.0.0")
    parser.add_argument("--port", type=int, default=7860)
    return parser.parse_args()


def fetch_model(model_name: str):
    global DEPLOY_MODELS

    local_model_path = '/share/project/minghao/Share_1/Models/Video-XL-2'

    if model_name in DEPLOY_MODELS:
        model_info = DEPLOY_MODELS[model_name]
        print(f"{model_name} has been loaded.")
    else:
        print(f"{model_name} is loading...")
        device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
        tokenizer = AutoTokenizer.from_pretrained(local_model_path, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(
            local_model_path,
            trust_remote_code=True,
            device_map=device,
            quantization_config=None,
            attn_implementation="sdpa",
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True
        )
        DEPLOY_MODELS[model_name] = (model, tokenizer)
        print(f"Load {model_name} successfully...")
        model_info = DEPLOY_MODELS[model_name]

    return model_info


def preview_images(files) -> list[str]:
    if files is None:
        return []

    image_paths = []
    for file in files:
        image_paths.append(file.name)
    return image_paths


@wrap_gen_fn
def predict(
    text,
    images,
    chatbot,
    history,
    top_p,
    temperature,
    max_generate_length,
    max_context_length_tokens,
    video_nframes,
    chunk_size: int = 512,
):
    """
    Predict the response for the input text and images.
    Args:
        text (str): The input text.
        images (list[PIL.Image.Image]): The input images.
        chatbot (list): The chatbot.
        history (dict): The history (video path and chat context).
        top_p (float): The top-p value.
        temperature (float): The temperature value.
        max_generate_length (int): The max number of generated tokens.
        max_context_length_tokens (int): The max context length in tokens.
        video_nframes (int): The max number of video frames to sample.
        chunk_size (int): The chunk size.
    """

    if images is None:
        pil_images = history["video_path"]
    else:
        pil_images = images[0].name

    print("running the prediction function")
    try:
        logger.info("fetching model")
        model, tokenizer = fetch_model(args.model)
        logger.info("model fetched")
        if text == "":
            yield chatbot, history, "Empty context."
            return
    except KeyError:
        logger.info("no model found")
        yield [[text, "No Model Found"]], [], "No Model Found"
        return

    gen_kwargs = {
        "do_sample": True if temperature > 1e-2 else False,
        "temperature": temperature,
        "top_p": top_p,
        "num_beams": 1,
        "use_cache": True,
        "max_new_tokens": max_generate_length,
    }

    # Check if this is the very first turn with an image
    is_first_image_turn = (len(history) == 0 and pil_images)
    if is_first_image_turn:
        history["video_path"] = pil_images
        history["context"] = None

    response, temp_history = model.chat(
        history["video_path"] if "video_path" in history else pil_images,
        tokenizer,
        text,
        chat_history=history["context"],
        return_history=True,
        max_num_frames=video_nframes,
        sample_fps=None,
        max_sample_fps=None,
        generation_config=gen_kwargs
    )

    text_for_history = text

    if is_first_image_turn:
        media_str = ""
        b64 = compress_video_to_base64(history["video_path"] if "video_path" in history else pil_images)
        media_str += (
            f'<video controls style="max-width:300px;height:auto;" '
            f'src="data:video/mp4;base64,{b64}"></video>'
        )
        text_for_history = media_str + text_for_history
        chatbot.append([text_for_history, response])
    else:
        chatbot.append([text_for_history, response])
    history["context"] = temp_history

    logger.info("flushed result to gradio")

    print(
        f"temperature: {temperature}, "
        f"top_p: {top_p}, "
        f"max_generate_length: {max_generate_length}"
    )

    yield chatbot, history, "Generate: Success"


def retry(
    text,  # this `text` is the current text box content, not the last user input
    images,
    chatbot,
    full_history,  # this is the full history
    top_p,
    temperature,
    max_generate_length,
    max_context_length_tokens,
    video_nframes,
    chunk_size: int = 512,
):
    """
    Retry the response for the input text and images.
    """
    history = full_history["context"]
    if len(history) == 0:
        yield (chatbot, history, "Empty context")
        return

    # Get the last user input before popping
    # print("history:", history)
    last_user_input = history[-2]["content"]

    # Remove the last turn from chatbot and history
    chatbot.pop()
    history.pop()

    full_history["context"] = history
    # Now call predict with the last user input and the modified history
    yield from predict(
        last_user_input,  # pass the last user input as the current text
        images,  # images should be the same as the last turn
        chatbot,  # updated chatbot
        full_history,  # updated history
        top_p,
        temperature,
        max_generate_length,
        max_context_length_tokens,
        video_nframes,
        chunk_size,
    )


def build_demo(args: argparse.Namespace) -> gr.Blocks:
    with gr.Blocks(theme=gr.themes.Soft(), delete_cache=(1800, 1800)) as demo:
        history = gr.State(dict())
        input_text = gr.State()
        input_images = gr.State()

        with gr.Row():
            gr.HTML(TITLE)
            status_display = gr.Markdown("Success", elem_id="status_display")
        gr.Markdown(DESCRIPTION_TOP)

        with gr.Row(equal_height=True):
            with gr.Column(scale=4):
                with gr.Row():
                    chatbot = gr.Chatbot(
                        elem_id="Video-XL-2_Demo-chatbot",
                        show_share_button=True,
                        bubble_full_width=False,
                        height=600,
                    )
                with gr.Row():
                    with gr.Column(scale=4):
                        text_box = gr.Textbox(show_label=False, placeholder="Enter text", container=False)
                    with gr.Column(min_width=70):
                        submit_btn = gr.Button("Send")
                    with gr.Column(min_width=70):
                        cancel_btn = gr.Button("Stop")
                with gr.Row():
                    empty_btn = gr.Button("🧹 New Conversation")
                    retry_btn = gr.Button("🔄 Regenerate")
                    del_last_btn = gr.Button("🗑️ Remove Last Turn")

            with gr.Column():
                # note: no more than 2 images at once
                gr.Markdown("Note: you can upload images or videos!")
                upload_images = gr.Files(file_types=["image", "video"], show_label=True)
                gallery = gr.Gallery(columns=[3], height="200px", show_label=True)
                upload_images.change(preview_images, inputs=upload_images, outputs=gallery)

                # Parameter Setting tab for controlling the generation parameters
                with gr.Tab(label="Parameter Setting"):
                    top_p = gr.Slider(minimum=0, maximum=1.0, value=0.001, step=0.05, interactive=True, label="Top-p")
                    temperature = gr.Slider(
                        minimum=0, maximum=1.0, value=0.01, step=0.1, interactive=True, label="Temperature"
                    )
                    max_generate_length = gr.Slider(
                        minimum=512, maximum=8192, value=4096, step=64, interactive=True, label="Max Generate Length"
                    )
                    max_context_length_tokens = gr.Slider(
                        minimum=512, maximum=65536, value=16384, step=64, interactive=True, label="Max Context Length Tokens"
                    )
                    video_nframes = gr.Slider(
                        minimum=1, maximum=128, value=128, step=1, interactive=True, label="Video Nframes"
                    )
                show_images = gr.HTML(visible=False)
        gr.Markdown("This demo is based on `moonshotai/Kimi-VL-A3B-Thinking` & `deepseek-ai/deepseek-vl2-small` and extends it by adding support for video input.")

        gr.Examples(
            examples=get_examples(ROOT_DIR),
            inputs=[upload_images, show_images, text_box],
        )
        gr.Markdown()

        input_widgets = [
            input_text,
            input_images,
            chatbot,
            history,
            top_p,
            temperature,
            max_generate_length,
            max_context_length_tokens,
            video_nframes
        ]
        output_widgets = [chatbot, history, status_display]

        transfer_input_args = dict(
            fn=transfer_input,
            inputs=[text_box, upload_images],
            outputs=[input_text, input_images, text_box, upload_images, submit_btn],
            show_progress=True,
        )

        predict_args = dict(fn=predict, inputs=input_widgets, outputs=output_widgets, show_progress=True)
        retry_args = dict(fn=retry, inputs=input_widgets, outputs=output_widgets, show_progress=True)
        reset_args = dict(fn=reset_textbox, inputs=[], outputs=[text_box, status_display])

        predict_events = [
            text_box.submit(**transfer_input_args).then(**predict_args),
            submit_btn.click(**transfer_input_args).then(**predict_args),
        ]

        empty_btn.click(reset_state, outputs=output_widgets, show_progress=True)
        empty_btn.click(**reset_args)
        retry_btn.click(**retry_args)
        del_last_btn.click(delete_last_conversation, [chatbot, history], output_widgets, show_progress=True)
        cancel_btn.click(cancel_outputing, [], [status_display], cancels=predict_events)

    demo.title = "Video-XL-2_Demo Chatbot"
    return demo


def main(args: argparse.Namespace):
    demo = build_demo(args)
    reload_javascript()

    # concurrency_count=CONCURRENT_COUNT, max_size=MAX_EVENTS
    favicon_path = os.path.join("serve/assets/favicon.ico")
    demo.queue().launch(
        favicon_path=favicon_path if os.path.exists(favicon_path) else None,
        server_name=args.ip,
        server_port=args.port,
    )


if __name__ == "__main__":
    args = parse_args()
    main(args)
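For reference, below is a minimal standalone sketch of the `model.chat` call pattern that `predict` above relies on, outside of Gradio. The checkpoint path, video path, and prompt are placeholders; the keyword arguments simply mirror the ones app.py passes, and `chat` itself comes from the model's `trust_remote_code` implementation, so its exact signature is assumed from that usage rather than documented here.

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

MODEL_PATH = "/path/to/Video-XL-2"  # placeholder checkpoint path
device = "cuda:0" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    trust_remote_code=True,
    device_map=device,
    attn_implementation="sdpa",
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
)

# Same generation settings predict() builds from the UI slider defaults.
gen_kwargs = {
    "do_sample": False,
    "temperature": 0.01,
    "top_p": 0.001,
    "num_beams": 1,
    "use_cache": True,
    "max_new_tokens": 4096,
}

# First turn: pass a video path and no prior history, as predict() does.
response, history = model.chat(
    "videos/demo1.mp4",          # placeholder video; the repo ships two demo clips
    tokenizer,
    "Describe this video.",
    chat_history=None,
    return_history=True,
    max_num_frames=128,
    sample_fps=None,
    max_sample_fps=None,
    generation_config=gen_kwargs,
)
print(response)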
requirements.txt
ADDED
@@ -0,0 +1,83 @@
accelerate==0.30.0
aiofiles==24.1.0
annotated-types==0.7.0
anyio==4.9.0
certifi==2025.6.15
charset-normalizer==3.4.2
click==8.2.1
colorama==0.4.6
decorator==4.4.2
decord==0.6.0
einops==0.8.1
exceptiongroup==1.3.0
fastapi==0.115.13
ffmpy==0.6.0
filelock==3.18.0
fsspec==2025.5.1
gradio==5.25.2
gradio_client==1.8.0
groovy==0.1.2
h11==0.16.0
hf-xet==1.1.5
httpcore==1.0.9
httpx==0.28.1
huggingface-hub==0.33.0
idna==3.10
imageio==2.37.0
imageio-ffmpeg==0.6.0
Jinja2==3.1.4
latex2mathml==3.78.0
Markdown==3.8.2
markdown-it-py==3.0.0
MarkupSafe==2.1.5
mdtex2html==1.3.1
mdurl==0.1.2
moviepy==2.2.1
mpmath==1.3.0
networkx==3.3
numpy==1.26.4
opencv-python==4.11.0.86
orjson==3.10.18
packaging==25.0
pandas==2.3.0
pillow==11.0.0
proglog==0.1.12
psutil==7.0.0
pydantic==2.11.7
pydantic_core==2.33.2
pydub==0.25.1
Pygments==2.19.2
pypinyin==0.54.0
python-dateutil==2.9.0.post0
python-dotenv==1.1.0
python-multipart==0.0.20
pytz==2025.2
PyYAML==6.0.2
regex==2024.11.6
requests==2.32.4
rich==14.0.0
ruff==0.12.0
safehttpx==0.1.6
safetensors==0.5.3
semantic-version==2.10.0
sentencepiece==0.2.0
shellingham==1.5.4
six==1.17.0
sniffio==1.3.1
starlette==0.46.2
sympy==1.13.3
tokenizers==0.19.1
tomlkit==0.13.3
torch==2.1.2+cu121
torchaudio==2.1.2+cu121
torchvision==0.16.2+cu121
tqdm==4.67.1
transformers==4.43.0
triton==2.1.0
typer==0.16.0
typing-inspection==0.4.1
typing_extensions==4.14.0
tzdata==2025.2
urllib3==2.5.0
uvicorn==0.34.3
websockets==15.0.1
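The three `+cu121` pins are CUDA 12.1 wheels, which are normally resolved from PyTorch's own wheel index rather than PyPI; how this Space installs them is not stated in the commit, so treat that as an assumption. A small sanity-check sketch for the pinned stack after installation:

import gradio
import torch
import transformers

# Expected pins from requirements.txt above.
assert torch.__version__.startswith("2.1.2"), torch.__version__
assert transformers.__version__ == "4.43.0", transformers.__version__
assert gradio.__version__ == "5.25.2", gradio.__version__
print("CUDA available:", torch.cuda.is_available())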
serve/__init__.py
ADDED
File without changes

serve/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (147 Bytes).

serve/__pycache__/chat_utils.cpython-310.pyc
ADDED
Binary file (11.7 kB).

serve/__pycache__/examples.cpython-310.pyc
ADDED
Binary file (2.16 kB).

serve/__pycache__/frontend.cpython-310.pyc
ADDED
Binary file (4.02 kB).

serve/__pycache__/gradio_utils.cpython-310.pyc
ADDED
Binary file (2.56 kB).

serve/__pycache__/utils.cpython-310.pyc
ADDED
Binary file (8.54 kB).
serve/assets/Kelpy-Codos.js
ADDED
@@ -0,0 +1,100 @@
/**
 * Copyright (c) 2023-2024 DeepSeek.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy of
 * this software and associated documentation files (the "Software"), to deal in
 * the Software without restriction, including without limitation the rights to
 * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
 * the Software, and to permit persons to whom the Software is furnished to do so,
 * subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
 * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
 * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
 * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

// ==UserScript==
// @name         Kelpy Codos
// @namespace    https://github.com/Keldos-Li/Kelpy-Codos
// @version      1.0.5
// @author       Keldos; https://keldos.me/
// @description  Add copy button to PRE tags before CODE tag, for Chuanhu ChatGPT especially.
//               Based on Chuanhu ChatGPT version: ac04408 (2023-3-22)
// @license      GPL-3.0
// @grant        none
// ==/UserScript==

(function () {
  "use strict";

  function addCopyButton(pre) {
    var code = pre.querySelector("code");
    if (!code) {
      return; // if no <code> element is found, do not add the button
    }
    var firstChild = code.firstChild;
    if (!firstChild) {
      return; // if the <code> element has no child nodes, do not add the button
    }
    var button = document.createElement("button");
    button.textContent = "\uD83D\uDCCE"; // use the 📎 symbol as the "copy" button text
    button.style.position = "relative";
    button.style.float = "right";
    button.style.fontSize = "1em"; // optional: adjust the button size
    button.style.background = "none"; // optional: remove the background color
    button.style.border = "none"; // optional: remove the border
    button.style.cursor = "pointer"; // optional: show a pointer cursor
    button.addEventListener("click", function () {
      var range = document.createRange();
      range.selectNodeContents(code);
      range.setStartBefore(firstChild); // set the range to start before the first child node
      var selection = window.getSelection();
      selection.removeAllRanges();
      selection.addRange(range);

      try {
        var success = document.execCommand("copy");
        if (success) {
          button.textContent = "\u2714";
          setTimeout(function () {
            button.textContent = "\uD83D\uDCCE"; // restore the button to "copy"
          }, 2000);
        } else {
          button.textContent = "\u2716";
        }
      } catch (e) {
        console.error(e);
        button.textContent = "\u2716";
      }

      selection.removeAllRanges();
    });
    code.insertBefore(button, firstChild); // insert the button before the first child element
  }

  function handleNewElements(mutationsList, observer) {
    for (var mutation of mutationsList) {
      if (mutation.type === "childList") {
        for (var node of mutation.addedNodes) {
          if (node.nodeName === "PRE") {
            addCopyButton(node);
          }
        }
      }
    }
  }

  var observer = new MutationObserver(handleNewElements);
  observer.observe(document.documentElement, {
    childList: true,
    subtree: true,
  });

  document.querySelectorAll("pre").forEach(addCopyButton);
})();
serve/assets/avatar.png
ADDED
serve/assets/custom.css
ADDED
@@ -0,0 +1,355 @@
/**
 * Copyright (c) 2023-2024 DeepSeek.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy of
 * this software and associated documentation files (the "Software"), to deal in
 * the Software without restriction, including without limitation the rights to
 * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
 * the Software, and to permit persons to whom the Software is furnished to do so,
 * subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
 * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
 * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
 * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

:root {
  --chatbot-color-light: #f3f3f3;
  --chatbot-color-dark: #121111;
}

/* status_display */
#status_display {
  display: flex;
  min-height: 2.5em;
  align-items: flex-end;
  justify-content: flex-end;
}
#status_display p {
  font-size: 0.85em;
  font-family: monospace;
  color: var(--body-text-color-subdued);
}

/* usage_display */
#usage_display {
  height: 1em;
}
#usage_display p {
  padding: 0 1em;
  font-size: 0.85em;
  font-family: monospace;
  color: var(--body-text-color-subdued);
}
/* list */
ol:not(.options),
ul:not(.options) {
  padding-inline-start: 2em !important;
}

/* Thank @Keldos-Li for fixing it */
/* Light mode (default) */
#deepseek_chatbot {
  background-color: var(--chatbot-color-light) !important;
  color: #000000 !important;
}
[data-testid="bot"] {
  background-color: #ffffff !important;
}
[data-testid="user"] {
  background-color: #95ec69 !important;
}

/* Dark mode */
.dark #deepseek_chatbot {
  background-color: var(--chatbot-color-dark) !important;
  color: #ffffff !important;
}
.dark [data-testid="bot"] {
  background-color: #2c2c2c !important;
}
.dark [data-testid="user"] {
  background-color: #26b561 !important;
}

#deepseek_chatbot {
  height: 100%;
  min-height: 800px;
  flex-grow: 1;
  overflow: auto;
}

[class*="message"] {
  border-radius: var(--radius-xl) !important;
  border: none;
  padding: var(--spacing-xl) !important;
  font-size: var(--text-md) !important;
  line-height: var(--line-md) !important;
  min-height: calc(var(--text-md) * var(--line-md) + 2 * var(--spacing-xl));
  min-width: calc(var(--text-md) * var(--line-md) + 2 * var(--spacing-xl));
}
[data-testid="bot"] {
  max-width: 85%;
  border-bottom-left-radius: 0 !important;
}
[data-testid="user"] {
  max-width: 85%;
  width: auto !important;
  border-bottom-right-radius: 0 !important;
}
/* Table */
table {
  margin: 1em 0;
  border-collapse: collapse;
  empty-cells: show;
}
td,
th {
  border: 1.2px solid var(--border-color-primary) !important;
  padding: 0.2em;
}
thead {
  background-color: rgba(175, 184, 193, 0.2);
}
thead th {
  padding: 0.5em 0.2em;
}
/* Inline code */
#deepseek_chatbot code {
  display: inline;
  white-space: break-spaces;
  border-radius: 6px;
  margin: 0 2px 0 2px;
  padding: 0.2em 0.4em 0.1em 0.4em;
  background-color: rgba(175, 184, 193, 0.2);
}
/* Code block */
#deepseek_chatbot pre code {
  display: block;
  overflow: auto;
  white-space: pre;
  background-color: #1c1d1e !important;
  border-radius: 10px;
  padding: 1.4em 1.2em 0em 1.4em;
  margin: 1.2em 2em 1.2em 0.5em;
  color: #fdf8f8;
  box-shadow: 6px 6px 16px hsla(0, 0%, 0%, 0.2);
}
/* Highlight */
#deepseek_chatbot .highlight {
  background-color: transparent;
}
#deepseek_chatbot .highlight .hll {
  background-color: #49483e;
}
#deepseek_chatbot .highlight .c {
  color: #75715e;
} /* Comment */
#deepseek_chatbot .highlight .err {
  color: #960050;
  background-color: #1e0010;
} /* Error */
#deepseek_chatbot .highlight .k {
  color: #66d9ef;
} /* Keyword */
#deepseek_chatbot .highlight .l {
  color: #ae81ff;
} /* Literal */
#deepseek_chatbot .highlight .n {
  color: #f8f8f2;
} /* Name */
#deepseek_chatbot .highlight .o {
  color: #f92672;
} /* Operator */
#deepseek_chatbot .highlight .p {
  color: #f8f8f2;
} /* Punctuation */
#deepseek_chatbot .highlight .ch {
  color: #75715e;
} /* Comment.Hashbang */
#deepseek_chatbot .highlight .cm {
  color: #75715e;
} /* Comment.Multiline */
#deepseek_chatbot .highlight .cp {
  color: #75715e;
} /* Comment.Preproc */
#deepseek_chatbot .highlight .cpf {
  color: #75715e;
} /* Comment.PreprocFile */
#deepseek_chatbot .highlight .c1 {
  color: #75715e;
} /* Comment.Single */
#deepseek_chatbot .highlight .cs {
  color: #75715e;
} /* Comment.Special */
#deepseek_chatbot .highlight .gd {
  color: #f92672;
} /* Generic.Deleted */
#deepseek_chatbot .highlight .ge {
  font-style: italic;
} /* Generic.Emph */
#deepseek_chatbot .highlight .gi {
  color: #a6e22e;
} /* Generic.Inserted */
#deepseek_chatbot .highlight .gs {
  font-weight: bold;
} /* Generic.Strong */
#deepseek_chatbot .highlight .gu {
  color: #75715e;
} /* Generic.Subheading */
#deepseek_chatbot .highlight .kc {
  color: #66d9ef;
} /* Keyword.Constant */
#deepseek_chatbot .highlight .kd {
  color: #66d9ef;
} /* Keyword.Declaration */
#deepseek_chatbot .highlight .kn {
  color: #f92672;
} /* Keyword.Namespace */
#deepseek_chatbot .highlight .kp {
  color: #66d9ef;
} /* Keyword.Pseudo */
#deepseek_chatbot .highlight .kr {
  color: #66d9ef;
} /* Keyword.Reserved */
#deepseek_chatbot .highlight .kt {
  color: #66d9ef;
} /* Keyword.Type */
#deepseek_chatbot .highlight .ld {
  color: #e6db74;
} /* Literal.Date */
#deepseek_chatbot .highlight .m {
  color: #ae81ff;
} /* Literal.Number */
#deepseek_chatbot .highlight .s {
  color: #e6db74;
} /* Literal.String */
#deepseek_chatbot .highlight .na {
  color: #a6e22e;
} /* Name.Attribute */
#deepseek_chatbot .highlight .nb {
  color: #f8f8f2;
} /* Name.Builtin */
#deepseek_chatbot .highlight .nc {
  color: #a6e22e;
} /* Name.Class */
#deepseek_chatbot .highlight .no {
  color: #66d9ef;
} /* Name.Constant */
#deepseek_chatbot .highlight .nd {
  color: #a6e22e;
} /* Name.Decorator */
#deepseek_chatbot .highlight .ni {
  color: #f8f8f2;
} /* Name.Entity */
#deepseek_chatbot .highlight .ne {
  color: #a6e22e;
} /* Name.Exception */
#deepseek_chatbot .highlight .nf {
  color: #a6e22e;
} /* Name.Function */
#deepseek_chatbot .highlight .nl {
  color: #f8f8f2;
} /* Name.Label */
#deepseek_chatbot .highlight .nn {
  color: #f8f8f2;
} /* Name.Namespace */
#deepseek_chatbot .highlight .nx {
  color: #a6e22e;
} /* Name.Other */
#deepseek_chatbot .highlight .py {
  color: #f8f8f2;
} /* Name.Property */
#deepseek_chatbot .highlight .nt {
  color: #f92672;
} /* Name.Tag */
#deepseek_chatbot .highlight .nv {
  color: #f8f8f2;
} /* Name.Variable */
#deepseek_chatbot .highlight .ow {
  color: #f92672;
} /* Operator.Word */
#deepseek_chatbot .highlight .w {
  color: #f8f8f2;
} /* Text.Whitespace */
#deepseek_chatbot .highlight .mb {
  color: #ae81ff;
} /* Literal.Number.Bin */
#deepseek_chatbot .highlight .mf {
  color: #ae81ff;
} /* Literal.Number.Float */
#deepseek_chatbot .highlight .mh {
  color: #ae81ff;
} /* Literal.Number.Hex */
#deepseek_chatbot .highlight .mi {
  color: #ae81ff;
} /* Literal.Number.Integer */
#deepseek_chatbot .highlight .mo {
  color: #ae81ff;
} /* Literal.Number.Oct */
#deepseek_chatbot .highlight .sa {
  color: #e6db74;
} /* Literal.String.Affix */
#deepseek_chatbot .highlight .sb {
  color: #e6db74;
} /* Literal.String.Backtick */
#deepseek_chatbot .highlight .sc {
  color: #e6db74;
} /* Literal.String.Char */
#deepseek_chatbot .highlight .dl {
  color: #e6db74;
} /* Literal.String.Delimiter */
#deepseek_chatbot .highlight .sd {
  color: #e6db74;
} /* Literal.String.Doc */
#deepseek_chatbot .highlight .s2 {
  color: #e6db74;
} /* Literal.String.Double */
#deepseek_chatbot .highlight .se {
  color: #ae81ff;
} /* Literal.String.Escape */
#deepseek_chatbot .highlight .sh {
  color: #e6db74;
} /* Literal.String.Heredoc */
#deepseek_chatbot .highlight .si {
  color: #e6db74;
} /* Literal.String.Interpol */
#deepseek_chatbot .highlight .sx {
  color: #e6db74;
} /* Literal.String.Other */
#deepseek_chatbot .highlight .sr {
  color: #e6db74;
} /* Literal.String.Regex */
#deepseek_chatbot .highlight .s1 {
  color: #e6db74;
} /* Literal.String.Single */
#deepseek_chatbot .highlight .ss {
  color: #e6db74;
} /* Literal.String.Symbol */
#deepseek_chatbot .highlight .bp {
  color: #f8f8f2;
} /* Name.Builtin.Pseudo */
#deepseek_chatbot .highlight .fm {
  color: #a6e22e;
} /* Name.Function.Magic */
#deepseek_chatbot .highlight .vc {
  color: #f8f8f2;
} /* Name.Variable.Class */
#deepseek_chatbot .highlight .vg {
  color: #f8f8f2;
} /* Name.Variable.Global */
#deepseek_chatbot .highlight .vi {
  color: #f8f8f2;
} /* Name.Variable.Instance */
#deepseek_chatbot .highlight .vm {
  color: #f8f8f2;
} /* Name.Variable.Magic */
#deepseek_chatbot .highlight .il {
  color: #ae81ff;
} /* Literal.Number.Integer.Long */
serve/assets/custom.js
ADDED
@@ -0,0 +1,22 @@
/**
 * Copyright (c) 2023-2024 DeepSeek.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy of
 * this software and associated documentation files (the "Software"), to deal in
 * the Software without restriction, including without limitation the rights to
 * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
 * the Software, and to permit persons to whom the Software is furnished to do so,
 * subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
 * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
 * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
 * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

// custom javascript here
serve/assets/favicon.ico
ADDED
serve/chat_utils.py
ADDED
@@ -0,0 +1,497 @@
"""
From https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
"""

import dataclasses
import logging
import copy
from enum import IntEnum, auto
from typing import Dict, List
import base64

import gradio as gr
import torch
import os
from .utils import pil_to_base64
import mimetypes
IMAGE_TOKEN = "<image>"
logger = logging.getLogger("gradio_logger")

import cv2
import base64
import tempfile
import os
import imageio


def compress_video_to_base64(
    video_path: str,
    max_frames: int = 600,
    resolution: tuple = (960, 540),
    target_crf: int = 28
) -> str:
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        raise RuntimeError(f"Cannot open video: {video_path}")

    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) or None
    original_fps = cap.get(cv2.CAP_PROP_FPS) or None

    if not total_frames or not original_fps:
        cap.release()
        raise RuntimeError("Cannot get the video frame count or frame rate; please check the video file or use ffprobe.")

    step = max(1, total_frames // max_frames)
    new_fps = max(1, round(original_fps / step))

    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
        tmp_path = tmp.name

    writer = imageio.get_writer(
        tmp_path,
        fps=new_fps,
        codec='libx264',
        ffmpeg_params=[
            '-crf', str(target_crf),
            '-pix_fmt', 'yuv420p'
        ]
    )

    frame_idx = 0
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        if frame_idx % step == 0:
            small = cv2.resize(frame, resolution)
            writer.append_data(cv2.cvtColor(small, cv2.COLOR_BGR2RGB))
        frame_idx += 1

    cap.release()
    writer.close()

    with open(tmp_path, "rb") as f:
        data = f.read()
    os.remove(tmp_path)
    return base64.b64encode(data).decode("utf-8")


class SeparatorStyle(IntEnum):
    """Separator styles."""

    PLAIN = auto()
    ALIGNMENT = auto()
    KIMI_VL = auto()


@dataclasses.dataclass
class Conversation:
    """A class that manages prompt templates and keeps all conversation history."""

    # The name of this template
    name: str
    # The template of the system prompt
    system_template: str = "{system_message}"
    # The system message
    system_message: str = ""
    # The names of two roles
    roles: List[str] = (("USER", "ASSISTANT"),)
    # All messages. Each item is (role, message).
    messages: List[List[str]] = ()
    # The number of few shot examples
    offset: int = 0
    # The separator style and configurations
    sep_style: SeparatorStyle = SeparatorStyle.PLAIN
    sep: str = "\n"
    sep2: str = None
    # Stop criteria (the default one is EOS token)
    stop_str: str = None
    # Stops generation if meeting any token in this list
    stop_token_ids: List[int] = None

    def get_prompt(self) -> str:
        """Get the prompt for generation."""
        system_prompt = self.system_template.format(system_message=self.system_message)
        if self.sep_style == SeparatorStyle.PLAIN:
            seps = [self.sep, self.sep2]
            ret = ""
            for i, (role, message) in enumerate(self.messages):
                if message:
                    if type(message) is tuple:
                        message = message[0]
                    if i % 2 == 0:
                        ret += message + seps[i % 2]
                    else:
                        ret += message + seps[i % 2]
                else:
                    ret += ""
            return ret
        elif self.sep_style == SeparatorStyle.ALIGNMENT:
            seps = [self.sep, self.sep2]
            ret = ""
            for i, (role, message) in enumerate(self.messages):
                if message:
                    if type(message) is tuple:
                        message, _, _ = message
                    if i % 2 == 0:
                        ret += '<image>\n' + seps[i % 2]
                    else:
                        ret += message + seps[i % 2]
                else:
                    ret += ""
            return ret
        elif self.sep_style == SeparatorStyle.KIMI_VL:
            seps = [self.sep, self.sep2]
            if system_prompt == "" or system_prompt is None:
                ret = ""
            else:
                ret = system_prompt + seps[0]
            for i, (role, message) in enumerate(self.messages):
                if message:
                    if type(message) is tuple:
                        message = message[0]

                    if role == "user":
                        ret += message + self.sep
                    else:
                        if self.sep2 is not None:
                            ret += message + self.sep2
                        else:
                            ret += message
                else:
                    ret = ret
            return ret
        else:
            raise ValueError(f"Invalid style: {self.sep_style}")

    def set_system_message(self, system_message: str):
        """Set the system message."""
        self.system_message = system_message

    def append_message(self, role: str, message: str):
        """Append a new message."""
        self.messages.append([role, message])

    def update_last_message(self, message: str):
        """Update the last output.

        The last message is typically set to be None when constructing the prompt,
        so we need to update it in-place after getting the response from a model.
        """
        self.messages[-1][1] = message

    def reset_message(self):
        """Reset a new message."""
        self.messages = []

    def to_gradio_chatbot(self):
        """Convert the conversation to gradio chatbot format."""
        ret = []
        for i, (role, msg) in enumerate(self.messages[self.offset :]):
            if i % 2 == 0:
                ret.append([msg, None])
            else:
                ret[-1][-1] = msg
        return ret

    def to_openai_api_messages(self):
        """Convert the conversation to OpenAI chat completion format."""
        system_prompt = self.system_template.format(system_message=self.system_message)
        ret = [{"role": "system", "content": system_prompt}]

        for i, (_, msg) in enumerate(self.messages[self.offset :]):
            if i % 2 == 0:
                ret.append({"role": "user", "content": msg})
            else:
                if msg is not None:
                    ret.append({"role": "assistant", "content": msg})
        return ret

    def copy(self):
        return Conversation(
            name=self.name,
            system_template=self.system_template,
            system_message=self.system_message,
            roles=self.roles,
            messages=[[x, y] for x, y in self.messages],
            offset=self.offset,
            sep_style=self.sep_style,
            sep=self.sep,
            sep2=self.sep2,
            stop_str=self.stop_str,
            stop_token_ids=self.stop_token_ids,
        )

    def dict(self):
        return {
            "template_name": self.name,
            "system_message": self.system_message,
            "roles": self.roles,
            "messages": self.messages,
            "offset": self.offset,
        }


# A global registry for all conversation templates
conv_templates: Dict[str, Conversation] = {}


def register_conv_template(template: Conversation, override: bool = False):
    """Register a new conversation template."""
    if not override:
        assert template.name not in conv_templates, f"{template.name} has been registered."

    conv_templates[template.name] = template


def get_conv_template(name: str) -> Conversation:
    """Get a conversation template."""
    return conv_templates[name].copy()


register_conv_template(
    Conversation(
        name="plain",
        system_template="",
        system_message="",
        roles=("", ""),
        messages=(),
        offset=0,
        sep_style=SeparatorStyle.PLAIN,
        sep="",
        sep2="",
        stop_token_ids=[100001],
        stop_str=['</s>'],
    )
)


register_conv_template(
    Conversation(
        name="alignment",
        system_template="",
        system_message="",
        roles=("", ""),
        messages=(),
        offset=0,
        sep_style=SeparatorStyle.ALIGNMENT,
        sep="",
        sep2="",
        stop_token_ids=[100001],
        stop_str=['</s>'],
    )
)

register_conv_template(
    Conversation(
        name="kimi-vl",
        system_template="{system_message}",
        system_message="You are a helpful assistant",
        roles=("user", "assistant"),
        messages=(),
        offset=0,
        sep_style=SeparatorStyle.KIMI_VL,
        sep="<|im_end|>",
        sep2=None,
        stop_token_ids=None,
        stop_str=["<|im_end|>"],
    )
)


def new_chat_template(sft_format: str = "kimi-vl"):
    return get_conv_template(sft_format)


def get_prompt(conv: Conversation) -> str:
    """Get the prompt for generation."""
    return conv.get_prompt()


def generate_prompt_with_history(text, images, history, processor, max_length=2048):
    """
    Generate a prompt with the chat history.

    Args:
        text (str): The text prompt.
        images (list[PIL.Image.Image]): The image prompt.
        history (list): List of previous conversation messages.
        processor (KimiVLProcessor): The chat processor used for encoding the prompt.
        max_length (int): The maximum length of the prompt.
    """
    global IMAGE_TOKEN

    user_role_ind = 0
    bot_role_ind = 1

    # Initialize conversation
    conversation = new_chat_template(sft_format="kimi-vl")

    if history:
        conversation.messages = history

    if images is not None and len(images) > 0:
        # num_image_tags = text.count(IMAGE_TOKEN)
        # num_images = len(images)
        # if num_images > num_image_tags:
        #     pad_image_tags = num_images - num_image_tags
        #     image_tokens = "\n".join([IMAGE_TOKEN] * pad_image_tags)

        #     # append the <image> in a new line after the text prompt
        #     text = image_tokens + "\n" + text
        # elif num_images < num_image_tags:
        #     remove_image_tags = num_image_tags - num_images
        #     text = text.replace(IMAGE_TOKEN, "", remove_image_tags)

        print(f"prompt = {text}, len(images) = {len(images)}")
        text = (text, images)

    conversation.append_message(conversation.roles[user_role_ind], text)
    conversation.append_message(conversation.roles[bot_role_ind], "")

    # Create a copy of the conversation to avoid history truncation in the UI
    conversation_copy = conversation.copy()
    logger.info("=" * 80)
    logger.info(get_prompt(conversation))

    rounds = len(conversation.messages) // 2

    for _ in range(rounds):
        current_prompt = get_prompt(conversation)
        assert isinstance(current_prompt, str) and len(current_prompt) > 0, f"current_prompt = {current_prompt}"
        if torch.tensor(processor.tokenizer.encode(current_prompt)).size(-1) <= max_length:
            return conversation_copy

        if len(conversation.messages) % 2 != 0:
            gr.Error("The messages between user and assistant are not paired.")
            return

        try:
            for _ in range(2):  # pop out two messages in a row
                conversation.messages.pop(0)
        except IndexError:
            gr.Error("Input text processing failed, unable to respond in this round.")
            return None

    gr.Error("Prompt could not be generated within max_length limit.")
    return None


def convert_conversation_to_prompts(conversation: Conversation):
    """
    Convert the conversation to prompts.
    """
    conv_prompts = []

    last_image = None

    messages = conversation.messages
    for i in range(0, len(messages), 2):
        if isinstance(messages[i][1], tuple):
            text, images = messages[i][1]
            last_image = images[-1]
        else:
            text, images = messages[i][1], []

        prompt = {"role": messages[i][0], "content": text, "images": images}
        response = {"role": messages[i + 1][0], "content": messages[i + 1][1]}
        conv_prompts.extend([prompt, response])

    return conv_prompts, last_image


def to_gradio_chatbot2(conversation: Conversation) -> list:
    """Convert the conversation to gradio chatbot format."""
    ret = []
    for i, (_, msg) in enumerate(conversation.messages[conversation.offset :]):
        if i % 2 == 0:
            if type(msg) is tuple:
                msg, images = copy.deepcopy(msg)

                if isinstance(images, list):
                    img_str = ""
                    for j, image in enumerate(images):
                        if isinstance(image, str):
                            with open(image, "rb") as f:
                                data = f.read()
                            img_b64_str = base64.b64encode(data).decode()
                            image_str = (
                                f'<img src="data:image/png;base64,{img_b64_str}" '
                                f'alt="user upload image" style="max-width: 300px; height: auto;" />'
                            )
                        else:
                            image_str = pil_to_base64(image, f"user upload image_{j}", max_size=800, min_size=400)

                        img_str += image_str
                    msg = img_str + msg
                else:
                    pass

            ret.append([msg, None])
        else:
            ret[-1][-1] = msg
    return ret
| 437 |
+
|
| 438 |
+
def to_gradio_chatbot(conversation: Conversation) -> list:
|
| 439 |
+
"""Convert the conversation to gradio chatbot format, supporting images and video."""
|
| 440 |
+
ret = []
|
| 441 |
+
for i, (_, msg) in enumerate(conversation.messages[conversation.offset :]):
|
| 442 |
+
# User message
|
| 443 |
+
if i % 2 == 0:
|
| 444 |
+
if isinstance(msg, tuple):
|
| 445 |
+
msg_text, media = copy.deepcopy(msg)
|
| 446 |
+
media_str = ""
|
| 447 |
+
|
| 448 |
+
# Handle list of media items
|
| 449 |
+
if isinstance(media, list):
|
| 450 |
+
items = media
|
| 451 |
+
else:
|
| 452 |
+
items = [media]
|
| 453 |
+
|
| 454 |
+
for j, item in enumerate(items):
|
| 455 |
+
# If string path, determine type
|
| 456 |
+
if isinstance(item, str) and (not item.endswith((".mp4", ".mov", ".avi", ".webm"))):
|
| 457 |
+
mime, _ = mimetypes.guess_type(item)
|
| 458 |
+
with open(item, "rb") as f:
|
| 459 |
+
data = f.read()
|
| 460 |
+
b64 = base64.b64encode(data).decode()
|
| 461 |
+
|
| 462 |
+
if mime and mime.startswith("image/"):
|
| 463 |
+
media_str += (
|
| 464 |
+
f'<img src="data:{mime};base64,{b64}" '
|
| 465 |
+
f'alt="user upload image_{j}" '
|
| 466 |
+
f'style="max-width:300px;height:auto;" />'
|
| 467 |
+
)
|
| 468 |
+
else:
|
| 469 |
+
# Fallback to link
|
| 470 |
+
media_str += f'<a href="{item}" target="_blank">{item}</a>'
|
| 471 |
+
elif isinstance(item, str) and (item.endswith((".mp4", ".mov", ".avi", ".webm"))):
|
| 472 |
+
try:
|
| 473 |
+
b64 = compress_video_to_base64(item)
|
| 474 |
+
media_str += (
|
| 475 |
+
f'<video controls style="max-width:300px;height:auto;" '
|
| 476 |
+
f'src="data:video/mp4;base64,{b64}"></video>'
|
| 477 |
+
)
|
| 478 |
+
except:
|
| 479 |
+
pass
|
| 480 |
+
|
| 481 |
+
# If PIL image
|
| 482 |
+
else:
|
| 483 |
+
media_str += pil_to_base64(item, f"user upload image_{j}", max_size=800, min_size=400)
|
| 484 |
+
|
| 485 |
+
msg = media_str + msg_text
|
| 486 |
+
|
| 487 |
+
# Append user side
|
| 488 |
+
ret.append([msg, None])
|
| 489 |
+
else:
|
| 490 |
+
# Assistant side, fill previous tuple
|
| 491 |
+
ret[-1][-1] = msg
|
| 492 |
+
return ret
|
| 493 |
+
|
| 494 |
+
|
| 495 |
+
def to_gradio_history(conversation: Conversation):
|
| 496 |
+
"""Convert the conversation to gradio history format."""
|
| 497 |
+
return conversation.messages[conversation.offset :]
|
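Taken together, these helpers turn UI state into model-ready prompts. A minimal sketch of the intended flow, assuming a loaded `processor` and a PIL `user_image` (both hypothetical names, not part of this file):

# Sketch only: how the chat_utils helpers compose in a request cycle.
conversation = generate_prompt_with_history(
    "Describe this image.",   # user text
    [user_image],             # list of PIL images
    history=[],               # previous (role, message) pairs
    processor=processor,
    max_length=2048,
)
conv_prompts, last_image = convert_conversation_to_prompts(conversation)
chatbot_pairs = to_gradio_chatbot(conversation)  # HTML-ready [user, assistant] pairs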
serve/examples.py
ADDED
@@ -0,0 +1,73 @@
import os
import io
import base64
import decord
from PIL import Image

EXAMPLES_LIST = [
    [
        ["videos/demo1.mp4"],
        "What's the third criteria promoted by Nick Mortimer that a continent must meet?"
    ],
    [
        ["videos/demo2.mp4"],
        "When does the deer appear in the video? Give me the specific time range in seconds."
    ]
]


def display_example(file_list, root_dir: str = None):
    media_html = ""
    for _, file_path in enumerate(file_list):
        if root_dir is not None:
            full_file_path = os.path.join(root_dir, file_path)
        else:
            full_file_path = file_path

        file_ext = os.path.splitext(full_file_path)[1].lower()

        # For video files, extract the first frame as a preview image
        if file_ext in [".mp4", ".mov", ".avi", ".webm"]:
            try:
                # Read the video with decord
                vr = decord.VideoReader(full_file_path)
                # Extract the first frame (index 0)
                frame = vr[0].asnumpy()

                # Convert the NumPy array (RGB) to a PIL Image
                image = Image.fromarray(frame)

                buffered = io.BytesIO()
                image.save(buffered, format="PNG")
                img_b64_str = base64.b64encode(buffered.getvalue()).decode("utf-8")

                # Display the extracted frame with an img tag
                media_str = f'<img src="data:image/png;base64,{img_b64_str}" alt="Video preview for {file_path}" style="height:80px; margin-right: 10px;" />'
            except Exception as e:
                media_str = f"<span>Error processing video {file_path}: {e}</span>"
        else:
            # Assume it's an image
            image = Image.open(full_file_path)
            buffered = io.BytesIO()
            image.save(buffered, format="PNG", quality=100)
            img_b64_str = base64.b64encode(buffered.getvalue()).decode()
            media_str = f'<img src="data:image/png;base64,{img_b64_str}" alt="{file_path}" style="height:80px; margin-right: 10px;" />'
        media_html += media_str

    result_html = f"""
    <div style="display: flex; align-items: center; margin-bottom: 10px;">
        <div style="flex: 1; margin-right: 10px;">{media_html}</div>
    </div>
    """

    return result_html


def get_examples(root_dir: str = None):
    examples = []
    for files, texts in EXAMPLES_LIST:
        examples.append([files, display_example(files, root_dir), texts])

    return examples
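Each entry returned by get_examples is a [file_list, preview_html, prompt] triple. A minimal sketch of rendering those triples in a Gradio page (the wiring below is illustrative, not how app.py necessarily consumes them):

# Sketch: rendering precomputed previews; assumes the videos/ directory exists.
import gradio as gr
from serve.examples import get_examples

examples = get_examples(root_dir=".")
with gr.Blocks() as demo:
    for files, preview_html, prompt in examples:
        gr.HTML(preview_html)                 # first-frame thumbnail(s)
        gr.Markdown(f"**Prompt:** {prompt}")

demo.launch()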
serve/frontend.py
ADDED
@@ -0,0 +1,126 @@
import logging
import os
from typing import List, Tuple

import gradio as gr

from serve.utils import convert_asis, convert_mdtext, detect_converted_mark

ROOT_PATH = os.path.dirname(os.path.abspath(__file__))


small_and_beautiful_theme = gr.themes.Soft(
    primary_hue=gr.themes.Color(
        name="nvidia-green",
        c50="#F3FAE6",
        c100="#E3F3C2",
        c200="#C9E98D",
        c300="#AFDD59",
        c400="#95D124",
        c500="#76B900",  # NVIDIA green
        c600="#6AA600",
        c700="#5C9300",
        c800="#4F8000",
        c900="#426D00",
        c950="#2E5500",
    ),
    secondary_hue=gr.themes.Color(
        c50="#d3e3d3",
        c100="#bfd6bf",
        c200="#a9c7a9",
        c300="#93b893",
        c400="#7da97d",
        c500="#689A68",
        c600="#538B53",
        c700="#3E7C3E",
        c800="#296D29",
        c900="#145E14",
        c950="#0A4A0A",
    ),
    neutral_hue=gr.themes.Color(
        name="gray",
        c50="#f6f7f8",
        c100="#F2F2F2",
        c200="#e5e7eb",
        c300="#d1d5db",
        c400="#B2B2B2",
        c500="#808080",
        c600="#636363",
        c700="#515151",
        c800="#393939",
        c900="#2B2B2B",
        c950="#171717",
    ),
    radius_size=gr.themes.sizes.radius_sm,
).set(
    button_primary_background_fill_dark="*primary_600",
    button_primary_border_color_dark="*primary_600",
    button_primary_text_color="white",
    button_primary_text_color_dark="white",
    button_secondary_background_fill="*neutral_100",
    button_secondary_background_fill_hover="*neutral_50",
    button_secondary_background_fill_dark="*neutral_900",
    button_secondary_text_color="*neutral_800",
    button_secondary_text_color_dark="white",
    block_title_background_fill_dark="*primary_900",
    block_label_background_fill_dark="*primary_900",
    input_background_fill="#F6F6F6",
)


def compact_text_chunks(self, prompt, text_chunks: List[str]) -> List[str]:
    logging.debug("Compacting text chunks...🚀🚀🚀")
    combined_str = [c.strip() for c in text_chunks if c.strip()]
    combined_str = [f"[{index+1}] {c}" for index, c in enumerate(combined_str)]
    combined_str = "\n\n".join(combined_str)
    # resplit based on self.max_chunk_overlap
    text_splitter = self.get_text_splitter_given_prompt(prompt, 1, padding=1)
    return text_splitter.split_text(combined_str)


def postprocess(y: List[Tuple[str | None, str | None]]) -> List[Tuple[str | None, str | None]]:
    """
    Parameters:
        y: List of tuples representing the message and response pairs. Each message and response should be a string, which may be in Markdown format.
    Returns:
        List of tuples representing the message and response. Each message and response will be a string of HTML.
    """
    if y is None or y == []:
        return []
    temp = []
    for x in y:
        user, bot = x
        if not detect_converted_mark(user):
            user = convert_asis(user)
        if not detect_converted_mark(bot):
            bot = convert_mdtext(bot)
        temp.append((user, bot))
    return temp


custom_js_path = os.path.join(ROOT_PATH, "assets/custom.js")
kelpy_codos_path = os.path.join(ROOT_PATH, "assets/Kelpy-Codos.js")

with (
    open(custom_js_path, "r", encoding="utf-8") as f,
    open(kelpy_codos_path, "r", encoding="utf-8") as f2,
):
    customJS = f.read()
    kelpyCodos = f2.read()


def reload_javascript():
    print("Reloading javascript...")
    js = f"<script>{customJS}</script><script>{kelpyCodos}</script>"

    def template_response(*args, **kwargs):
        res = GradioTemplateResponseOriginal(*args, **kwargs)
        res.body = res.body.replace(b"</html>", f"{js}</html>".encode("utf8"))
        res.init_headers()
        return res

    gr.routes.templates.TemplateResponse = template_response


GradioTemplateResponseOriginal = gr.routes.templates.TemplateResponse
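reload_javascript injects the custom scripts by monkey-patching Gradio's TemplateResponse so every served page gets the JS appended before </html>. A minimal sketch of the assumed call order (the Blocks body here is hypothetical; only the patch-before-serving order matters):

# Sketch: patch the template first, then build and launch the UI.
import gradio as gr
from serve.frontend import reload_javascript, small_and_beautiful_theme

reload_javascript()  # installs the JS-injecting TemplateResponse
with gr.Blocks(theme=small_and_beautiful_theme) as demo:
    gr.Markdown("Demo UI goes here")
demo.launch()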
serve/gradio_utils.py
ADDED
@@ -0,0 +1,95 @@
"""
Gradio utils for the Kimi-VL application.
"""

import functools
from typing import Callable
import traceback

import gradio as gr


IMAGE_TOKEN = "<image>"


def transfer_input(input_text, input_images):
    """
    Move the submitted text and images into the generation state and clear the input widgets.
    """
    return (input_text, input_images, gr.update(value=""), gr.update(value=None), gr.Button(visible=True))


def delete_last_conversation(chatbot, full_history):
    """
    Delete the last conversation from the chatbot and history.

    Args:
        chatbot (list): The chatbot list.
        full_history (dict): The full history state; the message list lives under "context".
    """
    history = full_history["context"]
    if len(history) % 2 != 0:
        # gr.Warning is displayed without aborting, so the state can still be returned
        gr.Warning("history length is not even")
        return (
            chatbot,
            full_history,
            "Delete Done",
        )

    if len(chatbot) > 0:
        chatbot.pop()

    if len(history) > 0 and len(history) % 2 == 0:
        history.pop()
        history.pop()

    full_history["context"] = history
    return (
        chatbot,
        full_history,
        "Delete Done",
    )


def reset_state():
    return [], {}, "Reset Done"


def reset_textbox():
    return gr.update(value=""), ""


def cancel_outputing():
    return "Stop Done"


class State:
    interrupted = False

    def interrupt(self):
        self.interrupted = True

    def recover(self):
        self.interrupted = False


shared_state = State()


def wrap_gen_fn(gen_fn: Callable):
    """
    Wrap the generator function to handle errors.
    """

    @functools.wraps(gen_fn)
    def wrapped_gen_fn(prompt, *args, **kwargs):
        try:
            yield from gen_fn(prompt, *args, **kwargs)
        except gr.Error as g_err:
            traceback.print_exc()
            raise g_err
        except Exception as e:
            traceback.print_exc()
            raise gr.Error(f"Failed to generate text: {e}") from e

    return wrapped_gen_fn
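wrap_gen_fn is meant to decorate the app's streaming handlers so any unexpected exception is re-raised as a gr.Error that Gradio can display. A minimal sketch with a hypothetical predict generator:

# Sketch: guarding a streaming handler with wrap_gen_fn.
from serve.gradio_utils import wrap_gen_fn

@wrap_gen_fn
def predict(prompt, *args, **kwargs):
    # ... run the model and stream partial outputs ...
    for chunk in ["partial ", "answer"]:
        yield chunk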
serve/inference.py
ADDED
@@ -0,0 +1,268 @@
import logging
import re
from threading import Thread
from typing import List, Optional
import os
import torch
from transformers import (
    AutoModel,
    AutoProcessor,
    AutoConfig,
    StoppingCriteria,
    StoppingCriteriaList,
    TextIteratorStreamer,
)
from PIL import Image
from .chat_utils import Conversation, get_conv_template

logger = logging.getLogger(__name__)


def load_model_from_nv(model_path: str = "nvidia/Eagle-2-8B"):
    token = os.environ.get("HF_TOKEN")
    # hotfix the model to use flash attention 2
    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True, token=token)
    config._attn_implementation = "flash_attention_2"
    config.vision_config._attn_implementation = "flash_attention_2"
    config.text_config._attn_implementation = "flash_attention_2"
    print("Successfully set the attn_implementation to flash_attention_2")

    if token:
        logger.info(f"token = {token[:4]}***{token[-2:]}")
    model = AutoModel.from_pretrained(
        model_path,
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
        attn_implementation="flash_attention_2",
        token=token,
    )
    model.to("cuda")
    processor = AutoProcessor.from_pretrained(model_path, config=config, trust_remote_code=True, use_fast=True, token=token)

    return model, processor


def load_model_from_eagle(model_path: str = "NVEagle/Eagle2-8B"):
    token = os.environ.get("HF_TOKEN")
    if token:
        logger.info(f"token = {token[:4]}***{token[-2:]}")

    # hotfix the model to use flash attention 2
    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True, token=token)
    config._attn_implementation = "flash_attention_2"
    config.vision_config._attn_implementation = "flash_attention_2"
    config.text_config._attn_implementation = "flash_attention_2"
    print("Successfully set the attn_implementation to flash_attention_2")

    model = AutoModel.from_pretrained(
        model_path,
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
        attn_implementation="flash_attention_2",
        token=token,
    )
    model.to("cuda")
    processor = AutoProcessor.from_pretrained(model_path, config=config, trust_remote_code=True, use_fast=True, token=token)

    return model, processor


def load_model(model_path: str = "nvidia/Eagle2-8B"):
    try:
        model, processor = load_model_from_nv(model_path)
    except Exception as e:
        logger.error(f"Failed to load model from HF, trying to load from eagle: {e}")
        model, processor = load_model_from_eagle()
    return model, processor


class StoppingCriteriaSub(StoppingCriteria):
    def __init__(self, stops=[], encounters=1):
        super().__init__()
        self.stops = [stop.to("cuda") for stop in stops]

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs):
        for stop in self.stops:
            if input_ids.shape[-1] < len(stop):
                continue
            if torch.all((stop == input_ids[0][-len(stop):])).item():
                return True

        return False


def preprocess(
    messages: list[dict],
    processor,
    video_nframes: int = 16,
):
    """
    Build messages from the conversations and images.
    """
    # system prompt prepended to every request
    results = [
        {
            "role": "system",
            "content": """You are Eagle 2, a cutting-edge large language model developed by NVIDIA. You are highly capable, efficient, and aligned, specialized in understanding complex multimodal inputs and providing expert-level responses across domains.
Always be concise, accurate, and helpful. You respond like a reliable co-pilot to researchers, developers, and engineers, offering deep technical insight, step-by-step reasoning, and practical suggestions.
You can interpret long contexts, follow nuanced instructions, and dynamically adjust your tone to match the user's intent. If the user does not specify a tone, default to a professional, technical, yet friendly style.
You understand you are Eagle 2, and may refer to yourself as such when asked."""}
    ]
    # get texts from conversations
    # converstion = get_conv_template(sft_format)
    # only use the last 3 round of messages
    # latest_messages = messages[-3:]

    all_images_num = 0
    for mid, message in enumerate(messages):
        if message["role"] == "user":
            record = {
                "role": message["role"],
                "content": [],
            }
            if "images" in message:
                per_round_images = message["images"]
                for image in per_round_images:
                    if isinstance(image, Image.Image) and all_images_num < 128:
                        record["content"].append(
                            {
                                "type": "image",
                                "image": image,
                            }
                        )
                        all_images_num += 1
                    elif isinstance(image, str) and image.endswith((".jpeg", ".jpg", ".png", ".gif")) and all_images_num < 128:
                        record["content"].append(
                            {
                                "type": "image",
                                "image": image,
                            }
                        )
                        all_images_num += 1
                    elif isinstance(image, str) and image.endswith((".mp4", ".mov", ".avi", ".webm")) and all_images_num < 128 - video_nframes:
                        record["content"].append(
                            {
                                "type": "video",
                                "video": image,
                                "nframes": video_nframes,
                            }
                        )
                        all_images_num += video_nframes
            if "content" in message:
                record["content"].append(
                    {
                        "type": "text",
                        "text": str(message["content"]).strip(),
                    }
                )
            results.append(record)
        elif message["role"] == "assistant":
            formatted_answer = message["content"].strip()
            # The model may emit a reasoning block wrapped in ◁think▷ ... ◁/think▷ before the
            # final answer (e.g. "◁think▷The user just said hello; keep the reply open and
            # friendly.◁/think▷Hello! How can I help you?"). Drop everything up to and
            # including the closing think token so only the final answer re-enters history.
            # FIXME: this is a hack to remove the thinking texts
            # formatted_answer = re.sub(r"◁think▷.*◁/think▷", "", formatted_answer)
            think_end_token = '◁/think▷'
            formatted_answer = formatted_answer.split(think_end_token)[-1]
            results.append(
                {
                    "role": message["role"],
                    "content": [
                        {
                            "type": "text",
                            "text": formatted_answer,
                        }
                    ],
                }
            )
            assert (
                formatted_answer.count(processor.image_token) == 0
            ), f"there should be no {processor.image_token} in the assistant's reply, but got {messages}"

    # print(f"messages = {results}")
    text = processor.apply_chat_template(results, add_generation_prompt=False)
    # print(f"raw text = {text}")

    image_inputs, video_inputs, video_kwargs = processor.process_vision_info(results, return_video_kwargs=True)

    inputs = processor(
        images=image_inputs,
        videos=video_inputs,
        text=[text],
        return_tensors="pt",
        padding=True,
        truncation=True,
        videos_kwargs=video_kwargs,
    )
    return inputs


@torch.no_grad()
@torch.inference_mode()
def eagle_vl_generate(
    model: torch.nn.Module,
    processor: AutoProcessor,
    conversations: list[dict],
    stop_words: list,
    max_length: int = 256,
    temperature: float = 1.0,
    top_p: float = 1.0,
    chunk_size: int = -1,
    video_nframes: int = 16,
):
    # convert conversation to inputs
    print(f"conversations = {conversations}")
    inputs = preprocess(conversations, processor=processor, video_nframes=video_nframes)
    inputs = inputs.to(model.device)

    return generate(
        model,
        processor,
        inputs,
        max_gen_len=max_length,
        temperature=temperature,
        top_p=top_p,
        stop_words=stop_words,
        chunk_size=chunk_size,
    )


def generate(
    model,
    processor,
    inputs,
    max_gen_len: int = 256,
    temperature: float = 0,
    top_p: float = 0.95,
    stop_words: List[str] = [],
    chunk_size: int = -1,
):
    """Stream the text output from the multimodality model with prompt and image inputs."""
    tokenizer = processor.tokenizer
    stop_words_ids = [torch.tensor(tokenizer.encode(stop_word)) for stop_word in stop_words]
    stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)])
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)

    kwargs = dict(
        **inputs,
        max_new_tokens=max_gen_len,
        streamer=streamer,
        stopping_criteria=stopping_criteria,
    )

    if temperature > 0:
        kwargs.update(
            {
                "do_sample": True,
                "top_p": top_p,
                "temperature": temperature,
            }
        )
    else:
        kwargs["do_sample"] = False

    # run generation on a worker thread; the streamer yields tokens as they arrive
    thread = Thread(target=model.generate, kwargs=kwargs)
    thread.start()

    yield from streamer
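End to end, the module is used by loading the model once and consuming the streamer as a generator. A minimal sketch, assuming HF_TOKEN is set, a CUDA device is available, and using an illustrative message payload shaped the way preprocess expects:

# Sketch: one-shot streaming generation with the helpers above.
model, processor = load_model("nvidia/Eagle2-8B")

messages = [
    {"role": "user", "content": "Describe the video.", "images": ["videos/demo1.mp4"]},
]
for chunk in eagle_vl_generate(
    model, processor, messages,
    stop_words=["<|im_end|>"], max_length=256, temperature=0.7, top_p=0.95,
):
    print(chunk, end="", flush=True)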
serve/utils.py
ADDED
@@ -0,0 +1,290 @@
from __future__ import annotations

import html
import logging
import io
import os
import re
import base64
import time
from PIL import Image, ImageDraw, ImageFont

import mdtex2html
from markdown import markdown
from pygments import highlight
from pygments.formatters import HtmlFormatter
from pygments.lexers import ClassNotFound, get_lexer_by_name, guess_lexer


ALREADY_CONVERTED_MARK = "<!-- ALREADY CONVERTED BY PARSER. -->"
BOX2COLOR = {
    0: (255, 0, 0),
    1: (0, 255, 0),
    2: (0, 0, 255),
}
MAX_IMAGE_SIZE = 1024
MIN_IMAGE_SIZE = 1024
logger = logging.getLogger("gradio_logger")


def configure_logger(log_dir: str = "logs"):
    logger = logging.getLogger("gradio_logger")
    logger.setLevel(logging.DEBUG)

    timestr = time.strftime("%Y%m%d-%H%M%S")
    os.makedirs(log_dir, exist_ok=True)
    file_handler = logging.FileHandler(f"{log_dir}/{timestr}_gradio_log.log")
    console_handler = logging.StreamHandler()

    formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    console_handler.setFormatter(formatter)
    file_handler.setFormatter(formatter)

    console_handler.setLevel(logging.INFO)
    file_handler.setLevel(logging.INFO)

    logger.addHandler(console_handler)
    logger.addHandler(file_handler)

    return logger


def strip_stop_words(x, stop_words):
    for w in stop_words:
        if w in x:
            return x[: x.index(w)].strip()
    return x.strip()


def format_output(history, text, x):
    updated_history = history + [[text, x]]
    a = [[y[0], convert_to_markdown(y[1])] for y in updated_history]
    return a, updated_history


def markdown_to_html_with_syntax_highlight(md_str):  # deprecated
    def replacer(match):
        lang = match.group(1) or "text"
        code = match.group(2)

        try:
            lexer = get_lexer_by_name(lang, stripall=True)
        except ValueError:
            lexer = get_lexer_by_name("text", stripall=True)

        formatter = HtmlFormatter()
        highlighted_code = highlight(code, lexer, formatter)

        return f'<pre><code class="{lang}">{highlighted_code}</code></pre>'

    code_block_pattern = r"```(\w+)?\n([\s\S]+?)\n```"
    md_str = re.sub(code_block_pattern, replacer, md_str, flags=re.MULTILINE)

    html_str = markdown(md_str)
    return html_str


def normalize_markdown(md_text: str) -> str:  # deprecated
    lines = md_text.split("\n")
    normalized_lines = []
    inside_list = False

    for i, line in enumerate(lines):
        if re.match(r"^(\d+\.|-|\*|\+)\s", line.strip()):
            if not inside_list and i > 0 and lines[i - 1].strip() != "":
                normalized_lines.append("")
            inside_list = True
            normalized_lines.append(line)
        elif inside_list and line.strip() == "":
            if i < len(lines) - 1 and not re.match(r"^(\d+\.|-|\*|\+)\s", lines[i + 1].strip()):
                normalized_lines.append(line)
            continue
        else:
            inside_list = False
            normalized_lines.append(line)

    return "\n".join(normalized_lines)


def convert_mdtext(md_text):
    code_block_pattern = re.compile(r"```(.*?)(?:```|$)", re.DOTALL)
    inline_code_pattern = re.compile(r"`(.*?)`", re.DOTALL)
    code_blocks = code_block_pattern.findall(md_text)
    non_code_parts = code_block_pattern.split(md_text)[::2]

    result = []
    for non_code, code in zip(non_code_parts, code_blocks + [""]):
        if non_code.strip():
            non_code = normalize_markdown(non_code)
            if inline_code_pattern.search(non_code):
                result.append(markdown(non_code, extensions=["tables"]))
            else:
                result.append(mdtex2html.convert(non_code, extensions=["tables"]))
        if code.strip():
            code = f"\n```{code}\n\n```"
            code = markdown_to_html_with_syntax_highlight(code)
            result.append(code)
    result = "".join(result)
    result += ALREADY_CONVERTED_MARK
    return result


def convert_asis(userinput):
    return f'<p style="white-space:pre-wrap;">{html.escape(userinput)}</p>{ALREADY_CONVERTED_MARK}'


def is_stop_word_or_prefix(s: str, stop_words: list) -> bool:
    return any(s.endswith(stop_word) for stop_word in stop_words)


def detect_converted_mark(userinput):
    return bool(userinput.endswith(ALREADY_CONVERTED_MARK))


def detect_language(code):
    first_line = "" if code.startswith("\n") else code.strip().split("\n", 1)[0]
    language = first_line.lower() if first_line else ""
    code_without_language = code[len(first_line):].lstrip() if first_line else code
    return language, code_without_language


def convert_to_markdown(text):
    text = text.replace("$", "&#36;")  # escape dollar signs so they are not rendered as LaTeX
    text = text.replace("\r\n", "\n")

    def replace_leading_tabs_and_spaces(line):
        new_line = []

        for char in line:
            if char == "\t":
                new_line.append("&#9;")
            elif char == " ":
                new_line.append("&nbsp;")
            else:
                break
        return "".join(new_line) + line[len(new_line):]

    markdown_text = ""
    lines = text.split("\n")
    in_code_block = False

    for line in lines:
        if in_code_block is False and line.startswith("```"):
            in_code_block = True
            markdown_text += f"{line}\n"
        elif in_code_block is True and line.startswith("```"):
            in_code_block = False
            markdown_text += f"{line}\n"
        elif in_code_block:
            markdown_text += f"{line}\n"
        else:
            line = replace_leading_tabs_and_spaces(line)
            line = re.sub(r"^(#)", r"\\\1", line)
            markdown_text += f"{line}  \n"

    return markdown_text


def add_language_tag(text):
    def detect_language(code_block):
        try:
            lexer = guess_lexer(code_block)
            return lexer.name.lower()
        except ClassNotFound:
            return ""

    code_block_pattern = re.compile(r"(```)(\w*\n[^`]+```)", re.MULTILINE)

    def replacement(match):
        code_block = match.group(2)
        if match.group(2).startswith("\n"):
            language = detect_language(code_block)
            return f"```{language}{code_block}```" if language else f"```\n{code_block}```"
        else:
            return match.group(1) + code_block + "```"

    text2 = code_block_pattern.sub(replacement, text)
    return text2


def is_variable_assigned(var_name: str) -> bool:
    return var_name in locals()


def pil_to_base64(
    image: Image.Image,
    alt: str = "user upload image",
    resize: bool = True,
    max_size: int = MAX_IMAGE_SIZE,
    min_size: int = MIN_IMAGE_SIZE,
    format: str = "JPEG",
    quality: int = 95,
) -> str:
    """
    Convert a PIL image to an HTML <img> tag with a base64 data URI.
    """
    if resize:
        max_hw, min_hw = max(image.size), min(image.size)
        aspect_ratio = max_hw / min_hw
        shortest_edge = int(min(max_size / aspect_ratio, min_size, min_hw))
        longest_edge = int(shortest_edge * aspect_ratio)
        W, H = image.size
        if H > W:
            H, W = longest_edge, shortest_edge
        else:
            H, W = shortest_edge, longest_edge
        image = image.resize((W, H))

    buffered = io.BytesIO()
    image.save(buffered, format=format, quality=quality)
    img_b64_str = base64.b64encode(buffered.getvalue()).decode()
    # use the actual encoding format in the data URI instead of hard-coding PNG
    img_str = f'<img src="data:image/{format.lower()};base64,{img_b64_str}" alt="{alt}" />'

    return img_str


def parse_ref_bbox(response, image: Image.Image):
    try:
        image = image.copy()
        image_w, image_h = image.size  # PIL size is (width, height)
        draw = ImageDraw.Draw(image)

        ref = re.findall(r'<\|ref\|>.*?<\|/ref\|>', response)
        bbox = re.findall(r'<\|det\|>.*?<\|/det\|>', response)
        assert len(ref) == len(bbox)

        if len(ref) == 0:
            return None

        boxes, labels = [], []
        for box, label in zip(bbox, ref):
            box = box.replace('<|det|>', '').replace('<|/det|>', '')
            label = label.replace('<|ref|>', '').replace('<|/ref|>', '')
            box = box[1:-1]
            for onebox in re.findall(r'\[.*?\]', box):
                boxes.append(eval(onebox))
                labels.append(label)

        for indice, (box, label) in enumerate(zip(boxes, labels)):
            # coordinates are normalized to [0, 999]; rescale to pixels
            box = (
                int(box[0] / 999 * image_w),
                int(box[1] / 999 * image_h),
                int(box[2] / 999 * image_w),
                int(box[3] / 999 * image_h),
            )

            box_color = BOX2COLOR[indice % len(BOX2COLOR.keys())]
            box_width = 3
            draw.rectangle(box, outline=box_color, width=box_width)

            text_x = box[0]
            text_y = box[1] - 20
            text_color = box_color
            font = ImageFont.truetype("eagle_vl/serve/assets/simsun.ttc", size=20)
            draw.text((text_x, text_y), label, font=font, fill=text_color)

        return image
    except Exception as e:
        logger.error(f"Error parsing reference bounding boxes: {e}")
        return None
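The resize contract in pil_to_base64 caps the shorter edge at min_size, shrinking further if the longer edge would exceed max_size at the same aspect ratio. A quick sketch with illustrative values:

# Sketch: embedding a PIL image as an inline <img> tag.
from PIL import Image
from serve.utils import pil_to_base64

img = Image.new("RGB", (1600, 900), color=(118, 185, 0))
tag = pil_to_base64(img, alt="demo", max_size=800, min_size=400)
# shorter edge -> min(800 / (1600/900), 400, 900) = 400 px; longer edge ≈ 711 px
print(tag[:60])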
videos/demo1.mp4
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bdd8b0474a50d8ee91daf2b203f657b54054b5177a40af5b7f8838858a633fec
size 40465978
videos/demo2.mp4
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:cdfff3c21bfc92d8dee611795e79fb7a6270fa8dfe3ae87de82c0879dbc0f177
size 18403395