Update

- .pre-commit-config.yaml +59 -36
- README.md +1 -1
- app.py +74 -77
- model.py +91 -116
.pre-commit-config.yaml
CHANGED
@@ -1,37 +1,60 @@
-exclude: ^(ViTPose/|mmdet_configs/configs/)
 repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.6.0
+    hooks:
+      - id: check-executables-have-shebangs
+      - id: check-json
+      - id: check-merge-conflict
+      - id: check-shebang-scripts-are-executable
+      - id: check-toml
+      - id: check-yaml
+      - id: end-of-file-fixer
+      - id: mixed-line-ending
+        args: ["--fix=lf"]
+      - id: requirements-txt-fixer
+      - id: trailing-whitespace
+  - repo: https://github.com/myint/docformatter
+    rev: v1.7.5
+    hooks:
+      - id: docformatter
+        args: ["--in-place"]
+  - repo: https://github.com/pycqa/isort
+    rev: 5.13.2
+    hooks:
+      - id: isort
+        args: ["--profile", "black"]
+  - repo: https://github.com/pre-commit/mirrors-mypy
+    rev: v1.10.0
+    hooks:
+      - id: mypy
+        args: ["--ignore-missing-imports"]
+        additional_dependencies:
+          [
+            "types-python-slugify",
+            "types-requests",
+            "types-PyYAML",
+            "types-pytz",
+          ]
+  - repo: https://github.com/psf/black
+    rev: 24.4.2
+    hooks:
+      - id: black
+        language_version: python3.10
+        args: ["--line-length", "119"]
+  - repo: https://github.com/kynan/nbstripout
+    rev: 0.7.1
+    hooks:
+      - id: nbstripout
+        args:
+          [
+            "--extra-keys",
+            "metadata.interpreter metadata.kernelspec cell.metadata.pycharm",
+          ]
+  - repo: https://github.com/nbQA-dev/nbQA
+    rev: 1.8.5
+    hooks:
+      - id: nbqa-black
+      - id: nbqa-pyupgrade
+        args: ["--py37-plus"]
+      - id: nbqa-isort
+        args: ["--float-to-top"]
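The hooks above all run through the pre-commit framework, so the same checks can be exercised locally before pushing to the Space. A minimal sketch, reusing the subprocess + shlex idiom the updated app.py itself uses for setup commands (the two commands are standard pre-commit CLI usage, not part of this repo):

```python
# Run the hooks defined in .pre-commit-config.yaml against the whole repo.
import shlex
import subprocess

subprocess.run(shlex.split("pip install pre-commit"), check=True)
subprocess.run(shlex.split("pre-commit run --all-files"), check=True)
```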
README.md
CHANGED
@@ -4,7 +4,7 @@ emoji: 🦀
 colorFrom: gray
 colorTo: purple
 sdk: gradio
-sdk_version:
+sdk_version: 4.36.1
 app_file: app.py
 pinned: false
 suggested_hardware: t4-small
app.py
CHANGED
@@ -2,109 +2,106 @@

 from __future__ import annotations

+import os
 import pathlib
+import shlex
+import subprocess
 import tarfile

+if os.getenv("SYSTEM") == "spaces":
+    subprocess.run(shlex.split("pip install click==7.1.2"))
+    subprocess.run(shlex.split("pip install typer==0.9.4"))
+
+    import mim
+
+    mim.uninstall("mmcv-full", confirm_yes=True)
+    mim.install("mmcv-full==1.5.0", is_yes=True)
+
+    subprocess.call(shlex.split("pip uninstall -y opencv-python"))
+    subprocess.call(shlex.split("pip uninstall -y opencv-python-headless"))
+    subprocess.call(shlex.split("pip install opencv-python-headless==4.8.0.74"))
+
+
 import gradio as gr

 from model import AppModel

+DESCRIPTION = """# [ViTPose](https://github.com/ViTAE-Transformer/ViTPose)

 Related app: [https://huggingface.co/spaces/Gradio-Blocks/ViTPose](https://huggingface.co/spaces/Gradio-Blocks/ViTPose)
+"""


 def extract_tar() -> None:
+    if pathlib.Path("mmdet_configs/configs").exists():
         return
+    with tarfile.open("mmdet_configs/configs.tar") as f:
+        f.extractall("mmdet_configs")


 extract_tar()

 model = AppModel()

+with gr.Blocks(css="style.css") as demo:
     gr.Markdown(DESCRIPTION)

     with gr.Row():
         with gr.Column():
+            input_video = gr.Video(label="Input Video", format="mp4", elem_id="input_video")
+            detector_name = gr.Dropdown(
+                label="Detector", choices=list(model.det_model.MODEL_DICT.keys()), value=model.det_model.model_name
+            )
             pose_model_name = gr.Dropdown(
+                label="Pose Model", choices=list(model.pose_model.MODEL_DICT.keys()), value=model.pose_model.model_name
+            )
+            det_score_threshold = gr.Slider(label="Box Score Threshold", minimum=0, maximum=1, step=0.05, value=0.5)
+            max_num_frames = gr.Slider(label="Maximum Number of Frames", minimum=1, maximum=300, step=1, value=60)
+            predict_button = gr.Button("Predict")
-            pose_preds = gr.Variable()
+            pose_preds = gr.State()
+
+            paths = sorted(pathlib.Path("videos").rglob("*.mp4"))
+            gr.Examples(examples=[[path.as_posix()] for path in paths], inputs=input_video)

         with gr.Column():
+            result = gr.Video(label="Result", format="mp4", elem_id="result")
             vis_kpt_score_threshold = gr.Slider(
+                label="Visualization Score Threshold", minimum=0, maximum=1, step=0.05, value=0.3
+            )
+            vis_dot_radius = gr.Slider(label="Dot Radius", minimum=1, maximum=10, step=1, value=4)
+            vis_line_thickness = gr.Slider(label="Line Thickness", minimum=1, maximum=10, step=1, value=2)
+            redraw_button = gr.Button("Redraw")

     detector_name.change(fn=model.det_model.set_model, inputs=detector_name)
+    pose_model_name.change(fn=model.pose_model.set_model, inputs=pose_model_name)
+    predict_button.click(
+        fn=model.run,
+        inputs=[
+            input_video,
+            detector_name,
+            pose_model_name,
+            det_score_threshold,
+            max_num_frames,
+            vis_kpt_score_threshold,
+            vis_dot_radius,
+            vis_line_thickness,
+        ],
+        outputs=[
+            result,
+            pose_preds,
+        ],
+    )
+    redraw_button.click(
+        fn=model.visualize_pose_results,
+        inputs=[
+            input_video,
+            pose_preds,
+            vis_kpt_score_threshold,
+            vis_dot_radius,
+            vis_line_thickness,
+        ],
+        outputs=result,
+    )
+
+if __name__ == "__main__":
+    demo.queue(max_size=10).launch()
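Aside from the black/double-quote reformatting, the functional change in app.py is that `pose_preds` is now `gr.State()` rather than the deprecated `gr.Variable()`: the state component carries the pose predictions from the Predict click to the Redraw click, so keypoints can be redrawn without re-running detection. A minimal sketch of that pattern, with toy lambdas standing in for `model.run` and `model.visualize_pose_results`:

```python
import gradio as gr

with gr.Blocks() as demo:
    text = gr.Textbox(label="Input")
    out = gr.Textbox(label="Output")
    cached = gr.State()  # holds the expensive result between clicks

    predict_button = gr.Button("Predict")
    redraw_button = gr.Button("Redraw")

    # Predict computes once and stores the result in the state component.
    predict_button.click(fn=lambda x: (x.upper(), x.upper()), inputs=text, outputs=[out, cached])
    # Redraw reuses the stored value instead of recomputing it.
    redraw_button.click(fn=lambda s: s, inputs=cached, outputs=out)

if __name__ == "__main__":
    demo.launch()
```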
model.py
CHANGED
@@ -1,68 +1,49 @@
 from __future__ import annotations

-import os
-import shlex
-import subprocess
 import sys
 import tempfile

-if os.getenv('SYSTEM') == 'spaces':
-    import mim
-
-    mim.uninstall('mmcv-full', confirm_yes=True)
-    mim.install('mmcv-full==1.5.0', is_yes=True)
-
-    subprocess.call(shlex.split('pip uninstall -y opencv-python'))
-    subprocess.call(shlex.split('pip uninstall -y opencv-python-headless'))
-    subprocess.call(
-        shlex.split('pip install opencv-python-headless==4.8.0.74'))
-
 import cv2
 import huggingface_hub
 import numpy as np
 import torch
 import torch.nn as nn

+sys.path.insert(0, "ViTPose/")

 from mmdet.apis import inference_detector, init_detector
+from mmpose.apis import (
+    inference_top_down_pose_model,
+    init_pose_model,
+    process_mmdet_results,
+    vis_pose_result,
+)


 class DetModel:
     MODEL_DICT = {
+        "YOLOX-tiny": {
+            "config": "mmdet_configs/configs/yolox/yolox_tiny_8x8_300e_coco.py",
+            "model": "https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_tiny_8x8_300e_coco/yolox_tiny_8x8_300e_coco_20211124_171234-b4047906.pth",
         },
+        "YOLOX-s": {
+            "config": "mmdet_configs/configs/yolox/yolox_s_8x8_300e_coco.py",
+            "model": "https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_s_8x8_300e_coco/yolox_s_8x8_300e_coco_20211121_095711-4592a793.pth",
         },
+        "YOLOX-l": {
+            "config": "mmdet_configs/configs/yolox/yolox_l_8x8_300e_coco.py",
+            "model": "https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_l_8x8_300e_coco/yolox_l_8x8_300e_coco_20211126_140236-d3bd2b23.pth",
         },
+        "YOLOX-x": {
+            "config": "mmdet_configs/configs/yolox/yolox_x_8x8_300e_coco.py",
+            "model": "https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_x_8x8_300e_coco/yolox_x_8x8_300e_coco_20211126_140254-1ef88d67.pth",
         },
     }

     def __init__(self):
+        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
         self._load_all_models_once()
+        self.model_name = "YOLOX-l"
         self.model = self._load_model(self.model_name)

     def _load_all_models_once(self) -> None:
@@ -71,7 +52,7 @@ class DetModel:

     def _load_model(self, name: str) -> nn.Module:
         d = self.MODEL_DICT[name]
+        return init_detector(d["config"], d["model"], device=self.device)

     def set_model(self, name: str) -> None:
         if name == self.model_name:
@@ -79,9 +60,7 @@ class DetModel:
         self.model_name = name
         self.model = self._load_model(name)

+    def detect_and_visualize(self, image: np.ndarray, score_threshold: float) -> tuple[list[np.ndarray], np.ndarray]:
         out = self.detect(image)
         vis = self.visualize_detection_results(image, out, score_threshold)
         return out, vis
@@ -92,50 +71,40 @@ class DetModel:
         return out

     def visualize_detection_results(
+        self, image: np.ndarray, detection_results: list[np.ndarray], score_threshold: float = 0.3
+    ) -> np.ndarray:
         person_det = [detection_results[0]] + [np.array([]).reshape(0, 5)] * 79

         image = image[:, :, ::-1]  # RGB -> BGR
+        vis = self.model.show_result(
+            image, person_det, score_thr=score_threshold, bbox_color=None, text_color=(200, 200, 200), mask_color=None
+        )
         return vis[:, :, ::-1]  # BGR -> RGB


 class PoseModel:
     MODEL_DICT = {
+        "ViTPose-B (single-task train)": {
+            "config": "ViTPose/configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/ViTPose_base_coco_256x192.py",
+            "model": "models/vitpose-b.pth",
         },
+        "ViTPose-L (single-task train)": {
+            "config": "ViTPose/configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/ViTPose_large_coco_256x192.py",
+            "model": "models/vitpose-l.pth",
         },
+        "ViTPose-B (multi-task train, COCO)": {
+            "config": "ViTPose/configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/ViTPose_base_coco_256x192.py",
+            "model": "models/vitpose-b-multi-coco.pth",
         },
+        "ViTPose-L (multi-task train, COCO)": {
+            "config": "ViTPose/configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/ViTPose_large_coco_256x192.py",
+            "model": "models/vitpose-l-multi-coco.pth",
         },
     }

     def __init__(self):
+        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+        self.model_name = "ViTPose-B (multi-task train, COCO)"
         self.model = self._load_model(self.model_name)

     def _load_all_models_once(self) -> None:
@@ -144,9 +113,8 @@ class PoseModel:

     def _load_model(self, name: str) -> nn.Module:
         d = self.MODEL_DICT[name]
+        ckpt_path = huggingface_hub.hf_hub_download("public-data/ViTPose", d["model"])
+        model = init_pose_model(d["config"], ckpt_path, device=self.device)
         return model

     def set_model(self, name: str) -> None:
@@ -165,37 +133,36 @@ class PoseModel:
         vis_line_thickness: int,
     ) -> tuple[list[dict[str, np.ndarray]], np.ndarray]:
         out = self.predict_pose(image, det_results, box_score_threshold)
+        vis = self.visualize_pose_results(image, out, kpt_score_threshold, vis_dot_radius, vis_line_thickness)
         return out, vis

     def predict_pose(
+        self, image: np.ndarray, det_results: list[np.ndarray], box_score_threshold: float = 0.5
+    ) -> list[dict[str, np.ndarray]]:
         image = image[:, :, ::-1]  # RGB -> BGR
         person_results = process_mmdet_results(det_results, 1)
+        out, _ = inference_top_down_pose_model(
+            self.model, image, person_results=person_results, bbox_thr=box_score_threshold, format="xyxy"
+        )
         return out

+    def visualize_pose_results(
+        self,
+        image: np.ndarray,
+        pose_results: list[dict[str, np.ndarray]],
+        kpt_score_threshold: float = 0.3,
+        vis_dot_radius: int = 4,
+        vis_line_thickness: int = 1,
+    ) -> np.ndarray:
         image = image[:, :, ::-1]  # RGB -> BGR
+        vis = vis_pose_result(
+            self.model,
+            image,
+            pose_results,
+            kpt_score_thr=kpt_score_threshold,
+            radius=vis_dot_radius,
+            thickness=vis_line_thickness,
+        )
         return vis[:, :, ::-1]  # BGR -> RGB


@@ -205,10 +172,15 @@ class AppModel:
         self.pose_model = PoseModel()

     def run(
+        self,
+        video_path: str,
+        det_model_name: str,
+        pose_model_name: str,
+        box_score_threshold: float,
+        max_num_frames: int,
+        kpt_score_threshold: float,
+        vis_dot_radius: int,
+        vis_line_thickness: int,
     ) -> tuple[str, list[list[dict[str, np.ndarray]]]]:
         if video_path is None:
             return
@@ -222,8 +194,8 @@ class AppModel:

         preds_all = []

+        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
+        out_file = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
         writer = cv2.VideoWriter(out_file.name, fourcc, fps, (width, height))
         for _ in range(max_num_frames):
             ok, frame = cap.read()
@@ -232,8 +204,8 @@ class AppModel:
             rgb_frame = frame[:, :, ::-1]
             det_preds = self.det_model.detect(rgb_frame)
             preds, vis = self.pose_model.predict_pose_and_visualize(
+                rgb_frame, det_preds, box_score_threshold, kpt_score_threshold, vis_dot_radius, vis_line_thickness
+            )
             preds_all.append(preds)
             writer.write(vis[:, :, ::-1])
         cap.release()
@@ -241,11 +213,14 @@ class AppModel:

         return out_file.name, preds_all

+    def visualize_pose_results(
+        self,
+        video_path: str,
+        pose_preds_all: list[list[dict[str, np.ndarray]]],
+        kpt_score_threshold: float,
+        vis_dot_radius: int,
+        vis_line_thickness: int,
+    ) -> str:
         if video_path is None or pose_preds_all is None:
             return
         cap = cv2.VideoCapture(video_path)
@@ -253,8 +228,8 @@ class AppModel:
         width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
         fps = cap.get(cv2.CAP_PROP_FPS)

+        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
+        out_file = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
         writer = cv2.VideoWriter(out_file.name, fourcc, fps, (width, height))
         for pose_preds in pose_preds_all:
             ok, frame = cap.read()
@@ -262,8 +237,8 @@ class AppModel:
                 break
             rgb_frame = frame[:, :, ::-1]
             vis = self.pose_model.visualize_pose_results(
+                rgb_frame, pose_preds, kpt_score_threshold, vis_dot_radius, vis_line_thickness
+            )
             writer.write(vis[:, :, ::-1])
         cap.release()
         writer.release()
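Both `AppModel.run()` and `AppModel.visualize_pose_results()` share the same video I/O scaffolding: read frames with `cv2.VideoCapture`, write the visualized frames to a temporary `.mp4` via `cv2.VideoWriter`, and return the file name for Gradio to serve. A minimal sketch of that scaffolding in isolation (hypothetical `copy_first_frames` helper, no inference):

```python
import tempfile

import cv2


def copy_first_frames(video_path: str, max_num_frames: int = 60) -> str:
    """Copy up to max_num_frames frames of a video into a temporary mp4."""
    cap = cv2.VideoCapture(video_path)
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    fps = cap.get(cv2.CAP_PROP_FPS)

    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    # delete=False keeps the file on disk after the handle closes,
    # so the caller (e.g. a Gradio video component) can read it by name.
    out_file = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
    writer = cv2.VideoWriter(out_file.name, fourcc, fps, (width, height))
    for _ in range(max_num_frames):
        ok, frame = cap.read()
        if not ok:
            break
        writer.write(frame)  # cv2 frames are already BGR; no channel flip needed here
    cap.release()
    writer.release()
    return out_file.name
```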