Spaces:
Sleeping
Sleeping
| import os | |
| import gradio as gr | |
| import spaces | |
| import urllib.request | |
| import shutil | |
| import dolphin | |
| from dolphin.languages import LANGUAGE_CODES, LANGUAGE_REGION_CODES | |
# Model checkpoints are cached in a "models" directory next to this file.
MODEL_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "models")
os.makedirs(MODEL_DIR, exist_ok=True)

# (label, value) pairs for the language dropdown: one "<code>: <first name>"
# entry per language, ordered by label, preceded by an auto-detect entry
# whose value is None.
language_options = [
    ("Auto-detect", None),
    *sorted(
        ((f"{code}: {name[0]}", code) for code, name in LANGUAGE_CODES.items()),
        key=lambda option: option[0],
    ),
]
# Dropdown label -> model key used for checkpoint file names and download URLs.
MODELS = {
    "base (140M)": "base",
    "small (372M)": "small",
}

# Every remote file lives under a "dolphin-<repo>" repository of the
# DataoceanAI organisation on the Hugging Face Hub.
_HF_RESOLVE_URL = "https://huggingface.co/DataoceanAI/dolphin-{repo}/resolve/main/{filename}"

# Model key -> checkpoint URL ("<key>.pt" inside the "dolphin-<key>" repository).
MODEL_URLS = {
    key: _HF_RESOLVE_URL.format(repo=key, filename=f"{key}.pt")
    for key in ("base", "small")
}

# Asset file name -> URL; all auxiliary assets are taken from the "dolphin-base" repository.
ASSET_URLS = {
    filename: _HF_RESOLVE_URL.format(repo="base", filename=filename)
    for filename in ("bpe.model", "config.yaml", "feats_stats.npz")
}
# Language code -> list of (label, region code) pairs, built from the
# "<lang>-<region>" keys of LANGUAGE_REGION_CODES. Keys without a "-" are
# skipped; the label is "<region>: <first name>".
language_to_regions = {}
for lang_region, names in LANGUAGE_REGION_CODES.items():
    if "-" not in lang_region:
        continue
    # Split on the first "-" only, so any later "-" stays in the region part.
    lang, region = lang_region.split("-", 1)
    language_to_regions.setdefault(lang, []).append(
        (f"{region}: {names[0]}", region)
    )
def download_file(url, dest_path):
    """Download ``url`` to ``dest_path`` unless that file already exists.

    The payload is first streamed to ``<dest_path>.part`` and only moved to
    ``dest_path`` once the transfer has completed. Because an existing
    ``dest_path`` is treated as a finished download, writing to it directly
    would let an interrupted transfer leave a truncated file that every later
    call silently accepts.

    Args:
        url: Location to fetch; anything ``urllib.request.urlopen`` accepts.
        dest_path: Final path of the downloaded file.

    Raises:
        Whatever ``urllib.request.urlopen`` or the file operations raise
        (for example ``urllib.error.URLError`` or ``OSError``). The partial
        file is removed before the exception propagates.
    """
    if os.path.exists(dest_path):
        print(f"File already exists: {dest_path}")
        return

    print(f"Downloading {url} to {dest_path}")
    partial_path = f"{dest_path}.part"
    try:
        with urllib.request.urlopen(url) as response, open(partial_path, "wb") as out_file:
            shutil.copyfileobj(response, out_file)
        # Promote the finished file in one step so dest_path is never partial.
        os.replace(partial_path, dest_path)
    except BaseException:
        # Also covers KeyboardInterrupt: never leave a stale .part file behind.
        try:
            os.remove(partial_path)
        except OSError:
            pass
        raise
    print(f"Downloaded {dest_path}")
def ensure_assets_downloaded():
    """Fetch every file listed in ``ASSET_URLS`` into ``dolphin/assets``.

    The target directory is ``dolphin/assets`` next to this file and is
    created if needed. Each asset is handed to ``download_file`` together
    with its destination path inside that directory.
    """
    app_root = os.path.dirname(os.path.abspath(__file__))
    assets_dir = os.path.join(app_root, "dolphin", "assets")
    os.makedirs(assets_dir, exist_ok=True)

    for asset_name, asset_url in ASSET_URLS.items():
        target_path = os.path.join(assets_dir, asset_name)
        download_file(asset_url, target_path)
def ensure_model_downloaded(model_key):
    """Return the local checkpoint path for ``model_key``, downloading it if absent.

    Args:
        model_key: A key of ``MODEL_URLS``.

    Returns:
        Path of ``<model_key>.pt`` inside ``MODEL_DIR``.

    Raises:
        ValueError: If ``model_key`` has no entry in ``MODEL_URLS``.
    """
    if model_key not in MODEL_URLS:
        raise ValueError(f"Unknown model: {model_key}")

    checkpoint_path = os.path.join(MODEL_DIR, f"{model_key}.pt")
    if os.path.exists(checkpoint_path):
        return checkpoint_path

    download_file(MODEL_URLS[model_key], checkpoint_path)
    return checkpoint_path
def update_regions(language):
    """Compute the region dropdown state for the selected language.

    Args:
        language: Selected language code, or a falsy value for auto-detect.

    Returns:
        A ``(choices, default_value, visible)`` tuple. ``choices`` is a new
        list of ``(label, region)`` pairs sorted by label, ``default_value``
        is the region of the first choice, and ``visible`` is ``True``. When
        no language is selected or it has no regions the result is
        ``([], None, False)``.
    """
    regions = language_to_regions.get(language) if language else None
    if not regions:
        return [], None, False

    # Sort a copy: the lists in language_to_regions are shared module state,
    # so they must not be reordered in place or handed out for mutation.
    ordered = sorted(regions, key=lambda option: option[0])
    return ordered, ordered[0][1], True
def transcribe_audio(audio_file, model_name, language, region, predict_timestamps, padding_speech):
    """Transcribe an audio file with the selected Dolphin model.

    Args:
        audio_file: Path of the uploaded or recorded audio; falsy when the
            user has not provided any.
        model_name: A key of ``MODELS`` (the dropdown label).
        language: Language code to force, or a falsy value to auto-detect.
        region: Region code to force; only used together with ``language``.
        predict_timestamps: Forwarded to the model as ``predict_time``.
        padding_speech: Forwarded to the model as ``padding_speech``.

    Returns:
        A ``(transcription, detected_info)`` tuple of strings. Without audio
        the first element asks the user to provide some. On any failure the
        tuple is ``("Error: <message>", "Transcription failed")``; the error
        is printed rather than raised so the UI always receives two strings.
    """
    if not audio_file:
        return "Please upload or record audio first", ""

    try:
        ensure_assets_downloaded()
        model_key = MODELS[model_name]
        ensure_model_downloaded(model_key)

        # NOTE(review): "cuda" is hard-coded and the file's `spaces` import is
        # unused; if this runs on ZeroGPU hardware the handler presumably
        # needs the `@spaces.GPU` decorator - confirm.
        model = dolphin.load_model(model_key, MODEL_DIR, "cuda")
        waveform = dolphin.load_audio(audio_file)

        kwargs = {
            "predict_time": predict_timestamps,
            "padding_speech": padding_speech,
        }
        if language:
            kwargs["lang_sym"] = language
            # A region is only forwarded together with its language.
            if region:
                kwargs["region_sym"] = region

        result = model(waveform, **kwargs)

        detected_info = f"Detected language: {result.language}"
        if result.region:
            detected_info += f", region: {result.region}"
        return result.text, detected_info
    except Exception as e:
        # UI boundary: report the failure in the output boxes instead of raising.
        print(f"Error in transcribe_audio: {e}")
        return f"Error: {e}", "Transcription failed"
# Gradio UI: inputs and options on the left, results on the right.
# NOTE(review): the nesting of rows/columns below is reconstructed from the
# statement order - confirm against the deployed layout.
with gr.Blocks(title="Dolphin Speech Recognition") as demo:
    gr.Markdown("# Dolphin ASR")
    gr.Markdown("""
A multilingual, multitask ASR model supporting 40 Eastern languages and 22 Chinese dialects.
This model is from [DataoceanAI/Dolphin](https://github.com/DataoceanAI/Dolphin), for speech recognition in
Eastern languages including Chinese, Japanese, Korean, and many more.
""")

    with gr.Row():
        # Left column: audio source, decoding options and the trigger button.
        with gr.Column():
            audio_input = gr.Audio(
                label="Upload or Record Audio",
                type="filepath",
            )

            with gr.Row():
                # The second entry of MODELS is preselected.
                model_dropdown = gr.Dropdown(
                    label="Model Size",
                    choices=list(MODELS),
                    value=list(MODELS)[1],
                )

            with gr.Row():
                # The first language option is preselected.
                language_dropdown = gr.Dropdown(
                    label="Language",
                    info="Default is auto-detect",
                    choices=language_options,
                    value=language_options[0][1],
                )
                # Hidden and empty until the language change handler fills it.
                region_dropdown = gr.Dropdown(
                    label="Region",
                    choices=[],
                    value=None,
                    visible=False,
                )

            with gr.Row():
                timestamp_checkbox = gr.Checkbox(
                    label="Include Timestamps",
                    value=True,
                )
                padding_checkbox = gr.Checkbox(
                    label="Pad Speech to 30s",
                    value=False,
                )

            transcribe_button = gr.Button("Transcribe", variant="primary")

        # Right column: transcription and detected language.
        with gr.Column():
            output_text = gr.Textbox(label="Transcription", lines=10)
            language_info = gr.Textbox(label="Detected Language", lines=1)

    def on_language_change(language):
        """Refresh the region dropdown after the language selection changes."""
        choices, selected, shown = update_regions(language)
        region_update = gr.update(choices=choices, value=selected, visible=shown)
        return {region_dropdown: region_update}

    language_dropdown.change(
        fn=on_language_change,
        inputs=[language_dropdown],
        outputs=[region_dropdown],
    )

    # The input order must match the parameter order of transcribe_audio.
    transcribe_button.click(
        fn=transcribe_audio,
        inputs=[
            audio_input,
            model_dropdown,
            language_dropdown,
            region_dropdown,
            timestamp_checkbox,
            padding_checkbox,
        ],
        outputs=[output_text, language_info],
    )

    gr.Markdown("""
## Usage Notes
- The model supports 40 Eastern languages and 22 Chinese dialects
- You can let the model auto-detect language or specify language and region
- Timestamps can be included in the output
- Speech can be padded to 30 seconds for better processing
## Credits
- Model: [DataoceanAI/Dolphin](https://github.com/DataoceanAI/Dolphin)
- Paper: [Dolphin: A Multilingual Model for Eastern Languages](https://arxiv.org/abs/2503.20212)
""")
# Fetch the shared assets and every model checkpoint before the UI starts.
ensure_assets_downloaded()

for model_key in MODELS.values():
    ensure_model_downloaded(model_key)

# Start the Gradio app.
demo.launch()