|
|
|
|
|
import logging

import gradio as gr
import librosa
import numpy as np
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration
|
|
|
|
|
|
|
|
# Hugging Face Hub identifier of the fine-tuned Lao Whisper checkpoint.
model_id = "Phonepadith/whisper-3-large-lao-finetuned-v1"

# Use the GPU when torch reports one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# The processor bundles feature extraction and token decoding for the model.
processor = WhisperProcessor.from_pretrained(model_id)
model = WhisperForConditionalGeneration.from_pretrained(model_id).to(device)

print(f"Model loaded on: {device}")
|
|
|
|
|
# Sampling rate passed to the Whisper processor; all audio is brought to it.
_TARGET_SAMPLE_RATE = 16000

_logger = logging.getLogger(__name__)


def _to_mono_float32(audio_array):
    """Return *audio_array* as a 1-D float32 waveform with peak amplitude <= 1."""
    if np.issubdtype(audio_array.dtype, np.integer):
        # Integer PCM: scale by the dtype's largest representable value.
        full_scale = np.iinfo(audio_array.dtype).max
        audio_array = audio_array.astype(np.float32) / full_scale
    elif audio_array.dtype != np.float32:
        audio_array = audio_array.astype(np.float32)

    if audio_array.ndim > 1:
        # Assumes Gradio's (samples, channels) layout for multi-channel
        # recordings; averaging the channels yields the mono signal that the
        # resampler and the processor below expect.
        audio_array = audio_array.mean(axis=1)

    # Only rescale when the signal exceeds [-1, 1]; quieter audio is left as is.
    peak = np.abs(audio_array).max()
    if peak > 1.0:
        audio_array = audio_array / peak

    return audio_array


def transcribe_audio(audio):
    """Transcribe Lao speech to text with the loaded Whisper model.

    Args:
        audio: Either a path to an audio file (``str``) or a
            ``(sample_rate, audio_array)`` tuple as produced by Gradio.
            ``None`` means nothing was recorded or uploaded.

    Returns:
        The decoded transcription. When there is no input, the input holds
        no samples, or processing fails, a human-readable message is
        returned instead; exceptions are logged and never propagated.
    """
    if audio is None:
        return "Please upload or record audio."

    try:
        if isinstance(audio, str):
            # librosa resamples to the target rate and downmixes on load.
            audio_array, sample_rate = librosa.load(
                audio, sr=_TARGET_SAMPLE_RATE, mono=True
            )
        else:
            sample_rate, audio_array = audio

        audio_array = np.asarray(audio_array)
        if audio_array.size == 0:
            # Taking a max over zero samples would raise; report it plainly.
            return "No audio detected. Please record or upload again."

        audio_array = _to_mono_float32(audio_array)

        if sample_rate != _TARGET_SAMPLE_RATE:
            audio_array = librosa.resample(
                audio_array,
                orig_sr=sample_rate,
                target_sr=_TARGET_SAMPLE_RATE,
            )

        # NOTE(review): clips longer than Whisper's input window are
        # presumably truncated by the processor - confirm if long recordings
        # must be supported.
        input_features = processor(
            audio_array,
            sampling_rate=_TARGET_SAMPLE_RATE,
            return_tensors="pt",
        ).input_features.to(device)

        # Inference only: skip gradient bookkeeping.
        with torch.no_grad():
            predicted_ids = model.generate(input_features)

        return processor.batch_decode(
            predicted_ids,
            skip_special_tokens=True,
        )[0]

    except Exception as e:
        # UI boundary: keep the traceback in the server log and show the
        # user a short message instead of crashing the request.
        _logger.exception("Transcription failed")
        return f"Error processing audio: {e}"
|
|
|
|
|
|
|
|
# Markdown rendered above the input/output widgets.
_DESCRIPTION = """
### Automatic Speech Recognition for Lao Language

This model transcribes Lao (ພາສາລາວ) speech to text using a fine-tuned Whisper Large model.

**How to use:**
1. Click the microphone icon to record audio, or upload an audio file
2. Wait for the transcription to appear

**Supported formats:** WAV, MP3, OGG, FLAC (16kHz recommended)
"""

# Markdown rendered below the interface.
_ARTICLE = """
### About this model

- **Model:** Fine-tuned Whisper Large for Lao
- **Dataset:** 7k+ Lao speech samples
- **Repository:** [Phonepadith/whisper-3-large-lao-finetuned-v1](https://huggingface.co/Phonepadith/whisper-3-large-lao-finetuned-v1)

---

Created by [@Phonepadith](https://huggingface.co/Phonepadith) | 📧 [email protected]
"""

# Input widget: microphone or file upload, handed to the callback as a path.
_audio_input = gr.Audio(
    sources=["microphone", "upload"],
    type="filepath",
    label="Record or Upload Lao Audio",
)

# Output widget: multi-line box for the transcription text.
_transcript_box = gr.Textbox(
    label="Transcription (ພາສາລາວ)",
    placeholder="Your transcription will appear here...",
    lines=5,
)

# Single-function app wiring the audio input to transcribe_audio.
demo = gr.Interface(
    fn=transcribe_audio,
    inputs=_audio_input,
    outputs=_transcript_box,
    title="🗣️ Whisper Large Lao ASR",
    description=_DESCRIPTION,
    article=_ARTICLE,
    examples=[],
    cache_examples=False,
    theme=gr.themes.Soft(),
)


# Start the web server only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()