# app.py - Gradio interface for Whisper Lao ASR
"""Gradio web app that transcribes Lao speech with a fine-tuned Whisper model.

Importing this module loads the processor and model named by ``model_id``,
moves the model to the GPU when one is available, and builds the ``demo``
interface. Running the file as a script launches the app.
"""
from typing import Optional, Tuple, Union

import gradio as gr
import librosa
import numpy as np
import torch
from transformers import WhisperForConditionalGeneration, WhisperProcessor

# Sample rate (Hz) that every waveform is converted to before it is handed
# to the Whisper processor.
TARGET_SAMPLE_RATE = 16000

# Load model and processor
model_id = "Phonepadith/whisper-3-large-lao-finetuned-v1"
processor = WhisperProcessor.from_pretrained(model_id)
model = WhisperForConditionalGeneration.from_pretrained(model_id)

# Move to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
print(f"Model loaded on: {device}")

DESCRIPTION = """
### Automatic Speech Recognition for Lao Language

This model transcribes Lao (ພາສາລາວ) speech to text using a fine-tuned Whisper Large model.

**How to use:**
1. Click the microphone icon to record audio, or upload an audio file
2. Wait for the transcription to appear

**Supported formats:** WAV, MP3, OGG, FLAC (16kHz recommended)
"""

ARTICLE = """
### About this model

- **Model:** Fine-tuned Whisper Large for Lao
- **Dataset:** 7k+ Lao speech samples
- **Repository:** [Phonepadith/whisper-3-large-lao-finetuned-v1](https://huggingface.co/Phonepadith/whisper-3-large-lao-finetuned-v1)

---

Created by [@Phonepadith](https://huggingface.co/Phonepadith) | 📧 phonepadithpp@gmail.com
"""


def _load_audio(audio: Union[str, Tuple[int, np.ndarray]]) -> np.ndarray:
    """Return *audio* as a mono float32 waveform at ``TARGET_SAMPLE_RATE``.

    Args:
        audio: Either a path to an audio file, or a
            ``(sample_rate, samples)`` tuple. Multi-channel sample arrays
            are assumed to be laid out as ``(samples, channels)``.

    Returns:
        A one-dimensional float32 array. It may be empty when the input
        contains no samples.
    """
    if isinstance(audio, str):
        # librosa decodes, downmixes and resamples in a single call.
        waveform, _ = librosa.load(audio, sr=TARGET_SAMPLE_RATE, mono=True)
        return waveform

    sample_rate, waveform = audio
    waveform = np.asarray(waveform)

    if np.issubdtype(waveform.dtype, np.integer):
        # Integer PCM: scale by the dtype's maximum so samples land near
        # [-1, 1]; any residual overshoot is handled by the peak check below.
        scale = np.iinfo(waveform.dtype).max
        waveform = waveform.astype(np.float32) / scale
    elif waveform.dtype != np.float32:
        waveform = waveform.astype(np.float32)

    if waveform.ndim > 1:
        # Downmix to mono. Without this, resampling (which works along the
        # last axis) would run across channels instead of across time.
        waveform = waveform.mean(axis=-1)

    if waveform.size == 0:
        # Nothing to normalise or resample; let the caller report it.
        return waveform

    # Ensure audio is in [-1, 1] range
    peak = np.abs(waveform).max()
    if peak > 1.0:
        waveform = waveform / peak

    if sample_rate != TARGET_SAMPLE_RATE:
        waveform = librosa.resample(
            waveform,
            orig_sr=sample_rate,
            target_sr=TARGET_SAMPLE_RATE,
        )
    return waveform


def transcribe_audio(
    audio: Optional[Union[str, Tuple[int, np.ndarray]]],
) -> str:
    """Transcribe audio to Lao text.

    Args:
        audio: Audio file path (string), a ``(sample_rate, audio_array)``
            tuple from Gradio, or ``None`` when nothing was provided.

    Returns:
        The transcription, or a human-readable message when there is no
        audio to process or processing fails.
    """
    if audio is None:
        return "Please upload or record audio."

    # This function is the UI boundary: any failure is turned into a message
    # shown in the output textbox rather than propagated to Gradio.
    try:
        waveform = _load_audio(audio)
        if waveform.size == 0:
            return "The audio is empty. Please upload or record audio."

        # Process audio
        input_features = processor(
            waveform,
            sampling_rate=TARGET_SAMPLE_RATE,
            return_tensors="pt",
        ).input_features.to(device)

        # Generate transcription; gradients are not needed for inference.
        with torch.no_grad():
            predicted_ids = model.generate(input_features)

        # Decode transcription
        return processor.batch_decode(
            predicted_ids,
            skip_special_tokens=True,
        )[0]
    except Exception as e:
        return f"Error processing audio: {e}"


# Create Gradio interface
demo = gr.Interface(
    fn=transcribe_audio,
    inputs=gr.Audio(
        sources=["microphone", "upload"],
        type="filepath",  # file paths let librosa handle the various formats
        label="Record or Upload Lao Audio",
    ),
    outputs=gr.Textbox(
        label="Transcription (ພາສາລາວ)",
        placeholder="Your transcription will appear here...",
        lines=5,
    ),
    title="🗣️ Whisper Large Lao ASR",
    description=DESCRIPTION,
    article=ARTICLE,
    examples=[
        # Add example audio files here if you have them
        # ["example1.wav"],
        # ["example2.wav"],
    ],
    cache_examples=False,
    theme=gr.themes.Soft(),
)

# Launch the app
if __name__ == "__main__":
    demo.launch()