Phonepadith committed on
Commit
489c533
·
verified ·
1 Parent(s): 15348a7

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +110 -0
app.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py - Gradio interface for Whisper Lao ASR
2
+ import gradio as gr
3
+ import torch
4
+ from transformers import WhisperProcessor, WhisperForConditionalGeneration
5
+ import numpy as np
6
+
7
# Checkpoint on the Hugging Face Hub that both the processor and model load from.
model_id = "Phonepadith/whisper-3-large-lao-finetuned-v1"

# Pick the inference device once at import time: CUDA when available, else CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the processor and the model, placing the model on the chosen device.
processor = WhisperProcessor.from_pretrained(model_id)
model = WhisperForConditionalGeneration.from_pretrained(model_id).to(device)
15
+
16
def _to_mono_float32(audio_array):
    """Return *audio_array* as a mono float32 waveform.

    Integer PCM input is scaled into [-1.0, 1.0] using the full-scale value
    of its original dtype; floating-point input is passed through unscaled.
    """
    samples = np.asarray(audio_array)
    # Remember the dtype BEFORE casting: np.iinfo only accepts integer dtypes,
    # so querying it after the float32 conversion raises ValueError.
    source_dtype = samples.dtype
    samples = samples.astype(np.float32)
    if np.issubdtype(source_dtype, np.integer):
        samples = samples / np.iinfo(source_dtype).max
    # Multi-channel audio arrives as (samples, channels); average to mono so
    # the channels are not mistaken for separate items downstream.
    if samples.ndim > 1:
        samples = samples.mean(axis=1)
    return samples


def transcribe_audio(audio):
    """
    Transcribe audio to Lao text.

    Args:
        audio: tuple (sample_rate, audio_array) from Gradio, or None when
            nothing was recorded or uploaded.

    Returns:
        The transcription as a string, or a prompt asking for audio when
        the input is missing or contains no samples.
    """
    if audio is None:
        return "Please upload or record audio."

    # Get sample rate and audio array
    sample_rate, audio_array = audio

    # Convert to a normalized mono float32 waveform
    audio_array = _to_mono_float32(audio_array)
    if audio_array.size == 0:
        # Nothing to transcribe; treat like a missing recording.
        return "Please upload or record audio."

    # Resample to 16kHz if needed (the processor is called with 16 kHz below)
    if sample_rate != 16000:
        import librosa
        audio_array = librosa.resample(
            audio_array,
            orig_sr=sample_rate,
            target_sr=16000,
        )

    # Turn the waveform into model input features on the inference device
    input_features = processor(
        audio_array,
        sampling_rate=16000,
        return_tensors="pt",
    ).input_features.to(device)

    # Generate token ids; gradients are not needed for inference
    with torch.no_grad():
        predicted_ids = model.generate(input_features)

    # Decode the single item in the batch back to text
    transcription = processor.batch_decode(
        predicted_ids,
        skip_special_tokens=True,
    )[0]

    return transcription
62
+
63
# Markdown shown beneath the title of the demo page.
_DESCRIPTION = """
### Automatic Speech Recognition for Lao Language

This model transcribes Lao (ພາສາລາວ) speech to text using a fine-tuned Whisper Large model.

**How to use:**
1. Click the microphone icon to record audio, or upload an audio file
2. Wait for the transcription to appear

**Supported formats:** WAV, MP3, OGG, FLAC (16kHz recommended)
"""

# Markdown shown at the bottom of the demo page.
_ARTICLE = """
### About this model

- **Model:** Fine-tuned Whisper Large for Lao
- **Dataset:** 7k+ Lao speech samples
- **Repository:** [Phonepadith/whisper-3-large-lao-finetuned-v1](https://huggingface.co/Phonepadith/whisper-3-large-lao-finetuned-v1)

---

Created by [@Phonepadith](https://huggingface.co/Phonepadith) | 📧 [email protected]
"""

# Input widget: microphone or file upload, handed to the callback as
# a (sample_rate, numpy array) tuple.
_audio_input = gr.Audio(
    sources=["microphone", "upload"],
    type="numpy",
    label="Record or Upload Lao Audio",
)

# Output widget: multi-line text box for the transcription.
_text_output = gr.Textbox(
    label="Transcription (ພາສາລາວ)",
    placeholder="Your transcription will appear here...",
    lines=5,
)

# Assemble the interface around transcribe_audio.
# Example audio files can be listed in `examples`, e.g. ["example1.wav"].
demo = gr.Interface(
    fn=transcribe_audio,
    inputs=_audio_input,
    outputs=_text_output,
    title="🗣️ Whisper Large Lao ASR",
    description=_DESCRIPTION,
    article=_ARTICLE,
    examples=[],
    cache_examples=False,
    theme=gr.themes.Soft(),
)

# Start the web server only when executed as a script.
if __name__ == "__main__":
    demo.launch()