Spaces:

Phonepadith
/

whisper-3-large-lao-fine-tuned

Sleeping

App Files Community

Phonepadith committed on Oct 14

Commit

60a3a10

verified ·

1 Parent(s): 020ac09

Update app.py

Browse files

Files changed (1) hide show

app.py +54 -43

app.py CHANGED Viewed

@@ -3,6 +3,7 @@ import gradio as gr
 import torch
 from transformers import WhisperProcessor, WhisperForConditionalGeneration
 import numpy as np
 # Load model and processor
 model_id = "Phonepadith/whisper-3-large-lao-finetuned-v1"
@@ -13,67 +14,77 @@ model = WhisperForConditionalGeneration.from_pretrained(model_id)
 device = "cuda" if torch.cuda.is_available() else "cpu"
 model.to(device)
 def transcribe_audio(audio):
     """
     Transcribe audio to Lao text
     Args:
-        audio: tuple (sample_rate, audio_array) from Gradio
     Returns:
         transcription: Lao text
     """
     if audio is None:
         return "Please upload or record audio."
-    # Get sample rate and audio array
-    sample_rate, audio_array = audio
-    # Convert to float32 and normalize
-    if audio_array.dtype != np.float32:
-        # If integer type, normalize to [-1, 1]
-        if np.issubdtype(audio_array.dtype, np.integer):
-            max_val = np.iinfo(audio_array.dtype).max
-            audio_array = audio_array.astype(np.float32) / max_val
         else:
-            audio_array = audio_array.astype(np.float32)
-    # Ensure audio is in [-1, 1] range
-    if np.abs(audio_array).max() > 1.0:
-        audio_array = audio_array / np.abs(audio_array).max()
-    # Resample to 16kHz if needed
-    if sample_rate != 16000:
-        import librosa
-        audio_array = librosa.resample(
-            audio_array,
-            orig_sr=sample_rate,
-            target_sr=16000
-        )
-    # Process audio
-    input_features = processor(
-        audio_array,
-        sampling_rate=16000,
-        return_tensors="pt"
-    ).input_features.to(device)
-    # Generate transcription
-    with torch.no_grad():
-        predicted_ids = model.generate(input_features)
-    # Decode transcription
-    transcription = processor.batch_decode(
-        predicted_ids,
-        skip_special_tokens=True
-    )[0]
-    return transcription
 # Create Gradio interface
 demo = gr.Interface(
     fn=transcribe_audio,
     inputs=gr.Audio(
         sources=["microphone", "upload"],
-        type="numpy",
         label="Record or Upload Lao Audio"
     ),
     outputs=gr.Textbox(

 import torch
 from transformers import WhisperProcessor, WhisperForConditionalGeneration
 import numpy as np
+import librosa
 # Load model and processor
 model_id = "Phonepadith/whisper-3-large-lao-finetuned-v1"
 device = "cuda" if torch.cuda.is_available() else "cpu"
 model.to(device)
+print(f"Model loaded on: {device}")
 def transcribe_audio(audio):
     """
     Transcribe audio to Lao text
     Args:
+        audio: Audio file path (string) or tuple (sample_rate, audio_array) from Gradio
     Returns:
         transcription: Lao text
     """
     if audio is None:
         return "Please upload or record audio."
+    try:
+        # Handle both file paths and numpy arrays
+        if isinstance(audio, str):
+            # Audio is a file path - use librosa to load it
+            audio_array, sample_rate = librosa.load(audio, sr=16000, mono=True)
         else:
+            # Audio is a tuple (sample_rate, audio_array)
+            sample_rate, audio_array = audio
+            # Convert to float32 and normalize
+            if audio_array.dtype != np.float32:
+                # If integer type, normalize to [-1, 1]
+                if np.issubdtype(audio_array.dtype, np.integer):
+                    max_val = np.iinfo(audio_array.dtype).max
+                    audio_array = audio_array.astype(np.float32) / max_val
+                else:
+                    audio_array = audio_array.astype(np.float32)
+            # Ensure audio is in [-1, 1] range
+            if np.abs(audio_array).max() > 1.0:
+                audio_array = audio_array / np.abs(audio_array).max()
+            # Resample to 16kHz if needed
+            if sample_rate != 16000:
+                audio_array = librosa.resample(
+                    audio_array,
+                    orig_sr=sample_rate,
+                    target_sr=16000
+                )
+        # Process audio
+        input_features = processor(
+            audio_array,
+            sampling_rate=16000,
+            return_tensors="pt"
+        ).input_features.to(device)
+        # Generate transcription
+        with torch.no_grad():
+            predicted_ids = model.generate(input_features)
+        # Decode transcription
+        transcription = processor.batch_decode(
+            predicted_ids,
+            skip_special_tokens=True
+        )[0]
+        return transcription
+    except Exception as e:
+        return f"Error processing audio: {str(e)}"
 # Create Gradio interface
 demo = gr.Interface(
     fn=transcribe_audio,
     inputs=gr.Audio(
         sources=["microphone", "upload"],
+        type="filepath",  # Changed to filepath to handle various formats
         label="Record or Upload Lao Audio"
     ),
     outputs=gr.Textbox(