Spaces:

Phonepadith
/

whisper-3-large-lao-fine-tuned

Sleeping

App Files Files Community

whisper-3-large-lao-fine-tuned / app.py

Phonepadith

Update app.py

60a3a10 verified 2 months ago

raw

history blame contribute delete

4.23 kB

	# app.py - Gradio interface for Whisper Lao ASR
	import gradio as gr
	import torch
	from transformers import WhisperProcessor, WhisperForConditionalGeneration
	import numpy as np
	import librosa

	# Hub checkpoint that supplies both the processor and the model weights
	model_id = "Phonepadith/whisper-3-large-lao-finetuned-v1"

	# Pick the inference device up front: CUDA when PyTorch reports one, else CPU
	if torch.cuda.is_available():
	    device = "cuda"
	else:
	    device = "cpu"

	# Load the processor and the model from the same checkpoint; the model is
	# moved onto the selected device as part of the same expression
	processor = WhisperProcessor.from_pretrained(model_id)
	model = WhisperForConditionalGeneration.from_pretrained(model_id).to(device)

	print(f"Model loaded on: {device}")

	def transcribe_audio(audio):
	    """Transcribe an audio clip to Lao text.

	    Args:
	        audio: Either a path to an audio file (``str``) or a
	            ``(sample_rate, audio_array)`` tuple as produced by Gradio.
	            ``None`` means no audio was provided.

	    Returns:
	        The decoded transcription string. When no audio (or an empty
	        clip) is supplied, a prompt asking for audio is returned instead;
	        when processing fails, an ``"Error processing audio: ..."``
	        message is returned rather than raising.
	    """
	    if audio is None:
	        return "Please upload or record audio."

	    try:
	        if isinstance(audio, str):
	            # File path: librosa decodes, downmixes to mono and resamples
	            # to 16 kHz in a single call.
	            audio_array, sample_rate = librosa.load(audio, sr=16000, mono=True)
	        else:
	            # Tuple input: (sample_rate, audio_array)
	            sample_rate, audio_array = audio
	            audio_array = np.asarray(audio_array)

	        # Convert to float32; integer PCM is scaled into [-1, 1] using the
	        # dtype's maximum value.
	        if audio_array.dtype != np.float32:
	            if np.issubdtype(audio_array.dtype, np.integer):
	                max_val = np.iinfo(audio_array.dtype).max
	                audio_array = audio_array.astype(np.float32) / max_val
	            else:
	                audio_array = audio_array.astype(np.float32)

	        # Multi-channel arrays are laid out as (samples, channels) per the
	        # Gradio Audio docs. Downmix to mono so the resampler (which works
	        # on the last axis) and the processor both see a 1-D waveform.
	        if audio_array.ndim > 1:
	            audio_array = audio_array.mean(axis=1)

	        # An empty clip has nothing to transcribe, and the peak computation
	        # below would raise on a zero-size array.
	        if audio_array.size == 0:
	            return "Please upload or record audio."

	        # Ensure audio is in [-1, 1] range
	        peak = np.abs(audio_array).max()
	        if peak > 1.0:
	            audio_array = audio_array / peak

	        # Resample to 16 kHz, the rate passed to the processor below
	        if sample_rate != 16000:
	            audio_array = librosa.resample(
	                audio_array,
	                orig_sr=sample_rate,
	                target_sr=16000
	            )

	        # Turn the waveform into model input features on the model's device
	        input_features = processor(
	            audio_array,
	            sampling_rate=16000,
	            return_tensors="pt"
	        ).input_features.to(device)

	        # Inference only: no gradients needed
	        with torch.no_grad():
	            predicted_ids = model.generate(input_features)

	        # Decode the single-item batch, dropping special tokens
	        transcription = processor.batch_decode(
	            predicted_ids,
	            skip_special_tokens=True
	        )[0]

	        return transcription

	    except Exception as e:
	        # UI boundary: report the failure as text instead of raising
	        return f"Error processing audio: {str(e)}"

	# Create Gradio interface: one audio input wired to transcribe_audio, one
	# text output for the transcription
	demo = gr.Interface(
	    fn=transcribe_audio,
	    inputs=gr.Audio(
	        sources=["microphone", "upload"],
	        type="filepath", # Changed to filepath to handle various formats
	        label="Record or Upload Lao Audio"
	    ),
	    outputs=gr.Textbox(
	        label="Transcription (ພາສາລາວ)",
	        placeholder="Your transcription will appear here...",
	        lines=5
	    ),
	    title="🗣️ Whisper Large Lao ASR",
	    # Markdown passed as the interface description
	    description="""
	### Automatic Speech Recognition for Lao Language

	This model transcribes Lao (ພາສາລາວ) speech to text using a fine-tuned Whisper Large model.

	How to use:
	1. Click the microphone icon to record audio, or upload an audio file
	2. Wait for the transcription to appear

	Supported formats: WAV, MP3, OGG, FLAC (16kHz recommended)
	""",
	    # Markdown passed as the interface article. The backslash before the
	    # pipe is doubled so the runtime text is unchanged while the source no
	    # longer contains an invalid escape sequence.
	    article="""
	### About this model

	- Model: Fine-tuned Whisper Large for Lao
	- Dataset: 7k+ Lao speech samples
	- Repository: [Phonepadith/whisper-3-large-lao-finetuned-v1](https://huggingface.co/Phonepadith/whisper-3-large-lao-finetuned-v1)

	---

	Created by [@Phonepadith](https://huggingface.co/Phonepadith) \\| 📧 [email protected]
	""",
	    examples=[
	        # Add example audio files here if you have them
	        # ["example1.wav"],
	        # ["example2.wav"],
	    ],
	    cache_examples=False,
	    theme=gr.themes.Soft()
	)

	# Launch the app only when run as a script, not when imported
	if __name__ == "__main__":
	    demo.launch()