import gradio as gr
from transformers.models.speech_to_speech.modeling_speech_to_speech import SpeechToSpeechModelWithCodeHiFiGAN
from transformers import Wav2Vec2FeatureExtractor
import torch
import librosa
import soundfile
description = """(Teaser) Demo for the 🤗 Transformers Speech-to-Speech Translation implementation. Integrated from the original work [Enhanced Direct Speech-to-Speech Translation](https://github.com/facebookresearch/fairseq/blob/main/examples/speech_to_speech/docs/enhanced_direct_s2st_discrete_units.md) by Popuri et al. from Meta AI.
The number of beams controls the behaviour of auto-regressive decoding. Set it to 1 for greedy decoding (fastest), or increase it to 5 or 10 for beam search, which decodes more slowly but yields more accurate translations."""
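
# Wav2Vec2-style feature extractors expect 16 kHz mono audio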
SAMPLE_RATE = 16000
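
# The feature extractor and model are loaded from local checkpoint directories.
# Assumption (not confirmed by the source): "./s2ut_model" holds the
# speech-to-unit translation (S2UT) checkpoint and "./vocoder" a unit-based
# (code) HiFi-GAN that converts the predicted discrete units back to a waveform.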
processor = Wav2Vec2FeatureExtractor.from_pretrained("./s2ut_model")
model = SpeechToSpeechModelWithCodeHiFiGAN.from_s2ut_vocoder_pretrained("./s2ut_model", "./vocoder")
model.eval()

def process_audio_file(file):
    # Load at the file's native sampling rate, then resample to 16 kHz if needed
    data, sr = librosa.load(file, sr=None)
    if sr != SAMPLE_RATE:
        data = librosa.resample(data, orig_sr=sr, target_sr=SAMPLE_RATE)
    # collapse to a single (mono) channel
    data = librosa.to_mono(data)
    return data

def translate(Microphone, File_Upload, Num_Beams):
    warn_output = ""
    if (Microphone is not None) and (File_Upload is not None):
        warn_output = (
            "WARNING: You've uploaded an audio file and used the microphone. "
            "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
        )
        print(warn_output)
        file = Microphone
    elif (Microphone is None) and (File_Upload is None):
        return "ERROR: You have to either use the microphone or upload an audio file"
    elif Microphone is not None:
        file = Microphone
    else:
        file = File_Upload

    audio_data = process_audio_file(file)
    input_values = processor(audio_data, return_tensors="pt", return_attention_mask=False, sampling_rate=SAMPLE_RATE).input_values

    # Auto-regressively generate discrete units and vocode them into a waveform
    with torch.no_grad():
        vocoder_out = model.generate(inputs=input_values, max_length=500, num_beams=Num_Beams, pad_token_id=1, eos_token_id=2)

    soundfile.write("./out.wav", vocoder_out[0, 0, :].cpu().detach().numpy(), SAMPLE_RATE)
    return "./out.wav"

iface = gr.Interface(
    fn=translate,
    inputs=[
        gr.inputs.Audio(source="microphone", type="filepath", optional=True),
        gr.inputs.Audio(source="upload", type="filepath", optional=True),
        gr.inputs.Radio([1, 5, 10], label="Num beams", default=1),
    ],
    outputs="audio",
    layout="horizontal",
    theme="huggingface",
    title="Enhanced Direct S2ST: English to Spanish",
    description=description,
    allow_flagging="never",
)
iface.launch(enable_queue=True)