import gradio as gr
from transformers.models.speech_to_speech.modeling_speech_to_speech import SpeechToSpeechModelWithCodeHiFiGAN
from transformers import Wav2Vec2FeatureExtractor
import torch
import librosa
import soundfile
description = """(Teaser) Demo for the 🤗 Transformers Speech-to-Speech Translation implementation. Integrated from the original work [Enhanced Direct Speech-to-Speech Translation](https://github.com/facebookresearch/fairseq/blob/main/examples/speech_to_speech/docs/enhanced_direct_s2st_discrete_units.md) by Popuri et al. from Meta AI.
The number of beams controls the behaviour of auto-regressive decoding. Set it to 1 for greedy decoding (fastest), or increase it to 5 or 10 for beam search, which decodes more slowly but yields more accurate translations."""
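
# Wav2Vec2-style feature extractors expect 16 kHz mono audio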
SAMPLE_RATE = 16000
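
# The feature extractor and model are loaded from local checkpoint directories.
# Assumption (not confirmed by the source): "./s2ut_model" holds the
# speech-to-unit translation (S2UT) checkpoint and "./vocoder" a unit-based
# (code) HiFi-GAN that converts the predicted discrete units back to a waveform.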
processor = Wav2Vec2FeatureExtractor.from_pretrained("./s2ut_model")
model = SpeechToSpeechModelWithCodeHiFiGAN.from_s2ut_vocoder_pretrained("./s2ut_model", "./vocoder")
model.eval()

def process_audio_file(file):
    # Load at the file's native sampling rate, then resample to 16 kHz if needed
    data, sr = librosa.load(file, sr=None)
    if sr != SAMPLE_RATE:
        data = librosa.resample(data, orig_sr=sr, target_sr=SAMPLE_RATE)
    # collapse to a single (mono) channel
    data = librosa.to_mono(data)
    return data

def translate(Microphone, File_Upload, Num_Beams):
    warn_output = ""
    if (Microphone is not None) and (File_Upload is not None):
        warn_output = (
            "WARNING: You've uploaded an audio file and used the microphone. "
            "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
        )
        print(warn_output)
        file = Microphone
    elif (Microphone is None) and (File_Upload is None):
        return "ERROR: You have to either use the microphone or upload an audio file"
    elif Microphone is not None:
        file = Microphone
    else:
        file = File_Upload

    audio_data = process_audio_file(file)
    input_values = processor(audio_data, return_tensors="pt", return_attention_mask=False, sampling_rate=SAMPLE_RATE).input_values

    # Auto-regressively generate discrete units and vocode them into a waveform
    with torch.no_grad():
        vocoder_out = model.generate(inputs=input_values, max_length=500, num_beams=Num_Beams, pad_token_id=1, eos_token_id=2)

    soundfile.write("./out.wav", vocoder_out[0, 0, :].cpu().detach().numpy(), SAMPLE_RATE)
    return "./out.wav"

iface = gr.Interface(
    fn=translate,
    inputs=[
        gr.inputs.Audio(source="microphone", type="filepath", optional=True),
        gr.inputs.Audio(source="upload", type="filepath", optional=True),
        gr.inputs.Radio([1, 5, 10], label="Num beams", default=1),
    ],
    outputs="audio",
    layout="horizontal",
    theme="huggingface",
    title="Enhanced Direct S2ST: English to Spanish",
    description=description,
    allow_flagging="never",
)
iface.launch(enable_queue=True)