import numpy as np
import torch

import ModelInterfaces


class NeuralASR(ModelInterfaces.IASRModel):
    word_locations_in_samples = None
    audio_transcript = None

    def __init__(self, model: torch.nn.Module, decoder) -> None:
        """
        Initialize the NeuralASR (Automatic Speech Recognition) model.

        Args:
            model (torch.nn.Module): The neural network model for ASR.
            decoder: The decoder to convert CTC outputs to transcripts.
        """
        super().__init__()
        self.model = model
        self.decoder = decoder  # Decoder from CTC outputs to transcripts

    def getTranscript(self) -> str:
        """
        Get the transcript of the processed audio.

        Returns:
            str: The audio transcript.

        Raises:
            AssertionError: If the audio has not been processed.
        """
        assert self.audio_transcript is not None, 'Cannot get audio transcript without having processed the audio'
        return self.audio_transcript

    def getWordLocations(self) -> list:
        """
        Get the word locations from the processed audio.

        Returns:
            list: A list of word locations in samples.

        Raises:
            AssertionError: If the audio has not been processed.
        """
        assert self.word_locations_in_samples is not None, 'Cannot get word locations without having processed the audio'
        return self.word_locations_in_samples

    def processAudio(self, audio: torch.Tensor) -> None:
        """
        Process the audio to generate a transcript and word locations.

        Args:
            audio (torch.Tensor): The input audio tensor of shape (1, num_samples).
        """
        audio_length_in_samples = audio.shape[1]
        with torch.inference_mode():
            nn_output = self.model(audio)
            # Decode the first (only) batch item, aligning words to sample positions
            self.audio_transcript, self.word_locations_in_samples = self.decoder(
                nn_output[0, :, :].detach(), audio_length_in_samples, word_align=True)
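
# --- Usage sketch for NeuralASR (illustrative, not part of the module) ---
# A minimal sketch, assuming a CTC acoustic model and a matching decoder are
# already available; `load_asr_model` and `load_ctc_decoder` are hypothetical
# placeholder names, not functions defined in this codebase.
#
#   model, decoder = load_asr_model(), load_ctc_decoder()
#   asr = NeuralASR(model, decoder)
#   audio = torch.randn(1, 16000)   # placeholder waveform, shape (1, num_samples)
#   asr.processAudio(audio)         # runs inference and CTC decoding
#   print(asr.getTranscript())      # transcript string
#   print(asr.getWordLocations())   # per-word positions in samples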


class NeuralTTS(ModelInterfaces.ITextToSpeechModel):
    def __init__(self, model: torch.nn.Module, sampling_rate: int) -> None:
        """
        Initialize the NeuralTTS (Text-to-Speech) model.

        Args:
            model (torch.nn.Module): The neural network model for TTS.
            sampling_rate (int): The sampling rate for the audio.
        """
        super().__init__()
        self.model = model
        self.sampling_rate = sampling_rate

    def getAudioFromSentence(self, sentence: str) -> np.ndarray:
        """
        Generate audio from a given sentence.

        Args:
            sentence (str): The input sentence.

        Returns:
            np.ndarray: The generated audio as a numpy array.
        """
        with torch.inference_mode():
            audio = self.model.apply_tts(texts=[sentence],
                                         sample_rate=self.sampling_rate)[0]
        return audio
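
# --- Usage sketch for NeuralTTS (illustrative, not part of the module) ---
# A minimal sketch, assuming a Silero-style TTS model that exposes `apply_tts`;
# the exact loading code depends on the model hub and version, so
# `load_tts_model` is a hypothetical placeholder name.
#
#   model = load_tts_model()
#   tts = NeuralTTS(model, sampling_rate=16000)
#   waveform = tts.getAudioFromSentence('Hello world')  # 1-D array of samples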


class NeuralTranslator(ModelInterfaces.ITranslationModel):
    def __init__(self, model: torch.nn.Module, tokenizer) -> None:
        """
        Initialize the NeuralTranslator model.

        Args:
            model (torch.nn.Module): The neural network model for translation.
            tokenizer: The tokenizer for text processing.
        """
        super().__init__()
        self.model = model
        self.tokenizer = tokenizer

    def translateSentence(self, sentence: str) -> str:
        """
        Translate a given sentence to the target language.

        Args:
            sentence (str): The input sentence.

        Returns:
            str: The translated sentence.
        """
        tokenized_text = self.tokenizer(sentence, return_tensors='pt')
        translation = self.model.generate(**tokenized_text)
        translated_text = self.tokenizer.batch_decode(
            translation, skip_special_tokens=True)[0]
        return translated_text
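
# --- Usage sketch for NeuralTranslator (illustrative, not part of the module) ---
# A minimal sketch, assuming a Hugging Face seq2seq checkpoint loaded via the
# `transformers` Auto classes; 'Helsinki-NLP/opus-mt-de-en' is an example
# checkpoint choice, not one mandated by this module.
#
#   from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
#   name = 'Helsinki-NLP/opus-mt-de-en'
#   tokenizer = AutoTokenizer.from_pretrained(name)
#   model = AutoModelForSeq2SeqLM.from_pretrained(name)
#   translator = NeuralTranslator(model, tokenizer)
#   print(translator.translateSentence('Guten Morgen'))  # e.g. 'Good morning'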