Spaces:
Runtime error
Runtime error
| import torch | |
| import torch.nn.functional as F | |
| from transformers import WhisperForConditionalGeneration, WhisperProcessor | |
| from transformers.models.whisper.tokenization_whisper import LANGUAGES | |
| from transformers.pipelines.audio_utils import ffmpeg_read | |
| import gradio as gr | |
| model_id = "openai/whisper-large-v2" | |
| device = "cuda" if torch.cuda.is_available() else "cpu" | |
| processor = WhisperProcessor.from_pretrained(model_id) | |
| model = WhisperForConditionalGeneration.from_pretrained(model_id) | |
| model.eval() | |
| model.to(device) | |
| sampling_rate = processor.feature_extractor.sampling_rate | |
| bos_token_id = processor.tokenizer.all_special_ids[-106] | |
| decoder_input_ids = torch.tensor([bos_token_id]).to(device) | |
| def process_audio_file(file): | |
| with open(file, "rb") as f: | |
| inputs = f.read() | |
| audio = ffmpeg_read(inputs, sampling_rate) | |
| return audio | |
| def transcribe(Microphone, File_Upload): | |
| warn_output = "" | |
| if (Microphone is not None) and (File_Upload is not None): | |
| warn_output = "WARNING: You've uploaded an audio file and used the microphone. " \ | |
| "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n" | |
| file = Microphone | |
| elif (Microphone is None) and (File_Upload is None): | |
| return "ERROR: You have to either use the microphone or upload an audio file" | |
| elif Microphone is not None: | |
| file = Microphone | |
| else: | |
| file = File_Upload | |
| audio_data = process_audio_file(file) | |
| input_features = processor(audio_data, return_tensors="pt").input_features | |
| with torch.no_grad(): | |
| logits = model.forward(input_features.to(device), decoder_input_ids=decoder_input_ids).logits | |
| pred_ids = torch.argmax(logits, dim=-1) | |
| probability = F.softmax(logits, dim=-1).max() | |
| lang_ids = processor.decode(pred_ids[0]) | |
| lang_ids = lang_ids.lstrip("<|").rstrip("|>") | |
| language = LANGUAGES.get(lang_ids, "not detected") | |
| return language.capitalize(), probability.cpu().numpy() | |
| iface = gr.Interface( | |
| fn=transcribe, | |
| inputs=[ | |
| gr.inputs.Audio(source="microphone", type='filepath', optional=True), | |
| gr.inputs.Audio(source="upload", type='filepath', optional=True), | |
| ], | |
| outputs=[ | |
| gr.outputs.Textbox(label="Language"), | |
| gr.Number(label="Probability"), | |
| ], | |
| layout="horizontal", | |
| theme="huggingface", | |
| title="Whisper Language Identification", | |
| description="Demo for Language Identification using OpenAI's [Whisper Large V2](https://huggingface.co/openai/whisper-large-v2).", | |
| allow_flagging='never', | |
| ) | |
| iface.launch(enable_queue=True) |