vitorcalvi committed
Commit 74ee391 · 1 Parent(s): c8fe47c

Add application file

Files changed (1)
  1. app.py +68 -0
app.py ADDED
@@ -0,0 +1,68 @@
+ import gradio as gr
+ from transformers import pipeline
+ import librosa
+ import numpy as np
+ import matplotlib
+ matplotlib.use("Agg")  # headless backend for server environments
+ import matplotlib.pyplot as plt
+ import torch
+ import spaces
+
+ # Check for GPU availability
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ # Load Whisper model using the transformers ASR pipeline
+ transcriber = pipeline(
+     "automatic-speech-recognition",
+     model="openai/whisper-base.en",
+     device=0 if device == "cuda" else -1,
+ )
+
+ @spaces.GPU
+ def analyze_audio(audio):
+     # Convert audio to text using Whisper
+     transcription_result = transcriber(audio)
+     transcription = transcription_result["text"]
+
+     # Load the audio file at its native sampling rate
+     y, sr = librosa.load(audio, sr=None)
+
+     # Extract prosodic features: fundamental frequency (YIN) and tempo.
+     # sr must be passed to yin because the audio was loaded at its native rate.
+     pitch = librosa.yin(y, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'), sr=sr)
+     tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
+     tempo = float(np.atleast_1d(tempo)[0])  # beat_track may return a 1-element array
+
+     # Pitch variance as a simple measure of intonation range
+     pitch_variance = float(np.var(pitch))
+
+     # Estimate speaking pace; word count serves as a rough proxy for syllables
+     num_words = len(transcription.split())
+     duration = librosa.get_duration(y=y, sr=sr)
+     pace = num_words / duration if duration > 0 else 0.0
+
+     # Plot the pitch contour
+     plt.figure(figsize=(10, 4))
+     plt.plot(pitch, label='Pitch')
+     plt.xlabel('Frame')
+     plt.ylabel('Frequency (Hz)')
+     plt.title('Pitch Over Time')
+     plt.legend()
+     pitch_plot_path = '/tmp/pitch_contour.png'
+     plt.savefig(pitch_plot_path)
+     plt.close()
+
+     # Voice stress analysis (simplified): pitch standard deviation as a stress proxy
+     stress_level = float(np.std(pitch))
+
+     return transcription, tempo, pace, pitch_variance, stress_level, pitch_plot_path
+
+ # Create Gradio interface
+ input_audio = gr.Audio(label="Input Audio", type="filepath")
+
+ iface = gr.Interface(
+     fn=analyze_audio,
+     inputs=input_audio,
+     outputs=[
+         gr.Textbox(label="Transcription"),
+         gr.Number(label="Tempo (BPM)"),
+         gr.Number(label="Speaking Pace (words/sec)"),
+         gr.Number(label="Pitch Variance"),
+         gr.Number(label="Stress Level (pitch std, Hz)"),
+         gr.Image(label="Pitch Contour Plot"),
+     ],
+     live=True,
+ )
+
+ if __name__ == "__main__":
+     iface.launch(share=False)
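
Note that the commit adds only app.py; the Space presumably declares gradio, transformers, torch, librosa, matplotlib, and spaces as dependencies elsewhere (e.g., a requirements.txt not shown here). For a quick check outside the Space, the handler can be called directly. This is a minimal sketch, not part of the commit: "sample.wav" is a hypothetical local file, and it assumes the @spaces.GPU decorator is a no-op when running off-Space.

# local_smoke_test.py -- sketch only; "sample.wav" is a hypothetical file
from app import analyze_audio

text, tempo, pace, pitch_var, stress, plot_path = analyze_audio("sample.wav")
print(f"Transcription: {text}")
print(f"Tempo: {tempo:.1f} BPM | Pace: {pace:.2f} words/sec")
print(f"Pitch variance: {pitch_var:.1f} | Stress (pitch std): {stress:.1f} Hz")
print(f"Pitch contour saved to {plot_path}")

Importing app builds the pipeline and the interface but, thanks to the __main__ guard, does not start the Gradio server.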