frimelle HF Staff committed on
Commit
f329f75
·
1 Parent(s): 7ea298f

initial commit

Browse files
Files changed (3) hide show
  1. app.py +160 -0
  2. packages.txt +1 -0
  3. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import random
3
+ import re
4
+ import difflib
5
+ import torch
6
+ from functools import lru_cache
7
+ from transformers import pipeline
8
+
9
# -------- Sentences to practice (customize freely) ----------
# Pool of target sentences; gen_sentence() returns one of these at random.
SENTENCE_BANK = [
    "The quick brown fox jumps over the lazy dog.",
    "I promise to speak clearly and at a steady pace.",
    "Open source makes AI more transparent and inclusive.",
    "Hugging Face Spaces make demos easy to share.",
    "Today the weather in Berlin is pleasantly cool.",
    "Privacy and transparency should go hand in hand.",
    "Please generate a new sentence for me to read.",
    "Machine learning can amplify or reduce inequality.",
    "Responsible AI requires participation from everyone.",
    "This microphone test checks my pronunciation accuracy.",
]
22
+
23
+ # -------- Utilities ----------
24
def normalize_text(t: str) -> str:
    """Normalize text so a reference and a transcription compare word-for-word.

    The text is Unicode-composed (NFC), lower-cased, and stripped of
    everything except letters, digits, apostrophes and hyphens. Runs of any
    other characters (punctuation, underscores, whitespace) collapse into a
    single space, and leading/trailing spaces are removed.

    Args:
        t: Raw sentence or transcription.

    Returns:
        The normalized, space-separated string (possibly empty).
    """
    import unicodedata  # local import keeps this block self-contained

    # Compose combining marks first ("e" + U+0301 -> "é"); otherwise a
    # decomposed accent would be treated as punctuation and split the word.
    t = unicodedata.normalize("NFC", t).lower()
    # Treat typographic apostrophes like the ASCII one so "don’t" == "don't".
    t = t.replace("\u2019", "'").replace("\u2018", "'")
    # Keep letters and digits of any script, plus apostrophes and hyphens;
    # replace every other run of characters (including "_") with one space.
    t = re.sub(r"(?:[^\w'\-]|_)+", " ", t)
    # Collapse whitespace and trim the ends.
    return re.sub(r"\s+", " ", t).strip()
31
+
32
def similarity_and_diff(ref: str, hyp: str):
    """Return similarity ratio (0..1) and HTML diff highlighting changes.

    Both strings are split on whitespace and compared token by token with
    ``difflib.SequenceMatcher``.

    Args:
        ref: Reference (expected) text.
        hyp: Hypothesis (transcribed) text.

    Returns:
        ``(ratio, html)`` where ``ratio`` is ``SequenceMatcher.ratio()`` over
        the token lists and ``html`` is a ``<div>`` in which reference-only
        tokens are struck through on a red background and hypothesis-only
        tokens are shown on a green background.
    """
    from html import escape  # local: the name ``html`` is reused for the result

    ref_tokens = ref.split()
    hyp_tokens = hyp.split()
    sm = difflib.SequenceMatcher(a=ref_tokens, b=hyp_tokens)
    ratio = sm.ratio()

    def _plain(tokens):
        # Tokens end up in HTML element content, so escape &, < and >.
        return escape(" ".join(tokens), quote=False)

    def _removed(tokens):
        return (' <span style="background:#ffe0e0;text-decoration:line-through;">'
                + _plain(tokens) + "</span>")

    def _added(tokens):
        return ' <span style="background:#e0ffe0;">' + _plain(tokens) + "</span>"

    # Build HTML with insertions/deletions highlighted. A "replace" is shown
    # as the removed reference tokens followed by the added hypothesis tokens.
    out = []
    for op, i1, i2, j1, j2 in sm.get_opcodes():
        if op == "equal":
            out.append(" " + _plain(ref_tokens[i1:i2]))
            continue
        if op in ("delete", "replace"):
            out.append(_removed(ref_tokens[i1:i2]))
        if op in ("insert", "replace"):
            out.append(_added(hyp_tokens[j1:j2]))
    html = '<div style="line-height:1.6;font-size:1rem;">' + "".join(out).strip() + "</div>"
    return ratio, html
57
+
58
def _resolve_device(device_preference: str) -> int:
    """Map a device preference to the device index handed to ``pipeline``.

    Returns 0 when the preference is 'auto' or 'cuda' and CUDA is available,
    otherwise -1 (this includes 'cuda' requested without CUDA, and 'cpu').
    """
    if device_preference in ("auto", "cuda") and torch.cuda.is_available():
        return 0
    return -1


@lru_cache(maxsize=2)
def _load_asr(model_id: str, device: int):
    """Build (and cache) an ASR pipeline for a model on a resolved device."""
    return pipeline(
        "automatic-speech-recognition",
        model=model_id,
        device=device,
        chunk_length_s=30,
        return_timestamps=False,
    )


def get_asr(model_id: str, device_preference: str):
    """Cache an ASR pipeline. device_preference: 'auto'|'cpu'|'cuda'.

    The cache is keyed on the *resolved* device rather than on the preference
    string, so preferences that resolve to the same device share one loaded
    model instead of loading duplicates and evicting each other.

    Args:
        model_id: Model identifier passed to ``pipeline(model=...)``.
        device_preference: 'auto', 'cpu' or 'cuda'; unknown values behave
            like 'cpu'.

    Returns:
        The cached automatic-speech-recognition pipeline.
    """
    return _load_asr(model_id, _resolve_device(device_preference))


# Keep the lru_cache helpers reachable under the public name for callers
# that used get_asr.cache_info() / get_asr.cache_clear().
get_asr.cache_info = _load_asr.cache_info
get_asr.cache_clear = _load_asr.cache_clear
74
+
75
def gen_sentence():
    """Pick one practice sentence at random from ``SENTENCE_BANK``."""
    chosen = random.choice(SENTENCE_BANK)
    return chosen
77
+
78
def check_pronunciation(audio_path, target_sentence, model_id, lang, device_pref, pass_threshold):
    """Transcribe a recording and score it against the target sentence.

    Args:
        audio_path: Path of the recorded audio file, or a falsy value when
            nothing has been recorded.
        target_sentence: Sentence the user was asked to read.
        model_id: ASR model identifier forwarded to ``get_asr``.
        lang: Optional language hint; ignored for English-only ("*.en")
            checkpoints and when blank.
        device_pref: Device preference forwarded to ``get_asr``.
        pass_threshold: Minimum similarity ratio (0..1) that counts as a pass.

    Returns:
        A 4-tuple ``(transcription, score text, diff HTML, summary)``. On any
        failure the first three items are empty strings and the summary
        carries the message.
    """
    if not target_sentence:
        return "", "", "", "Please generate a sentence first."
    if not audio_path:
        # Without this guard the missing file surfaces as an opaque
        # "Transcription failed" error from the pipeline.
        return "", "", "", "Please record your voice first."

    # The transformers pipeline forwards 'generate_kwargs' to generate().
    # English-only Whisper checkpoints ("*.en") reject a language argument,
    # so only send the hint to multilingual models.
    lang = (lang or "").strip()
    send_language = bool(lang) and not model_id.endswith(".en")
    generate_kwargs = {"language": lang} if send_language else None

    try:
        # Loading the model can fail too (download, memory), so keep it
        # inside the same guard and report it in the UI instead of raising.
        asr = get_asr(model_id, device_pref)
        result = asr(audio_path, generate_kwargs=generate_kwargs)
        hyp_raw = result["text"].strip()
    except Exception as e:
        return "", "", "", f"Transcription failed: {e}"

    ref_norm = normalize_text(target_sentence)
    hyp_norm = normalize_text(hyp_raw)

    ratio, diff_html = similarity_and_diff(ref_norm, hyp_norm)
    passed = ratio >= pass_threshold

    # round() rather than int(): 0.57 * 100 is 56.99999999999999 in floating
    # point and would otherwise be displayed as 56%.
    threshold_pct = round(pass_threshold * 100)
    summary = (
        f"✅ Correct (≥ {threshold_pct}%)"
        if passed else
        f"❌ Not a match (need ≥ {threshold_pct}%)"
    )
    score = f"Similarity: {ratio*100:.1f}%"

    return hyp_raw, score, diff_html, summary
105
+
106
# -------- UI ----------
# Layout: target sentence + generate/clear buttons, microphone recorder with
# advanced ASR settings, a check button, and the result widgets.
with gr.Blocks(title="Say the Sentence") as demo:
    gr.Markdown(
        """
        # 🎤 Say the Sentence
        1) Generate a sentence.
        2) Press the mic to record yourself reading it.
        3) Transcribe & check.
        """
    )

    with gr.Row():
        target = gr.Textbox(label="Target sentence", interactive=False, placeholder="Click 'Generate sentence'")
    with gr.Row():
        btn_gen = gr.Button("🎲 Generate sentence", variant="primary")
        btn_clear = gr.Button("🧹 Clear")

    with gr.Row():
        audio = gr.Audio(sources=["microphone"], type="filepath", label="Record your voice")
    with gr.Accordion("Advanced settings", open=False):
        model_id = gr.Dropdown(
            choices=[
                "openai/whisper-tiny.en",  # Fastest (English)
                "openai/whisper-base.en",
                "openai/whisper-small.en",
                "distil-whisper/distil-small.en",  # Distil variant (English)
                "openai/whisper-tiny",  # Multilingual tiny
            ],
            value="openai/whisper-tiny.en",
            label="ASR model",
        )
        lang = gr.Textbox(value="en", label="Language hint (e.g., 'en', 'de', 'fr')", info="Whisper language code; leave as 'en' for English-only models.")
        device_pref = gr.Radio(choices=["auto", "cpu", "cuda"], value="auto", label="Device preference")
        pass_threshold = gr.Slider(0.50, 1.00, value=0.85, step=0.01, label="Match threshold")

    with gr.Row():
        btn_check = gr.Button("✅ Transcribe & Check", variant="primary")

    with gr.Row():
        hyp_out = gr.Textbox(label="Transcription", interactive=False)
    with gr.Row():
        score_out = gr.Label(label="Score")
        summary_out = gr.Label(label="Result")
    diff_out = gr.HTML(label="Word-level diff (red = expected but missing / green = extra or replacement)")

    # Events
    btn_gen.click(fn=gen_sentence, outputs=target)
    # Clear also resets the recorder (None) so a stale take cannot be checked
    # against the next sentence; the other widgets are reset to empty strings.
    btn_clear.click(
        fn=lambda: ("", None, "", "", "", ""),
        outputs=[target, audio, hyp_out, score_out, diff_out, summary_out],
    )
    # The outputs order must match the 4-tuple returned by check_pronunciation.
    btn_check.click(
        fn=check_pronunciation,
        inputs=[audio, target, model_id, lang, device_pref, pass_threshold],
        outputs=[hyp_out, score_out, diff_out, summary_out]
    )
158
+
159
# Start the Gradio app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ ffmpeg
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ gradio>=4.39.0
2
+ transformers>=4.44.0
3
+ torch>=2.2.0
4
+ accelerate>=0.33.0
5
+ sentencepiece>=0.2.0