meg-huggingface committed e3f1c3d · 1 parent: 6c0cd22

Merge in frimelle in-progress work.

Files changed (4)
  1. app.py +111 -23
  2. src/process.py +1 -0
  3. src/prompts.py +47 -0
  4. src/tts.py +43 -0
app.py CHANGED
@@ -2,12 +2,14 @@ import gradio as gr
 
 import src.generate as generate
 import src.process as process
+import src.tts as tts
 
 
 # ------------------- UI printing functions -------------------
 def clear_all():
-    # target, user_transcript, score_html, diff_html, result_html
-    return "", "", "", "", ""
+    # target, user_transcript, score_html, diff_html, result_html,
+    # tts_text, clone_status, tts_audio
+    return "", "", "", "", "", "", "", None
 
 
 def make_result_html(pass_threshold, passed, ratio):
@@ -66,15 +68,17 @@ def make_html(sentence_match):
                               sentence_match.user_tokens,
                               sentence_match.alignments)
     result_html, score_html = make_result_html(sentence_match.pass_threshold,
-                                               sentence_match.passed,
-                                               sentence_match.ratio)
+                                               sentence_match.passed,
+                                               sentence_match.ratio)
 
     return score_html, result_html, diff_html
 
 
 # ------------------- Core Check (English-only) -------------------
-def get_user_transcript(audio_path: gr.Audio, target_sentence: str, model_id: str, device_pref: str) -> (str, str):
-    """Uses the selected ASR model `model_id` to recognize words in the input `audio_path`.
+def get_user_transcript(audio_path: gr.Audio, target_sentence: str,
+                        model_id: str, device_pref: str) -> (str, str):
+    """ASR for the input audio and basic validation.
+    Uses the selected ASR model `model_id` to recognize words in the input `audio_path`.
     Parameters:
         audio_path: Processed audio file returned from gradio Audio component.
         target_sentence: Sentence the user needs to say.
@@ -84,7 +88,6 @@ def get_user_transcript(audio_path: gr.Audio, target_sentence: str, model_id: st
         error_msg: If there's an error, a string describing what happened.
         user_transcript: The recognized user utterance.
     """
-    error_msg = ""
     # Handles user interaction errors.
     if not target_sentence:
         return "Please generate a sentence first.", ""
@@ -92,20 +95,18 @@ def get_user_transcript(audio_path: gr.Audio, target_sentence: str, model_id: st
     if audio_path is None:
         return "Please start, record, then stop the audio recording before trying to transcribe.", ""
 
-    # Runs automatic speech recognition
+    # Runs the automatic speech recognition
     user_transcript = process.run_asr(audio_path, model_id, device_pref)
 
     # Handles processing errors.
-    if type(user_transcript) is Exception:
+    if isinstance(user_transcript, Exception):
        return f"Transcription failed: {user_transcript}", ""
-
-    return error_msg, user_transcript
+    return "", user_transcript
 
 
 def transcribe_check(audio_path, target_sentence, model_id, device_pref,
                      pass_threshold):
-    """Transcribe the input user audio, calculate the match to the target sentence,
-    create the output HTML string displaying the results.
+    """Transcribe user, calculate match to target sentence, create results HTML.
     Parameters:
         audio_path: Local path to recorded audio.
         target_sentence: Sentence the user needs to say.
@@ -118,21 +119,67 @@ def transcribe_check(audio_path, target_sentence, model_id, device_pref,
         result_html: HTML string describing the results, or an error message
     """
     # Transcribe user input
-    error_msg, user_transcript = get_user_transcript(audio_path, target_sentence, model_id,
-                                                     device_pref)
-    if error_msg != "":
+    error_msg, user_transcript = get_user_transcript(audio_path,
+                                                     target_sentence, model_id,
+                                                     device_pref)
+    if error_msg:
         score_html = ""
         diff_html = ""
         result_html = error_msg
     else:
         # Calculate match details between the target and recognized user input
-        sentence_match = process.SentenceMatcher(target_sentence, user_transcript,
+        sentence_match = process.SentenceMatcher(target_sentence,
+                                                 user_transcript,
                                                  pass_threshold)
         # Create the output to print out
         score_html, result_html, diff_html = make_html(sentence_match)
     return user_transcript, score_html, result_html, diff_html
 
 
+# ------------------- Voice cloning gate -------------------
+def clone_if_pass(
+        audio_path,        # ref voice (the same recorded clip)
+        target_sentence,   # sentence user was supposed to say
+        user_transcript,   # what ASR heard
+        tts_text,          # what we want to synthesize (in cloned voice)
+        pass_threshold,    # must meet or exceed this
+        tts_model_id,      # e.g., "coqui/XTTS-v2"
+        tts_language,      # e.g., "en"
+):
+    """
+    If user correctly read the target (>= threshold), clone their voice from the
+    recorded audio and speak 'tts_text'. Otherwise, refuse.
+    """
+    # Basic validations
+    if audio_path is None:
+        return None, "Record audio first (reference voice is required)."
+    if not target_sentence:
+        return None, "Generate a target sentence first."
+    if not user_transcript:
+        return None, "Transcribe first to verify the sentence."
+    if not tts_text:
+        return None, "Enter the sentence to synthesize."
+
+    # Recompute pass/fail to avoid relying on UI state
+    sm = process.SentenceMatcher(target_sentence, user_transcript,
+                                 pass_threshold)
+    if not sm.passed:
+        return None, (
+            f"❌ Cloning blocked: your reading did not reach the threshold "
+            f"({sm.ratio * 100:.1f}% < {int(pass_threshold * 100)}%)."
+        )
+
+    # Run zero-shot cloning
+    out = tts.run_tts_clone(audio_path, tts_text, model_id=tts_model_id,
+                            language=tts_language)
+    if isinstance(out, Exception):
+        return None, f"Voice cloning failed: {out}"
+    sr, wav = out
+    # Gradio Audio can take a tuple (sr, np.array)
+    return (
+        sr, wav), f"✅ Cloned and synthesized with {tts_model_id} ({tts_language})."
+
+
 # ------------------- UI -------------------
 with gr.Blocks(title="Say the Sentence (English)") as demo:
     gr.Markdown(
@@ -141,6 +188,7 @@ with gr.Blocks(title="Say the Sentence (English)") as demo:
         1) Generate a sentence.
         2) Record yourself reading it.
         3) Transcribe & check your accuracy.
+        4) If matched, clone your voice to speak any sentence you enter.
         """
     )
 
@@ -161,8 +209,8 @@ with gr.Blocks(title="Say the Sentence (English)") as demo:
             choices=[
                 "openai/whisper-tiny.en",   # fastest (CPU-friendly)
                 "openai/whisper-base.en",   # better accuracy, a bit slower
-                "distil-whisper/distil-small.en"
-                # optional distil English model
+                "distil-whisper/distil-small.en"  # optional distil English model
+                "distil-whisper/distil-small.en",
             ],
             value="openai/whisper-tiny.en",
             label="ASR model (English only)",
@@ -185,18 +233,58 @@ with gr.Blocks(title="Say the Sentence (English)") as demo:
     diff_html = gr.HTML(
         label="Word-level diff (red = expected but missing / green = extra or replacement)")
 
+    # gr.Markdown("## 🔁 Voice cloning (gated)")
+    # with gr.Row():
+    #     tts_text = gr.Textbox(
+    #         label="Text to synthesize (voice clone)",
+    #         placeholder="Type the sentence you want the cloned voice to say",
+    #     )
+    # with gr.Row():
+    #     tts_model_id = gr.Dropdown(
+    #         choices=[
+    #             "coqui/XTTS-v2",
+    #             # add others if you like, e.g. "myshell-ai/MeloTTS"
+    #         ],
+    #         value="coqui/XTTS-v2",
+    #         label="TTS (voice cloning) model",
+    #     )
+    #     tts_language = gr.Dropdown(
+    #         choices=["en", "de", "fr", "es", "it", "pt", "pl", "tr", "ru", "nl",
+    #                  "cs", "ar", "zh"],
+    #         value="en",
+    #         label="Language",
+    #     )
+
+    # with gr.Row():
+    #     btn_clone = gr.Button("🔁 Clone voice (if passed)", variant="secondary")
+    # with gr.Row():
+    #     tts_audio = gr.Audio(label="Cloned speech output", interactive=False)
+    #     clone_status = gr.Label(label="Cloning status")
+
     # -------- Events --------
-    # Uncomment below if you prefer to use the pre-specified set of target sentences.
+    # Use pre-specified sentence bank by default
     btn_gen.click(fn=generate.gen_sentence_set, outputs=target)
-    # Comment this out below if you prefer to use the pre-specified set of target sentences (above).
+    # Or use LLM generation:
     # btn_gen.click(fn=generate.gen_sentence_llm, outputs=target)
-    btn_clear.click(fn=clear_all,
-                    outputs=[target, user_transcript, score_html, result_html, diff_html])
+
+    btn_clear.click(
+        fn=clear_all,
+        outputs=[target, user_transcript, score_html, result_html, diff_html,]
+        # tts_text, clone_status, tts_audio]
+    )
+
     btn_check.click(
        fn=transcribe_check,
        inputs=[audio, target, model_id, device_pref, pass_threshold],
        outputs=[user_transcript, score_html, result_html, diff_html]
    )
 
+    # btn_clone.click(
+    #     fn=clone_if_pass,
+    #     inputs=[audio, target, user_transcript, tts_text, pass_threshold,
+    #             tts_model_id, tts_language],
+    #     outputs=[tts_audio, clone_status],
+    # )
+
 if __name__ == "__main__":
     demo.launch()
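
To make the new flow easier to follow, here is a minimal, non-authoritative sketch of the gate that app.py now implements, written as plain Python without the Gradio UI. Only `process.run_asr`, `process.SentenceMatcher`, and `tts.run_tts_clone` come from this commit; the file path, target sentence, 0.8 threshold, and "cpu" device preference are illustrative placeholders.

```python
import src.process as process
import src.tts as tts

REF_WAV = "my_recording.wav"   # placeholder path to the user's recorded clip
TARGET = "I give my consent to use my voice with the model coqui/XTTS-v2."

# 1) Transcribe the recording; run_asr returns a string or an Exception.
heard = process.run_asr(REF_WAV, "openai/whisper-tiny.en", "cpu")
if not isinstance(heard, Exception):
    # 2) Score the transcript against the target sentence.
    match = process.SentenceMatcher(TARGET, heard, 0.8)
    if match.passed:
        # 3) Only a passing read unlocks zero-shot voice cloning.
        out = tts.run_tts_clone(REF_WAV, "Hello from the cloned voice.",
                                model_id="coqui/XTTS-v2", language="en")
        if not isinstance(out, Exception):
            sr, wav = out   # sampling rate and float32 waveform
```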
src/process.py CHANGED
@@ -85,3 +85,4 @@ class SentenceMatcher:
         self.ratio, self.alignments = similarity_and_diff(self.target_tokens,
                                                           self.user_tokens)
         self.passed: bool = self.ratio >= self.pass_threshold
+
src/prompts.py ADDED
@@ -0,0 +1,47 @@
+# src/utils/prompts.py
+
+def get_consent_generation_prompt(audio_model_name: str, short_prompt: bool = False) -> str:
+    """
+    Returns a text prompt instructing the model to generate a natural-sounding
+    consent sentence for voice cloning with the specified model.
+
+    Args:
+        audio_model_name (str): Name of the audio model to mention in the prompt.
+        short_prompt (bool): If True, returns a concise one-line prompt suitable
+            for direct model input. If False (default), returns the full detailed prompt.
+
+    Returns:
+        str: The prompt text.
+    """
+
+    if short_prompt:
+        return (
+            f"Generate one natural, spoken-style English sentence (10–20 words) in which a person "
+            f"clearly gives informed consent to use their voice for generating synthetic audio "
+            f"with the model {audio_model_name}. The sentence should sound conversational, include "
+            f"a clear consent phrase like 'I give my consent' or 'I agree', mention {audio_model_name} "
+            f"by name, and be phonetically varied but neutral in tone. Output only the final sentence."
+        )
+
+    return f"""
+    Generate a short, natural-sounding English sentence (10–20 words) that a person could say aloud
+    to clearly state their informed consent to use their voice for generating synthetic audio with
+    an AI model called {audio_model_name}.
+
+    The sentence should:
+    - Sound natural and conversational, not like legal text.
+    - Explicitly include a consent phrase, such as “I give my consent,” “I agree,” or “I allow.”
+    - Mention the model name ({audio_model_name}) clearly in the sentence.
+    - Include a neutral descriptive clause before or after the consent phrase to add phonetic variety
+      (e.g., “The weather today is bright and calm” or “This recording is made clearly and freely.”)
+    - Have a neutral or polite tone (no emotional extremes).
+    - Be comfortable to read aloud and phonetically rich, covering diverse vowels and consonants naturally.
+    - Be self-contained, so the full sentence can serve as an independent audio clip.
+
+    Examples of structure to follow:
+    - “The weather is clear and warm today. I give my consent to use my voice for generating audio with the model {audio_model_name}.”
+    - “I give my consent to use my voice for generating audio with the model {audio_model_name}. This statement is made freely and clearly.”
+    - “Good afternoon. I agree to the use of my recorded voice for audio generation with the model {audio_model_name}.”
+
+    The output should be a single, natural sentence ready to be spoken aloud for recording purposes.
+    """
src/tts.py ADDED
@@ -0,0 +1,43 @@
+# src/tts.py
+from __future__ import annotations
+from typing import Tuple, Union
+
+import numpy as np
+from transformers import pipeline
+
+# We use the text-to-speech pipeline with XTTS v2 (zero-shot cloning)
+# Example forward params: {"speaker_wav": "/path/to/ref.wav", "language": "en"}
+
+def get_tts_pipeline(model_id: str):
+    """
+    Create a TTS pipeline for the given model.
+    XTTS v2 works well for zero-shot cloning and is available on the Hub.
+    """
+    # NOTE: Add device selection similar to ASR if needed
+    return pipeline("text-to-speech", model=model_id)
+
+def run_tts_clone(
+    ref_audio_path: str,
+    text_to_speak: str,
+    model_id: str = "coqui/XTTS-v2",
+    language: str = "en",
+) -> Union[Tuple[int, np.ndarray], Exception]:
+    """
+    Synthesize 'text_to_speak' in the cloned voice from 'ref_audio_path'.
+
+    Returns:
+        (sampling_rate, waveform) on success, or Exception on failure.
+    """
+    try:
+        tts = get_tts_pipeline(model_id)
+        result = tts(
+            text_to_speak,
+            forward_params={"speaker_wav": ref_audio_path, "language": language},
+        )
+        # transformers TTS returns dict like: {"audio": {"array": np.ndarray, "sampling_rate": 24000}}
+        audio = result["audio"]
+        sr = int(audio["sampling_rate"])
+        wav = audio["array"].astype(np.float32)
+        return sr, wav
+    except Exception as e:
+        return e
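
A short usage sketch for `run_tts_clone`, assuming a local reference recording named my_voice.wav; `soundfile` is an extra dependency used here only to save the output and is not added by this commit.

```python
import soundfile as sf

from src.tts import run_tts_clone

out = run_tts_clone("my_voice.wav", "This sentence is spoken in the cloned voice.",
                    model_id="coqui/XTTS-v2", language="en")
if isinstance(out, Exception):
    print(f"Voice cloning failed: {out}")
else:
    sr, wav = out
    sf.write("cloned_output.wav", wav, sr)  # write the synthesized waveform to disk
    print(f"Wrote cloned_output.wav at {sr} Hz")
```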