meg-huggingface committed
Commit 6110073 · Parents: d688fcb e8d021a

Fixing some of the code issues from Lucy's update; everything now appears to be updated and working.

Files changed (2)
  1. app.py +18 -23
  2. src/generate.py +35 -85
app.py CHANGED
@@ -173,14 +173,13 @@ with gr.Blocks(title="Voice Consent Gate") as demo:
                  label="Click for further information on this demo",
                  open=False):
         gr.Markdown("""
-
-To create a basic voice cloning system with a voice consent gate, you need three parts:
-1. A way of generating novel consent sentences for the person whose voice will be cloned – the “speaker” – to say, uniquely referencing the current consent context.
-2. An _automatic speech recognition (ASR) system_ that recognizes the sentence conveying consent.
-3. A _voice-cloning text-to-speech (TTS) system_ that takes as input text and the speaker's speech snippets to generate speech.
-
-Since some voice-cloning TTS systems can now generate speech similar to a speaker’s voice using _just one sentence_, a sentence used for consent can **also** be used for voice cloning.
-""")
+To create a basic voice cloning system with a voice consent gate, you need three parts:
+1. A way of generating novel consent sentences for the person whose voice will be cloned – the “speaker” – to say, uniquely referencing the current consent context.
+2. An _automatic speech recognition (ASR) system_ that recognizes the sentence conveying consent.
+3. A _voice-cloning text-to-speech (TTS) system_ that takes as input text and the speaker's speech snippets to generate speech.
+
+Since some voice-cloning TTS systems can now generate speech similar to a speaker’s voice using _just one sentence_, a sentence used for consent can **also** be used for voice cloning.
+""")
     with gr.Row():
         with gr.Column(scale=2):
             gr.Markdown(
@@ -195,11 +194,12 @@ Since some voice-cloning TTS systems can now generate speech similar to a speake
                 """
             )
         with gr.Column():
-            consent_method = gr.Dropdown(label="Sentence generation method",
-                                         choices=["Llama 3.2 3B Instruct",
-                                                  "Pre-written"],
-                                         value="Pre-written")
-            asr_model = gr.Dropdown(label="Speech recognition model",
+            consent_method = gr.Dropdown(
+                label="Sentence generation method (currently limited to Llama 3.2 3B Instruct)",
+                choices=["Llama 3.2 3B Instruct"],
+                value="Llama 3.2 3B Instruct"
+            )
+            asr_model = gr.Dropdown(label="Speech recognition model (currently limited to Whisper)",
                                     choices=["openai/whisper-tiny.en",       # fastest (CPU-friendly)
                                              "openai/whisper-base.en",       # better accuracy, a bit slower
                                              "distil-whisper/distil-small.en"
@@ -208,31 +208,26 @@ Since some voice-cloning TTS systems can now generate speech similar to a speake
                                    value="openai/whisper-tiny.en",
                                    )
             voice_clone_model = gr.Dropdown(
-                label="Voice cloning model",
+                label="Voice cloning model (currently limited to Chatterbox)",
                 choices=["Chatterbox", ], value="Chatterbox")
-            #with gr.Column():
-            #    pass # Just for spacing
     with gr.Row():
         target = gr.Textbox(label="Target sentence", interactive=False,
                             placeholder="Click 'Generate sentence'")
-
     with gr.Row():
         btn_gen = gr.Button("🎲 Generate sentence", variant="primary")
         btn_clear = gr.Button("🧹 Clear")
-
     with gr.Row():
         consent_audio = gr.Audio(sources=["microphone"], type="filepath",
                                  label="Record your voice", key='consent_audio')
-
     with gr.Accordion("Advanced ASR settings", open=False):
         device_pref = gr.Radio(
             choices=["auto", "cpu", "cuda"],
             value="auto",
             label="Device preference"
         )
+        # In your own code, do not provide users with the option to change this: Set it yourself.
         pass_threshold = gr.Slider(0.50, 1.00, value=0.85, step=0.01,
                                    label="Match threshold")
-
     with gr.Row():
         btn_check = gr.Button("✅ Transcribe & Check", variant="primary")
     with gr.Row():
@@ -256,8 +251,8 @@ Since some voice-cloning TTS systems can now generate speech similar to a speake
         with gr.Column():
             gr.Markdown("## Audio input")
             # Prepopulating with the consent audio.
-            # Set interactive=True to be able to change.
-            tts_audio = gr.Audio(audio_input, type="filepath")
+            # Setting interactive=False keeps it from being possible to upload something else.
+            tts_audio = gr.Audio(audio_input, type="filepath", interactive=False)
     with gr.Row():
         with gr.Column():
             gr.Markdown("## Text input")
@@ -280,7 +275,7 @@ Since some voice-cloning TTS systems can now generate speech similar to a speake
                              label="Temperature", value=.8)
     with gr.Row():
         clone_btn = gr.Button("Clone!")
-        cloned_audio = gr.Audio()
+        cloned_audio = gr.Audio(show_download_button=True)
     clone_btn.click(fn=clone_voice,
                     inputs=[tts_audio, tts_text, exaggeration,
                             cfg_weight, seed_num, temp],
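
For context on what these widgets feed into: the gate itself is a comparison between the ASR transcript of consent_audio and the target sentence, with pass_threshold as the cut-off. That comparison lives in src/process.py, which this commit does not touch, so the sketch below is only an illustration of the idea; the function name, the normalization rules, and the use of difflib are assumptions, not this repo's implementation.

# Minimal sketch of the gate logic behind the widgets above, NOT the code in
# src/process.py. Assumptions: the ASR model ids match the asr_model dropdown,
# and the pass/fail rule is a plain similarity ratio against pass_threshold.
import re
from difflib import SequenceMatcher

from transformers import pipeline


def normalize(text: str) -> str:
    """Lowercase and drop punctuation so formatting differences don't fail the gate."""
    return re.sub(r"[^a-z0-9 ]+", " ", text.lower()).strip()


def passes_consent_gate(audio_path: str, target_sentence: str,
                        asr_model: str = "openai/whisper-tiny.en",
                        threshold: float = 0.85) -> bool:
    """Transcribe the recorded consent audio and compare it to the target sentence."""
    asr = pipeline("automatic-speech-recognition", model=asr_model)
    transcript = asr(audio_path)["text"]
    score = SequenceMatcher(None, normalize(transcript), normalize(target_sentence)).ratio()
    return score >= threshold

Keeping the threshold out of end users' hands, as the new comment in the diff advises, matters because lowering it weakens the gate.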
src/generate.py CHANGED
@@ -8,17 +8,12 @@ This module connects to an external language model (in this case, the public
 Hugging Face Space for Llama 3.2 3B Instruct) to generate natural-sounding
 sentences that users can read aloud to give informed consent for voice cloning.
 
-If the model call fails (e.g., due to rate limits or network issues),
-a fallback sentence is chosen from a small built-in sentence bank.
-
 Functions:
 - _extract_llama_text(): Normalize the API output from the Llama demo.
 - gen_sentence_llm(): Generate a consent sentence from the Llama model Space.
-- gen_sentence_set(): Select a random prewritten sentence (for fallback/testing).
 """
 
 import os
-import random
 from typing import Any
 from gradio_client import Client
 
@@ -26,21 +21,6 @@ import src.process as process
 from src.prompts import get_consent_generation_prompt
 
 
-# ------------------- Sentence Bank (unchanged) -------------------
-SENTENCE_BANK = [
-    "The quick brown fox jumps over the lazy dog.",
-    "I promise to speak clearly and at a steady pace.",
-    "Open source makes AI more transparent and inclusive.",
-    "Hugging Face Spaces make demos easy to share.",
-    "Today the weather in Berlin is pleasantly cool.",
-    "Privacy and transparency should go hand in hand.",
-    "Please generate a new sentence for me to read.",
-    "Machine learning can amplify or reduce inequality.",
-    "Responsible AI requires participation from everyone.",
-    "This microphone test checks my pronunciation accuracy.",
-]
-
-
 # ------------------- Model / Space Configuration -------------------
 # The demo connects to the Llama 3.2 3B Instruct Space on Hugging Face.
 # You can override these defaults by setting environment variables in your Space.
@@ -92,54 +72,44 @@ def _extract_llama_text(result: Any) -> str:
     return ""
 
 
-def gen_sentence(sentence_method="Pre-written", audio_model_name="Chatterbox"):
-    # chatterbox model name, detailed prompt (short_prompt=False)
-    if sentence_method == "Pre-written":
-        return gen_sentence_set()
-    else:
-        try:
-            return gen_sentence_llm(sentence_method,
-                                    audio_model_name,
-                                    fallback_on_error=False  # ← show errors during testing
-                                    )
-        except Exception as e:
-            # Show a helpful message directly in the Target sentence box
-            return f"[ERROR calling LLM] {type(e).__name__}: {e}"
-
-# TODO: Support more than just Llama 3.2 3B Instruct
-def gen_sentence_llm(sentence_method="Llama 3.2 3B Instruct", audio_model_name: str = "Chatterbox", *, fallback_on_error: bool = False  # Set True for production to avoid crashes
-                     ) -> str:
+def gen_sentence(consent_method="Llama 3.2 3B Instruct", voice_clone_model="Chatterbox"):
     """
-    Generate a consent sentence using the Llama 3.2 3B Instruct demo Space.
-
-    This function constructs a prompt describing the linguistic and ethical
-    requirements for a consent sentence (via `get_consent_generation_prompt`)
-    and sends it to the Llama demo hosted on Hugging Face Spaces.
-
-    The response is normalized into a single English sentence suitable
-    for reading aloud.
-
-    Parameters
-    ----------
-    audio_model_name : str, optional
-        The name of the voice-cloning model to mention in the sentence.
-        Defaults to "Chatterbox".
-    fallback_on_error : bool, optional
-        If True, return a random fallback sentence instead of raising
-        an error when the Space call fails. Default is False for debugging.
-
-    Returns
-    -------
-    str
-        A clean, human-readable consent sentence.
+    Always generate a sentence via the LLM.
+    :param consent_method:
+    """
+    try:
+        return gen_sentence_llm(consent_method, voice_clone_model)
+    except Exception as e:
+        # Show a helpful message directly in the Target sentence box
+        return f"[ERROR calling LLM] {type(e).__name__}: {e}"
 
-    Raises
-    ------
-    Exception
-        Re-raises the underlying error if `fallback_on_error` is False.
+# TODO: Support more than just Llama 3.2 3B Instruct
+def gen_sentence_llm(consent_method="Llama 3.2 3B Instruct", voice_clone_model="Chatterbox") -> str:
     """
+    Generate a consent sentence using the Llama 3.2 3B Instruct demo Space.
+
+    This function constructs a prompt describing the linguistic and ethical
+    requirements for a consent sentence (via `get_consent_generation_prompt`)
+    and sends it to the Llama demo hosted on Hugging Face Spaces.
+
+    The response is normalized into a single English sentence suitable
+    for reading aloud.
+
+    Parameters
+    ----------
+    audio_model_name : str, optional
+        The name of the voice-cloning model to mention in the sentence.
+        Defaults to "Chatterbox".
+
+    Returns
+    -------
+    str
+        A clean, human-readable consent sentence.
+    :param consent_method:
+    :param voice_clone_model:
+    """
     # Generate the full natural-language prompt that the LLM will receive
-    prompt = get_consent_generation_prompt(audio_model_name)
+    prompt = get_consent_generation_prompt(voice_clone_model)
 
     try:
         # Initialize Gradio client for the Llama demo Space
@@ -170,24 +140,4 @@ def gen_sentence_llm(sentence_method="Llama 3.2 3B Instruct", audio_model_name:
 
     except Exception as e:
         print(f"[gen_sentence_llm] Llama Space call failed: {type(e).__name__}: {e}")
-        if fallback_on_error:
-            # If fallback is enabled, use a predefined sentence instead
-            return random.choice(SENTENCE_BANK)
-        # Otherwise propagate the exception so the UI displays it
-        raise
-
-
-def gen_sentence_set() -> str:
-    """
-    Return a sentence from a predefined static list.
-
-    This is used as a simple fallback generator when model-based
-    generation is unavailable or for testing the ASR pipeline
-    without network access.
-
-    Returns
-    -------
-    str
-        A single English sentence from the fallback bank.
-    """
-    return random.choice(SENTENCE_BANK)
+        raise
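
With the sentence bank and fallback_on_error path removed, gen_sentence() now either returns a model-generated sentence or surfaces the failure as an "[ERROR calling LLM] ..." string in the Target sentence box. The underlying pattern is a gradio_client call to a hosted Llama Space; the sketch below shows that pattern with a placeholder Space id and endpoint name, since the real values are read from environment variables in parts of src/generate.py not shown in this diff.

# Sketch of the call pattern gen_sentence_llm() relies on. "owner/llama-demo"
# and api_name="/chat" are placeholders, not the Space configured in this repo,
# and the argument list of predict() depends on the Space's actual API.
from gradio_client import Client


def ask_space(prompt: str) -> str:
    client = Client("owner/llama-demo")              # placeholder Space id
    raw = client.predict(prompt, api_name="/chat")   # placeholder endpoint
    # generate.py normalizes whatever comes back with _extract_llama_text();
    # here we just coerce it to a string.
    return raw if isinstance(raw, str) else str(raw)


if __name__ == "__main__":
    try:
        print(ask_space("Write one short sentence granting consent to clone my voice."))
    except Exception as e:
        # Mirrors the commit's new behavior: no fallback bank; errors propagate
        # and are shown to the user instead of being silently replaced.
        print(f"[ERROR calling LLM] {type(e).__name__}: {e}")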