RepeatAfterMe

Running on T4

App Files Files Community

meg-huggingface committed on 8 days ago

Commit

6110073

2 Parent(s): d688fcb e8d021a

Fixing some of the code issues from Lucy's update; now it all seems to be updated nicely and working.

Browse files

Files changed (2) hide show

app.py +18 -23
src/generate.py +35 -85

app.py CHANGED Viewed

@@ -173,14 +173,13 @@ with gr.Blocks(title="Voice Consent Gate") as demo:
                     label="Click for further information on this demo",
                     open=False):
                 gr.Markdown("""
-To create a basic voice cloning system with a voice consent gate, you need three parts:
-1. A way of generating novel consent sentences for the person whose voice will be cloned – the “speaker” – to say, uniquely referencing the current consent context.
-2. An _automatic speech recognition (ASR) system_ that recognizes the sentence conveying consent.
-3. A _voice-cloning text-to-speech (TTS) system_ that takes as input text and the speaker's speech snippets to generate speech.
-Since some voice-cloning TTS systems can now generate speech similar to a speaker’s voice using _just one sentence_, a sentence used for consent can **also** be used for voice cloning.
-""")
     with gr.Row():
         with gr.Column(scale=2):
             gr.Markdown(
@@ -195,11 +194,12 @@ Since some voice-cloning TTS systems can now generate speech similar to a speake
             """
             )
         with gr.Column():
-            consent_method = gr.Dropdown(label="Sentence generation method",
-                                         choices=["Llama 3.2 3B Instruct",
-                                                  "Pre-written"],
-                                         value="Pre-written")
-            asr_model = gr.Dropdown(label="Speech recognition model",
                                     choices=["openai/whisper-tiny.en",  # fastest (CPU-friendly)
                                             "openai/whisper-base.en",  # better accuracy, a bit slower
                                             "distil-whisper/distil-small.en"
@@ -208,31 +208,26 @@ Since some voice-cloning TTS systems can now generate speech similar to a speake
                                     value="openai/whisper-tiny.en",
                                     )
             voice_clone_model = gr.Dropdown(
-                label="Voice cloning model",
                 choices=["Chatterbox", ], value="Chatterbox")
-        #with gr.Column():
-        #    pass # Just for spacing
     with gr.Row():
         target = gr.Textbox(label="Target sentence", interactive=False,
                             placeholder="Click 'Generate sentence'")
     with gr.Row():
         btn_gen = gr.Button("🎲 Generate sentence", variant="primary")
         btn_clear = gr.Button("🧹 Clear")
     with gr.Row():
         consent_audio = gr.Audio(sources=["microphone"], type="filepath",
                                  label="Record your voice", key='consent_audio')
     with gr.Accordion("Advanced ASR settings", open=False):
         device_pref = gr.Radio(
             choices=["auto", "cpu", "cuda"],
             value="auto",
             label="Device preference"
         )
         pass_threshold = gr.Slider(0.50, 1.00, value=0.85, step=0.01,
                                    label="Match threshold")
     with gr.Row():
         btn_check = gr.Button("✅ Transcribe & Check", variant="primary")
     with gr.Row():
@@ -256,8 +251,8 @@ Since some voice-cloning TTS systems can now generate speech similar to a speake
                     with gr.Column():
                         gr.Markdown("## Audio input")
                         # Prepopulating with the consent audio.
-                        # Set interactive=True to be able to change.
-                        tts_audio = gr.Audio(audio_input, type="filepath")
                 with gr.Row():
                     with gr.Column():
                         gr.Markdown("## Text input")
@@ -280,7 +275,7 @@ Since some voice-cloning TTS systems can now generate speech similar to a speake
                                          label="Temperature", value=.8)
                 with gr.Row():
                     clone_btn = gr.Button("Clone!")
-                    cloned_audio = gr.Audio()
                     clone_btn.click(fn=clone_voice,
                                     inputs=[tts_audio, tts_text, exaggeration,
                                             cfg_weight, seed_num, temp],

                     label="Click for further information on this demo",
                     open=False):
                 gr.Markdown("""
+                    To create a basic voice cloning system with a voice consent gate, you need three parts:
+                    1. A way of generating novel consent sentences for the person whose voice will be cloned – the “speaker” – to say, uniquely referencing the current consent context.
+                    2. An _automatic speech recognition (ASR) system_ that recognizes the sentence conveying consent.
+                    3. A _voice-cloning text-to-speech (TTS) system_ that takes as input text and the speaker's speech snippets to generate speech.
+                    Since some voice-cloning TTS systems can now generate speech similar to a speaker’s voice using _just one sentence_, a sentence used for consent can **also** be used for voice cloning.
+                    """)
     with gr.Row():
         with gr.Column(scale=2):
             gr.Markdown(
             """
             )
         with gr.Column():
+            consent_method = gr.Dropdown(
+                label="Sentence generation method (currently limited to Llama 3.2 3B Instruct)",
+                choices=["Llama 3.2 3B Instruct"],
+                value="Llama 3.2 3B Instruct"
+            )
+            asr_model = gr.Dropdown(label="Speech recognition model (currently limited to Whisper)",
                                     choices=["openai/whisper-tiny.en",  # fastest (CPU-friendly)
                                             "openai/whisper-base.en",  # better accuracy, a bit slower
                                             "distil-whisper/distil-small.en"
                                     value="openai/whisper-tiny.en",
                                     )
             voice_clone_model = gr.Dropdown(
+                label="Voice cloning model (currently limited to Chatterbox)",
                 choices=["Chatterbox", ], value="Chatterbox")
     with gr.Row():
         target = gr.Textbox(label="Target sentence", interactive=False,
                             placeholder="Click 'Generate sentence'")
     with gr.Row():
         btn_gen = gr.Button("🎲 Generate sentence", variant="primary")
         btn_clear = gr.Button("🧹 Clear")
     with gr.Row():
         consent_audio = gr.Audio(sources=["microphone"], type="filepath",
                                  label="Record your voice", key='consent_audio')
     with gr.Accordion("Advanced ASR settings", open=False):
         device_pref = gr.Radio(
             choices=["auto", "cpu", "cuda"],
             value="auto",
             label="Device preference"
         )
+        # In your own code, do not provide users with the option to change this: Set it yourself.
         pass_threshold = gr.Slider(0.50, 1.00, value=0.85, step=0.01,
                                    label="Match threshold")
     with gr.Row():
         btn_check = gr.Button("✅ Transcribe & Check", variant="primary")
     with gr.Row():
                     with gr.Column():
                         gr.Markdown("## Audio input")
                         # Prepopulating with the consent audio.
+                        # Setting interactive=False keeps it from being possible to upload something else.
+                        tts_audio = gr.Audio(audio_input, type="filepath", interactive=False)
                 with gr.Row():
                     with gr.Column():
                         gr.Markdown("## Text input")
                                          label="Temperature", value=.8)
                 with gr.Row():
                     clone_btn = gr.Button("Clone!")
+                    cloned_audio = gr.Audio(show_download_button=True)
                     clone_btn.click(fn=clone_voice,
                                     inputs=[tts_audio, tts_text, exaggeration,
                                             cfg_weight, seed_num, temp],

src/generate.py CHANGED Viewed

@@ -8,17 +8,12 @@ This module connects to an external language model (in this case, the public
 Hugging Face Space for Llama 3.2 3B Instruct) to generate natural-sounding
 sentences that users can read aloud to give informed consent for voice cloning.
-If the model call fails (e.g., due to rate limits or network issues),
-a fallback sentence is chosen from a small built-in sentence bank.
 Functions:
     - _extract_llama_text(): Normalize the API output from the Llama demo.
     - gen_sentence_llm(): Generate a consent sentence from the Llama model Space.
-    - gen_sentence_set(): Select a random prewritten sentence (for fallback/testing).
 """
 import os
-import random
 from typing import Any
 from gradio_client import Client
@@ -26,21 +21,6 @@ import src.process as process
 from src.prompts import get_consent_generation_prompt
-# ------------------- Sentence Bank (unchanged) -------------------
-SENTENCE_BANK = [
-    "The quick brown fox jumps over the lazy dog.",
-    "I promise to speak clearly and at a steady pace.",
-    "Open source makes AI more transparent and inclusive.",
-    "Hugging Face Spaces make demos easy to share.",
-    "Today the weather in Berlin is pleasantly cool.",
-    "Privacy and transparency should go hand in hand.",
-    "Please generate a new sentence for me to read.",
-    "Machine learning can amplify or reduce inequality.",
-    "Responsible AI requires participation from everyone.",
-    "This microphone test checks my pronunciation accuracy.",
-]
 # ------------------- Model / Space Configuration -------------------
 # The demo connects to the Llama 3.2 3B Instruct Space on Hugging Face.
 # You can override these defaults by setting environment variables in your Space.
@@ -92,54 +72,44 @@ def _extract_llama_text(result: Any) -> str:
     return ""
-def gen_sentence(sentence_method="Pre-written", audio_model_name="Chatterbox"):
-    # chatterbox model name, detailed prompt (short_prompt=False)
-    if sentence_method == "Pre-written":
-        return gen_sentence_set()
-    else:
-        try:
-            return gen_sentence_llm(sentence_method,
-                audio_model_name,
-                fallback_on_error=False  # ← show errors during testing
-            )
-        except Exception as e:
-            # Show a helpful message directly in the Target sentence box
-            return f"[ERROR calling LLM] {type(e).__name__}: {e}"
-# TODO: Support more than just Llama 3.2 3B Instruct
-def gen_sentence_llm(sentence_method="Llama 3.2 3B Instruct", audio_model_name: str = "Chatterbox", *, fallback_on_error: bool = False  # Set True for production to avoid crashes
-) -> str:
     """
-    Generate a consent sentence using the Llama 3.2 3B Instruct demo Space.
-    This function constructs a prompt describing the linguistic and ethical
-    requirements for a consent sentence (via `get_consent_generation_prompt`)
-    and sends it to the Llama demo hosted on Hugging Face Spaces.
-    The response is normalized into a single English sentence suitable
-    for reading aloud.
-    Parameters
-    ----------
-    audio_model_name : str, optional
-        The name of the voice-cloning model to mention in the sentence.
-        Defaults to "Chatterbox".
-    fallback_on_error : bool, optional
-        If True, return a random fallback sentence instead of raising
-        an error when the Space call fails. Default is False for debugging.
-    Returns
-    -------
-    str
-        A clean, human-readable consent sentence.
-    Raises
-    ------
-    Exception
-        Re-raises the underlying error if `fallback_on_error` is False.
     """
     # Generate the full natural-language prompt that the LLM will receive
-    prompt = get_consent_generation_prompt(audio_model_name)
     try:
         # Initialize Gradio client for the Llama demo Space
@@ -170,24 +140,4 @@ def gen_sentence_llm(sentence_method="Llama 3.2 3B Instruct", audio_model_name:
     except Exception as e:
         print(f"[gen_sentence_llm] Llama Space call failed: {type(e).__name__}: {e}")
-        if fallback_on_error:
-            # If fallback is enabled, use a predefined sentence instead
-            return random.choice(SENTENCE_BANK)
-        # Otherwise propagate the exception so the UI displays it
-        raise
-def gen_sentence_set() -> str:
-    """
-    Return a sentence from a predefined static list.
-    This is used as a simple fallback generator when model-based
-    generation is unavailable or for testing the ASR pipeline
-    without network access.
-    Returns
-    -------
-    str
-        A single English sentence from the fallback bank.
-    """
-    return random.choice(SENTENCE_BANK)

 Hugging Face Space for Llama 3.2 3B Instruct) to generate natural-sounding
 sentences that users can read aloud to give informed consent for voice cloning.
 Functions:
     - _extract_llama_text(): Normalize the API output from the Llama demo.
     - gen_sentence_llm(): Generate a consent sentence from the Llama model Space.
 """
 import os
 from typing import Any
 from gradio_client import Client
 from src.prompts import get_consent_generation_prompt
 # ------------------- Model / Space Configuration -------------------
 # The demo connects to the Llama 3.2 3B Instruct Space on Hugging Face.
 # You can override these defaults by setting environment variables in your Space.
     return ""
+def gen_sentence(consent_method="Llama 3.2 3B Instruct", voice_clone_model="Chatterbox"):
     """
+    Always generate a sentence via the LLM.
+    :param consent_method:
+    """
+    try:
+        return gen_sentence_llm(consent_method, voice_clone_model)
+    except Exception as e:
+        # Show a helpful message directly in the Target sentence box
+        return f"[ERROR calling LLM] {type(e).__name__}: {e}"
+# TODO: Support more than just Llama 3.2 3B Instruct
+def gen_sentence_llm(consent_method="Llama 3.2 3B Instruct", voice_clone_model="Chatterbox") -> str:
     """
+   Generate a consent sentence using the Llama 3.2 3B Instruct demo Space.
+   This function constructs a prompt describing the linguistic and ethical
+   requirements for a consent sentence (via `get_consent_generation_prompt`)
+   and sends it to the Llama demo hosted on Hugging Face Spaces.
+   The response is normalized into a single English sentence suitable
+   for reading aloud.
+   Parameters
+   ----------
+   audio_model_name : str, optional
+       The name of the voice-cloning model to mention in the sentence.
+       Defaults to "Chatterbox".
+   Returns
+   -------
+   str
+       A clean, human-readable consent sentence.
+       :param consent_method:
+       :param voice_clone_model:
+   """
     # Generate the full natural-language prompt that the LLM will receive
+    prompt = get_consent_generation_prompt(voice_clone_model)
     try:
         # Initialize Gradio client for the Llama demo Space
     except Exception as e:
         print(f"[gen_sentence_llm] Llama Space call failed: {type(e).__name__}: {e}")
+        raise