Spaces:
				
			
			
	
			
			
		Running
		
			on 
			
			T4
	
	
	
			
			
	
	
	
	
		
		
		Running
		
			on 
			
			T4
	initial commit
Browse files- app.py +160 -0
 - packages.txt +1 -0
 - requirements.txt +5 -0
 
    	
        app.py
    ADDED
    
    | 
         @@ -0,0 +1,160 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            import gradio as gr
         
     | 
| 2 | 
         
            +
            import random
         
     | 
| 3 | 
         
            +
            import re
         
     | 
| 4 | 
         
            +
            import difflib
         
     | 
| 5 | 
         
            +
            import torch
         
     | 
| 6 | 
         
            +
            from functools import lru_cache
         
     | 
| 7 | 
         
            +
            from transformers import pipeline
         
     | 
| 8 | 
         
            +
             
     | 
| 9 | 
         
            +
            # -------- Sentences to practice (customize freely) ----------
         
     | 
| 10 | 
         
            +
            SENTENCE_BANK = [
         
     | 
| 11 | 
         
            +
                "The quick brown fox jumps over the lazy dog.",
         
     | 
| 12 | 
         
            +
                "I promise to speak clearly and at a steady pace.",
         
     | 
| 13 | 
         
            +
                "Open source makes AI more transparent and inclusive.",
         
     | 
| 14 | 
         
            +
                "Hugging Face Spaces make demos easy to share.",
         
     | 
| 15 | 
         
            +
                "Today the weather in Berlin is pleasantly cool.",
         
     | 
| 16 | 
         
            +
                "Privacy and transparency should go hand in hand.",
         
     | 
| 17 | 
         
            +
                "Please generate a new sentence for me to read.",
         
     | 
| 18 | 
         
            +
                "Machine learning can amplify or reduce inequality.",
         
     | 
| 19 | 
         
            +
                "Responsible AI requires participation from everyone.",
         
     | 
| 20 | 
         
            +
                "This microphone test checks my pronunciation accuracy.",
         
     | 
| 21 | 
         
            +
            ]
         
     | 
| 22 | 
         
            +
             
     | 
| 23 | 
         
            +
            # -------- Utilities ----------
         
     | 
| 24 | 
         
            +
            def normalize_text(t: str) -> str:
                """Normalize text for word-level comparison.

                Lowercases the input, maps typographic apostrophes to ``'``, replaces
                every run of characters that is not a letter, digit, apostrophe or
                hyphen with a single space, and strips apostrophes/hyphens that are
                not inside a word.

                Args:
                    t: Raw sentence or transcription.

                Returns:
                    The normalized text, with words separated by single spaces.
                """
                # Curly quotes would otherwise be treated as separators and split
                # contractions in two.
                t = t.lower().replace("\u2018", "'").replace("\u2019", "'")
                # \w is Unicode-aware for str patterns, so accented letters of any
                # language are kept; "_" is excluded explicitly because \w matches it.
                t = re.sub(r"(?:[^\w'\-]|_)+", " ", t)
                # Drop punctuation that is not inside a word (e.g. a free-standing " - ").
                words = (word.strip("'-") for word in t.split())
                return " ".join(word for word in words if word)
         
     | 
| 31 | 
         
            +
             
     | 
| 32 | 
         
            +
            def similarity_and_diff(ref: str, hyp: str):
                """Compare two sentences word by word.

                Args:
                    ref: Reference (expected) text.
                    hyp: Hypothesis (transcribed) text.

                Returns:
                    A ``(ratio, html)`` tuple. ``ratio`` is the
                    ``difflib.SequenceMatcher`` similarity of the two word lists (0..1).
                    ``html`` is a ``<div>`` in which words missing from ``hyp`` are
                    struck through on a red background and words only present in
                    ``hyp`` are shown on a green background.
                """
                # Imported locally because only this function needs it.
                from html import escape

                del_style = "background:#ffe0e0;text-decoration:line-through;"
                ins_style = "background:#e0ffe0;"

                def render(tokens, style=None):
                    # Escape &, < and > so token text can never be read as markup.
                    text = escape(" ".join(tokens), quote=False)
                    if style is None:
                        return " " + text
                    return f' <span style="{style}">{text}</span>'

                ref_tokens = ref.split()
                hyp_tokens = hyp.split()
                sm = difflib.SequenceMatcher(a=ref_tokens, b=hyp_tokens)

                out = []
                for op, i1, i2, j1, j2 in sm.get_opcodes():
                    if op == "equal":
                        out.append(render(ref_tokens[i1:i2]))
                        continue
                    # A "replace" is rendered as a deletion followed by an insertion.
                    if op in ("delete", "replace"):
                        out.append(render(ref_tokens[i1:i2], del_style))
                    if op in ("insert", "replace"):
                        out.append(render(hyp_tokens[j1:j2], ins_style))

                body = "".join(out).strip()
                return sm.ratio(), f'<div style="line-height:1.6;font-size:1rem;">{body}</div>'
         
     | 
| 57 | 
         
            +
             
     | 
| 58 | 
         
            +
            @lru_cache(maxsize=2)
            def get_asr(model_id: str, device_preference: str):
                """Build (and memoize) a speech-recognition pipeline.

                ``device_preference`` is one of 'auto', 'cpu' or 'cuda'. Device 0 is
                selected for 'auto' and 'cuda' when CUDA is available; every other
                case selects device -1.
                """
                wants_gpu = device_preference in ("cuda", "auto")
                device = 0 if wants_gpu and torch.cuda.is_available() else -1
                asr_options = {
                    "model": model_id,
                    "device": device,
                    "chunk_length_s": 30,
                    "return_timestamps": False,
                }
                return pipeline("automatic-speech-recognition", **asr_options)
         
     | 
| 74 | 
         
            +
             
     | 
| 75 | 
         
            +
            def gen_sentence():
                """Pick one practice sentence at random from ``SENTENCE_BANK``."""
                chosen = random.choice(SENTENCE_BANK)
                return chosen
         
     | 
| 77 | 
         
            +
             
     | 
| 78 | 
         
            +
            def _language_kwargs(model_id, lang):
                """Return Whisper ``generate_kwargs`` for the language hint, or None if none applies."""
                hint = (lang or "").strip()
                # English-only checkpoints ("*.en") do not accept an explicit language,
                # so the hint is only forwarded to multilingual models.
                if not hint or str(model_id).endswith(".en"):
                    return None
                return {"language": hint}


            def check_pronunciation(audio_path, target_sentence, model_id, lang, device_pref, pass_threshold):
                """Transcribe a recording and score it against the target sentence.

                Args:
                    audio_path: Path of the recorded audio file; empty/None if nothing
                        was recorded.
                    target_sentence: Sentence the user was asked to read.
                    model_id: Model identifier passed to ``get_asr``.
                    lang: Language hint for multilingual models; ignored when blank or
                        when ``model_id`` ends with ".en".
                    device_pref: Device preference passed to ``get_asr``.
                    pass_threshold: Minimum similarity ratio (0..1) counted as a match.

                Returns:
                    A ``(transcription, score, diff_html, summary)`` tuple of strings.
                    On missing input or a transcription error the first three items
                    are empty and ``summary`` carries the message.
                """
                if not target_sentence or not target_sentence.strip():
                    return "", "", "", "Please generate a sentence first."
                if not audio_path:
                    return "", "", "", "Please record your voice first."

                # Loading the model can fail as well (download, out of memory), so it is
                # reported through the same message as a failed transcription.
                try:
                    asr = get_asr(model_id, device_pref)
                    result = asr(audio_path, generate_kwargs=_language_kwargs(model_id, lang))
                    hyp_raw = result["text"].strip()
                except Exception as e:
                    return "", "", "", f"Transcription failed: {e}"

                ref_norm = normalize_text(target_sentence)
                hyp_norm = normalize_text(hyp_raw)
                ratio, diff_html = similarity_and_diff(ref_norm, hyp_norm)

                # round() rather than int(): 0.57 * 100 is 56.999... and would show as 56%.
                percent = round(pass_threshold * 100)
                if ratio >= pass_threshold:
                    summary = f"✅ Correct (≥ {percent}%)"
                else:
                    summary = f"❌ Not a match (need ≥ {percent}%)"
                score = f"Similarity: {ratio*100:.1f}%"

                return hyp_raw, score, diff_html, summary
         
     | 
| 105 | 
         
            +
             
     | 
| 106 | 
         
            +
            # Gradio UI: target sentence, microphone input, advanced settings, results.
            with gr.Blocks(title="Say the Sentence") as demo:
                gr.Markdown(
                    """
                    # 🎤 Say the Sentence
                    1) Generate a sentence.  
                    2) Press the mic to record yourself reading it.  
                    3) Transcribe & check.  
                    """
                )

                with gr.Row():
                    target = gr.Textbox(label="Target sentence", interactive=False, placeholder="Click 'Generate sentence'")
                with gr.Row():
                    btn_gen = gr.Button("🎲 Generate sentence", variant="primary")
                    btn_clear = gr.Button("🧹 Clear")

                with gr.Row():
                    audio = gr.Audio(sources=["microphone"], type="filepath", label="Record your voice")
                with gr.Accordion("Advanced settings", open=False):
                    model_id = gr.Dropdown(
                        choices=[
                            "openai/whisper-tiny.en",      # Fastest (English)
                            "openai/whisper-base.en",
                            "openai/whisper-small.en",
                            "distil-whisper/distil-small.en",  # Distil variant (English)
                            "openai/whisper-tiny",         # Multilingual tiny
                        ],
                        value="openai/whisper-tiny.en",
                        label="ASR model",
                    )
                    lang = gr.Textbox(value="en", label="Language hint (e.g., 'en', 'de', 'fr')", info="Whisper language code; leave as 'en' for English-only models.")
                    device_pref = gr.Radio(choices=["auto", "cpu", "cuda"], value="auto", label="Device preference")
                    pass_threshold = gr.Slider(0.50, 1.00, value=0.85, step=0.01, label="Match threshold")

                with gr.Row():
                    btn_check = gr.Button("✅ Transcribe & Check", variant="primary")

                with gr.Row():
                    hyp_out = gr.Textbox(label="Transcription", interactive=False)
                with gr.Row():
                    score_out = gr.Label(label="Score")
                    summary_out = gr.Label(label="Result")
                diff_out = gr.HTML(label="Word-level diff (red = expected but missing / green = extra or replacement)")

                # Events
                btn_gen.click(fn=gen_sentence, outputs=target)
                # Clear also resets the recording (None), so a stale take cannot be
                # scored against the next sentence.
                btn_clear.click(
                    fn=lambda: ("", None, "", "", "", ""),
                    outputs=[target, audio, hyp_out, score_out, diff_out, summary_out],
                )
                btn_check.click(
                    fn=check_pronunciation,
                    inputs=[audio, target, model_id, lang, device_pref, pass_threshold],
                    outputs=[hyp_out, score_out, diff_out, summary_out],
                )

            if __name__ == "__main__":
                demo.launch()
         
     | 
    	
        packages.txt
    ADDED
    
    | 
         @@ -0,0 +1 @@ 
     | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            ffmpeg
         
     | 
    	
        requirements.txt
    ADDED
    
    | 
         @@ -0,0 +1,5 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            gradio>=4.39.0
         
     | 
| 2 | 
         
            +
            transformers>=4.44.0
         
     | 
| 3 | 
         
            +
            torch>=2.2.0
         
     | 
| 4 | 
         
            +
            accelerate>=0.33.0
         
     | 
| 5 | 
         
            +
            sentencepiece>=0.2.0
         
     |