vixtts-demo

Paused

App Files Community

Uhhy committed on Sep 12, 2024

Commit

12802b8

verified ·

1 Parent(s): 24e9d34

Update app.py

Browse files

Files changed (1) hide show

app.py +29 -7

app.py CHANGED Viewed

@@ -7,14 +7,19 @@ import uuid
 from io import StringIO
 import gradio as gr
 import spaces
 import torch
 import torchaudio
 from huggingface_hub import HfApi, hf_hub_download, snapshot_download
 from TTS.tts.configs.xtts_config import XttsConfig
 from TTS.tts.models.xtts import Xtts
 from vinorm import TTSnorm
 os.system("python -m unidic download")
 HF_TOKEN = None
@@ -54,7 +59,7 @@ supported_languages = config.languages
 if not "vi" in supported_languages:
     supported_languages.append("vi")
 if not "es-AR" in supported_languages:
-    supported_languages.append("es-AR")
 def normalize_vietnamese_text(text):
     text = (
@@ -85,6 +90,14 @@ def calculate_keep_len(text, lang):
         return 13000 * word_count + 2000 * num_punct
     return -1
 @spaces.GPU(duration=0)
 def predict(
@@ -138,6 +151,11 @@ def predict(
         if normalize_text and language == "vi":
             prompt = normalize_vietnamese_text(prompt)
         t0 = time.time()
         out = MODEL.inference(
             prompt,
@@ -145,7 +163,7 @@ def predict(
             gpt_cond_latent,
             speaker_embedding,
             repetition_penalty=5.0,
-            temperature=0.75,
             enable_text_splitting=True,
         )
         inference_time = time.time() - t0
@@ -158,7 +176,11 @@ def predict(
         keep_len = calculate_keep_len(prompt, language)
         out["wav"] = out["wav"][:keep_len]
-        torchaudio.save("output.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
     except RuntimeError as e:
         if "device-side assert" in str(e):
@@ -230,7 +252,7 @@ with gr.Blocks(analytics_enabled=False) as demo:
             language_gr = gr.Dropdown(
                 label="Idioma",
                 choices=[
-                    "es-AR",
                     "vi",
                     "en",
                     "es",
@@ -251,15 +273,15 @@ with gr.Blocks(analytics_enabled=False) as demo:
                     "hi",
                 ],
                 max_choices=1,
-                value="es-AR",
             )
             normalize_text = gr.Checkbox(
-                label="Normalizar texto en vietnamita",
                 info="Solo aplicable al idioma vietnamita",
                 value=True,
             )
             ref_gr = gr.Audio(
-                label="Audio de referencia (opcional)",
                 type="filepath",
                 value="model/samples/nu-luu-loat.wav",
             )

 from io import StringIO
 import gradio as gr
+import nltk
+import numpy as np
+import pyrubberband
 import spaces
 import torch
 import torchaudio
 from huggingface_hub import HfApi, hf_hub_download, snapshot_download
+from nltk.sentiment import SentimentIntensityAnalyzer
 from TTS.tts.configs.xtts_config import XttsConfig
 from TTS.tts.models.xtts import Xtts
 from vinorm import TTSnorm
+nltk.download('vader_lexicon')
 os.system("python -m unidic download")
 HF_TOKEN = None
 if not "vi" in supported_languages:
     supported_languages.append("vi")
 if not "es-AR" in supported_languages:
+    supported_languages.append("es-AR")
 def normalize_vietnamese_text(text):
     text = (
         return 13000 * word_count + 2000 * num_punct
     return -1
+def analyze_sentiment(text):  # Return the 'compound' polarity score computed for `text`.
+    sia = SentimentIntensityAnalyzer()  # a fresh analyzer is built on every call
+    scores = sia.polarity_scores(text)  # NOTE(review): the vader_lexicon resource is presumably English-oriented — confirm scores are meaningful for "vi"/"es-AR" prompts
+    return scores['compound']  # only this one entry of the scores mapping is used
+def change_pitch(audio_data, sampling_rate, sentiment):  # Pitch-shift `audio_data` by an amount derived from `sentiment`.
+    semitones = sentiment * 2  # shift amount handed to pitch_shift is twice the sentiment value
+    return pyrubberband.pitch_shift(audio_data, sampling_rate, semitones)  # returns pyrubberband's result unchanged
 @spaces.GPU(duration=0)
 def predict(
         if normalize_text and language == "vi":
             prompt = normalize_vietnamese_text(prompt)
+        sentiment = analyze_sentiment(prompt)
+        temperature = 0.75 + sentiment * 0.2
+        temperature = max(0.5, min(temperature, 1.0))
         t0 = time.time()
         out = MODEL.inference(
             prompt,
             gpt_cond_latent,
             speaker_embedding,
             repetition_penalty=5.0,
+            temperature=temperature,
             enable_text_splitting=True,
         )
         inference_time = time.time() - t0
         keep_len = calculate_keep_len(prompt, language)
         out["wav"] = out["wav"][:keep_len]
+        audio_data = np.array(out["wav"])
+        modified_audio = change_pitch(audio_data, 24000, sentiment)
+        torchaudio.save("output.wav", torch.tensor(modified_audio).unsqueeze(0), 24000)
     except RuntimeError as e:
         if "device-side assert" in str(e):
             language_gr = gr.Dropdown(
                 label="Idioma",
                 choices=[
+                    "es-AR",
                     "vi",
                     "en",
                     "es",
                     "hi",
                 ],
                 max_choices=1,
+                value="es-AR",
             )
             normalize_text = gr.Checkbox(
+                label="Normalizar texto en vietnamita",
                 info="Solo aplicable al idioma vietnamita",
                 value=True,
             )
             ref_gr = gr.Audio(
+                label="Audio de referencia (opcional)",
                 type="filepath",
                 value="model/samples/nu-luu-loat.wav",
             )