|
|
import gradio as gr |
|
|
from gradio_webrtc import WebRTC, ReplyOnPause, AdditionalOutputs |
|
|
import transformers |
|
|
import numpy as np |
|
|
from twilio.rest import Client |
|
|
import os |
|
|
import torch |
|
|
import librosa |
|
|
|
|
|
# Pick the inference device once so both pipelines agree. Use the GPU when one
# is visible; otherwise fall back to CPU rather than failing at import time on
# a host without CUDA.
# NOTE(review): CPU inference with these models is expected to be slow --
# confirm it is acceptable for your deployment.
_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Chat model that receives the audio (or text) plus the turn history and
# produces the assistant reply. trust_remote_code=True lets the model
# repository's own Python code run while the pipeline is loaded.
pipe = transformers.pipeline(
    model="fixie-ai/ultravox-v0_5-llama-3_2-1b",
    trust_remote_code=True,
    device=_device,
)

# Speech-to-text model; its "text" output is what gets stored as the user's
# message in the transcript and chat history.
whisper = transformers.pipeline(
    model="openai/whisper-large-v3-turbo",
    device=_device,
)
|
|
|
|
|
# WebRTC ICE configuration. It stays None unless both Twilio credentials are
# present in the environment, in which case ICE servers are requested from
# Twilio and the connection is restricted to relayed candidates.
rtc_configuration = None

account_sid = os.getenv("TWILIO_ACCOUNT_SID")
auth_token = os.getenv("TWILIO_AUTH_TOKEN")

if account_sid and auth_token:
    client = Client(account_sid, auth_token)
    # Requests a token from Twilio; its ice_servers attribute is forwarded
    # unchanged to the WebRTC component.
    token = client.tokens.create()
    rtc_configuration = {
        "iceServers": token.ice_servers,
        "iceTransportPolicy": "relay",
    }
|
|
|
|
|
|
|
|
def transcribe(audio: tuple[int, np.ndarray], transformers_chat: list[dict], conversation: list[dict]):
    """Answer one spoken utterance and record the exchange in both histories.

    Args:
        audio: ``(sample_rate, samples)`` pair. The samples are cast to float32
            and divided by 32768.0, so they are presumably int16 PCM -- confirm
            against the WebRTC component.
        transformers_chat: Turn list handed to the chat model; extended in
            place with the new user and assistant turns.
        conversation: Message list shown in the transcript; extended in place
            with the same two turns.

    Yields:
        A single ``AdditionalOutputs`` carrying ``transformers_chat`` and
        ``conversation`` after they have been extended.
    """
    source_rate = audio[0]
    model_rate = 16000  # rate the audio is resampled to before both model calls

    scaled = audio[1].astype(np.float32) / 32768.0
    waveform = librosa.resample(scaled, orig_sr=source_rate, target_sr=model_rate)

    # The chat model gets a shallow copy of the turn list, not the session
    # history object itself.
    turns_snapshot = list(transformers_chat)
    reply = pipe(
        {"audio": waveform, "turns": turns_snapshot, "sampling_rate": model_rate},
        max_new_tokens=2048,
    )

    # The reply is generated from the audio directly; Whisper is run only to
    # obtain text for the user's side of the transcript.
    heard = whisper({"array": waveform.squeeze(), "sampling_rate": model_rate})["text"]

    # Each history receives its own dict objects, user turn first.
    for history in (conversation, transformers_chat):
        history.append({"role": "user", "content": heard})
        history.append({"role": "assistant", "content": reply})

    yield AdditionalOutputs(transformers_chat, conversation)
|
|
|
|
|
|
|
|
def respond_text(user_text: str, transformers_chat: list[dict], conversation: list[dict]):
    """Answer a typed message and record the exchange in both histories.

    Args:
        user_text: Text submitted by the user. Empty or whitespace-only input
            is ignored.
        transformers_chat: Turn list handed to the chat model; extended in
            place.
        conversation: Message list shown in the transcript; extended in place.

    Returns:
        The tuple ``(transformers_chat, conversation)`` -- the same list
        objects that were passed in, unchanged when the input was blank.
    """
    if user_text.strip():
        histories = (conversation, transformers_chat)

        # The user turn is recorded before the model call, so the turn list
        # passed to the model already ends with this message.
        for history in histories:
            history.append({"role": "user", "content": user_text})

        reply = pipe({"text": user_text, "turns": transformers_chat}, max_new_tokens=512)

        for history in histories:
            history.append({"role": "assistant", "content": reply})

    return transformers_chat, conversation
|
|
|
|
|
|
|
|
with gr.Blocks() as demo:
    # Page header: title plus usage notes. The 90 second figure in the text
    # matches the time_limit given to audio.stream below.
    gr.HTML(
        """
    <h1 style='text-align: center'>
    Talk to Ultravox v0.5 Llama 3.2 1b (Powered by WebRTC ⚡️)
    </h1>
    <p style='text-align: center'>
    Once you grant access to your microphone, you can talk naturally to Ultravox.
    When you stop talking, the audio will be sent for processing.
    </p>
    <p style='text-align: center'>
    Each conversation is limited to 90 seconds. Once the time limit is up you can rejoin the conversation.
    </p>
    """
    )

    # Turn list handed to the chat model, seeded with the system prompt.
    _system_turn = {
        "role": "system",
        "content": "You are a friendly and helpful character. You love to answer questions for people.",
    }
    transformers_chat = gr.State(value=[_system_turn])

    # Visible chat log in "messages" format (role/content dicts).
    transcript = gr.Chatbot(label="Transcript", type="messages")

    # Two equal-width columns: typed input on the left, audio stream on the right.
    with gr.Row():
        with gr.Column(scale=1):
            text_input = gr.Textbox(
                placeholder="Type your message here and press Enter...",
                label="Your Message",
            )
        with gr.Column(scale=1):
            audio = WebRTC(
                rtc_configuration=rtc_configuration,
                label="Stream",
                mode="send",
                modality="audio",
            )

    # Voice path: transcribe is wrapped in ReplyOnPause and receives the audio
    # together with both histories; each stream is capped at 90 seconds.
    audio.stream(
        ReplyOnPause(transcribe),
        inputs=[audio, transformers_chat, transcript],
        outputs=[audio],
        time_limit=90,
    )

    # Pass the two received values straight through to the state and the
    # chatbot -- presumably the payload of the AdditionalOutputs yielded by
    # transcribe; confirm against the WebRTC component.
    audio.on_additional_outputs(
        lambda t, g: (t, g),
        outputs=[transformers_chat, transcript],
        queue=False,
        show_progress="hidden",
    )

    # Text path: pressing Enter runs respond_text on the same two histories.
    text_input.submit(
        respond_text,
        inputs=[text_input, transformers_chat, transcript],
        outputs=[transformers_chat, transcript],
    )

    # A second, independent submit listener empties the textbox.
    text_input.submit(lambda: "", inputs=[], outputs=[text_input])

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()
|
|
|