talk-to-ultravox-0.5

Paused

App Files Files Community

talk-to-ultravox-0.5 / app.py

Steveeeeeeen HF Staff

Update app.py

0bd7b17 verified 9 months ago

raw

history blame contribute delete

4.7 kB

	import gradio as gr
	from gradio_webrtc import WebRTC, ReplyOnPause, AdditionalOutputs
	import transformers
	import numpy as np
	from twilio.rest import Client
	import os
	import torch
	import librosa

	# Pick the inference device once. Use CUDA when torch reports it as available;
	# otherwise fall back to the CPU so importing this module does not fail on a
	# host without a GPU.
	_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

	# Ultravox pipeline: called below with audio and/or chat turns to produce the
	# assistant reply. trust_remote_code=True permits transformers to run code
	# shipped with the model repository.
	pipe = transformers.pipeline(
	    model="fixie-ai/ultravox-v0_5-llama-3_2-1b",
	    trust_remote_code=True,
	    device=_device,
	)

	# Whisper pipeline: used below to turn the user's audio into transcript text.
	whisper = transformers.pipeline(
	    model="openai/whisper-large-v3-turbo",
	    device=_device,
	)

	# Twilio credentials are optional; both are read from the environment.
	account_sid = os.environ.get("TWILIO_ACCOUNT_SID")
	auth_token = os.environ.get("TWILIO_AUTH_TOKEN")

	# Start with no explicit WebRTC configuration and only replace it when both
	# credentials are present.
	rtc_configuration = None
	if account_sid and auth_token:
	    # Ask Twilio for a token and reuse the ICE servers it carries, restricting
	    # the ICE transport policy to "relay".
	    client = Client(account_sid, auth_token)
	    token = client.tokens.create()
	    rtc_configuration = {
	        "iceServers": token.ice_servers,
	        "iceTransportPolicy": "relay",
	    }


	def transcribe(audio: tuple[int, np.ndarray], transformers_chat: list[dict], conversation: list[dict]):
	    """Answer one spoken user turn and record it in both histories.

	    Args:
	        audio: ``(sample_rate, samples)``; ``audio[0]`` is used as the source
	            sampling rate and ``audio[1]`` as the samples to resample.
	        transformers_chat: Turn history handed to the Ultravox pipeline.
	            Extended in place with the new user and assistant turns.
	        conversation: Turn history shown in the transcript. Extended in place
	            with the same two turns.

	    Yields:
	        A single ``AdditionalOutputs(transformers_chat, conversation)``.
	    """
	    source_rate = audio[0]
	    model_rate = 16000

	    # Cast to float32, scale by 1/32768 and resample to the rate passed to both
	    # models. NOTE(review): the 32768 divisor presumably assumes int16 PCM
	    # input - confirm against the caller.
	    waveform = librosa.resample(
	        audio[1].astype(np.float32) / 32768.0,
	        orig_sr=source_rate,
	        target_sr=model_rate,
	    )

	    # The pipeline receives a shallow copy of the history, not the list itself.
	    turns = list(transformers_chat)

	    # Generate the assistant reply directly from the audio.
	    reply = pipe(
	        {"audio": waveform, "turns": turns, "sampling_rate": model_rate},
	        max_new_tokens=2048,
	    )
	    # Separately transcribe the same audio so the user turn can be stored as text.
	    heard = whisper({"array": waveform.squeeze(), "sampling_rate": model_rate})

	    # Record the user turn followed by the assistant turn in each history,
	    # using fresh dicts per list so the two histories share no entries.
	    for history in (conversation, transformers_chat):
	        history.append({"role": "user", "content": heard["text"]})
	        history.append({"role": "assistant", "content": reply})

	    yield AdditionalOutputs(transformers_chat, conversation)


	def respond_text(user_text: str, transformers_chat: list[dict], conversation: list[dict]):
	    """Answer one typed user turn and record it in both histories.

	    Args:
	        user_text: Text submitted from the textbox. ``None``, an empty string
	            or whitespace-only text is ignored.
	        transformers_chat: Turn history handed to the Ultravox pipeline.
	            Extended in place with the new user and assistant turns.
	        conversation: Turn history shown in the transcript. Extended in place
	            with the same two turns.

	    Returns:
	        ``(transformers_chat, conversation)``; both are returned unchanged when
	        the input is ignored.
	    """
	    # Guard against a missing value as well as blank text, so a None input is
	    # skipped instead of raising AttributeError on .strip().
	    if not user_text or not user_text.strip():
	        return transformers_chat, conversation

	    # Append the user message from the textbox
	    conversation.append({"role": "user", "content": user_text})
	    transformers_chat.append({"role": "user", "content": user_text})

	    # Hand the pipeline a shallow copy of the history, as transcribe() does, so
	    # the pipeline call cannot alter the stored list.
	    # NOTE(review): whether the pipeline reads the "text" key is not verified
	    # here; the same message is also the last entry of "turns".
	    output = pipe({"text": user_text, "turns": list(transformers_chat)}, max_new_tokens=512)

	    conversation.append({"role": "assistant", "content": output})
	    transformers_chat.append({"role": "assistant", "content": output})
	    return transformers_chat, conversation


	# Banner markup rendered at the top of the page.
	_HEADER_HTML = """
	<h1 style='text-align: center'>
	Talk to Ultravox v0.5 Llama 3.2 1b (Powered by WebRTC ⚡️)
	</h1>
	<p style='text-align: center'>
	Once you grant access to your microphone, you can talk naturally to Ultravox.
	When you stop talking, the audio will be sent for processing.
	</p>
	<p style='text-align: center'>
	Each conversation is limited to 90 seconds. Once the time limit is up you can rejoin the conversation.
	</p>
	"""

	# System turn that seeds the pipeline-side chat history.
	_SYSTEM_TURN = {
	    "role": "system",
	    "content": "You are a friendly and helpful character. You love to answer questions for people.",
	}

	with gr.Blocks() as demo:
	    gr.HTML(_HEADER_HTML)

	    # Pipeline-side history, held in Gradio state and starting with the system turn.
	    transformers_chat = gr.State(value=[_SYSTEM_TURN])

	    # Visible transcript, placed above the inputs.
	    transcript = gr.Chatbot(label="Transcript", type="messages")

	    # Input row: typed message on the left, audio stream on the right.
	    with gr.Row():
	        with gr.Column(scale=1):
	            text_input = gr.Textbox(
	                label="Your Message",
	                placeholder="Type your message here and press Enter...",
	            )
	        with gr.Column(scale=1):
	            audio = WebRTC(
	                label="Stream",
	                rtc_configuration=rtc_configuration,
	                mode="send",
	                modality="audio",
	            )

	    # Voice path: transcribe() is wrapped in ReplyOnPause, with a 90 second
	    # time limit on the stream (matching the banner text).
	    audio.stream(
	        ReplyOnPause(transcribe),
	        inputs=[audio, transformers_chat, transcript],
	        outputs=[audio],
	        time_limit=90,
	    )
	    # Forward the two histories carried by AdditionalOutputs to the state and
	    # the transcript unchanged.
	    audio.on_additional_outputs(
	        lambda chat_state, chat_view: (chat_state, chat_view),
	        outputs=[transformers_chat, transcript],
	        queue=False,
	        show_progress="hidden",
	    )

	    # Text path: submitting the textbox runs respond_text() on the same histories.
	    text_input.submit(
	        respond_text,
	        inputs=[text_input, transformers_chat, transcript],
	        outputs=[transformers_chat, transcript],
	    )
	    # A second submit handler empties the textbox.
	    text_input.submit(lambda: "", inputs=[], outputs=[text_input])

	# Launch the app only when this file is executed as a script.
	if __name__ == "__main__":
	    demo.launch()