# image_to_image/app.py
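"""Gradio Space that turns text into a short talking-robot video.

Pipeline: SpeechT5 text-to-speech -> procedurally drawn robot face ->
per-frame mouth animation rendered to MP4 with OpenCV.
"""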
import gradio as gr
import torch
import numpy as np
from PIL import Image, ImageDraw
import tempfile
import os
import cv2
from transformers import pipeline
import soundfile as sf
# Initialize models
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Initialize TTS pipeline
try:
    tts_pipeline = pipeline(
        "text-to-speech",
        model="microsoft/speecht5_tts",
        device=device
    )
    print("TTS pipeline loaded successfully")
except Exception as e:
    print(f"Error loading TTS pipeline: {e}")
    tts_pipeline = None

# Initialize speaker embeddings for TTS
try:
    from datasets import load_dataset
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
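    # Index 7306 is the x-vector used in the SpeechT5 docs examples
    # (a US English female voice from the CMU ARCTIC corpus)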
    speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
    print("Speaker embeddings loaded successfully")
except Exception as e:
    print(f"Error loading speaker embeddings: {e}")
    speaker_embeddings = None

# Character image - create simple colored background with basic shapes
def create_character_image():
    """Create a simple character image programmatically"""
    # Create a 400x400 image
    img = Image.new('RGB', (400, 400), color='white')
    draw = ImageDraw.Draw(img)

    # Friendly Robot character
    draw.rectangle([0, 0, 400, 400], fill='#4a9eff')   # Blue background
    draw.ellipse([60, 60, 340, 340], fill='#ffffff')   # Face
    draw.ellipse([140, 140, 180, 180], fill='#333333')  # Left eye
    draw.ellipse([220, 140, 260, 180], fill='#333333')  # Right eye
    draw.ellipse([170, 220, 230, 250], fill='#333333')  # Mouth
    return img

class TalkingCharacterGenerator:
    def __init__(self):
        self.temp_dir = tempfile.mkdtemp()

    def generate_tts_audio(self, text):
        """Generate speech audio from text"""
        try:
            # Check if TTS pipeline and speaker embeddings are available
            if tts_pipeline is None:
                print("TTS pipeline not available")
                return None, 0
            if speaker_embeddings is None:
                print("Speaker embeddings not available")
                return None, 0

            # Generate speech
            speech = tts_pipeline(text, forward_params={"speaker_embeddings": speaker_embeddings})

            # Save audio to temporary file
            audio_path = os.path.join(self.temp_dir, "speech.wav")

            # Convert to numpy array and save as WAV
            audio_data = speech["audio"]
            sample_rate = speech["sampling_rate"]

            # Normalize audio (guard against division by zero on silent output)
            peak = np.max(np.abs(audio_data))
            if peak > 0:
                audio_data = audio_data / peak

            # Save as WAV file
            sf.write(audio_path, audio_data, sample_rate)

            return audio_path, len(audio_data) / sample_rate  # Return path and duration
        except Exception as e:
            print(f"TTS Error: {e}")
            return None, 0

    def create_mouth_animation(self, duration, text):
        """Create mouth movement animation based on text and duration"""
        try:
            # Create character image programmatically
            image = create_character_image()
            image = image.resize((400, 400))  # Ensure correct size

            # Convert to numpy array
            img_array = np.array(image)

            # Animation parameters
            fps = 24
            total_frames = int(duration * fps)
            if total_frames == 0:
                total_frames = 24  # Minimum 1 second

            frames = []

            # Simple mouth animation based on text analysis
            words = text.split()
            syllables_per_word = [max(1, len(word) // 2) for word in words]
            total_syllables = sum(syllables_per_word)
            if total_syllables == 0:
                total_syllables = 1

            for frame in range(total_frames):
                # Copy the original image
                frame_img = img_array.copy()

                # Calculate mouth opening based on syllables and time
                time_ratio = frame / total_frames
                syllable_position = time_ratio * total_syllables

                # Create mouth movement (simple animation)
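                # abs(sin) peaks twice per 2*pi cycle, so the mouth opens and
                # closes roughly twice per syllable; the 0.2 floor keeps it
                # from vanishing entirely between syllables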
                mouth_open = abs(np.sin(syllable_position * np.pi * 2)) * 0.5 + 0.2

                # Apply mouth animation (simple oval modification)
                center_x, center_y = 200, 240  # Approximate mouth position
                mouth_width = int(30 * (1 + mouth_open))
                mouth_height = int(20 * mouth_open)

                # Draw mouth (simple approach)
                y_start = max(0, center_y - mouth_height // 2)
                y_end = min(400, center_y + mouth_height // 2)
                x_start = max(0, center_x - mouth_width // 2)
                x_end = min(400, center_x + mouth_width // 2)

                # Darken mouth area to simulate opening
                if y_end > y_start and x_end > x_start:
                    frame_img[y_start:y_end, x_start:x_end] = (frame_img[y_start:y_end, x_start:x_end] * 0.7).astype(np.uint8)

                frames.append(frame_img)

            # Create video from frames
            video_path = os.path.join(self.temp_dir, "talking_character.mp4")

            # Use OpenCV to create video
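            # Caveat: many browsers can't decode 'mp4v' in an HTML5 player;
            # mux_audio_video below re-encodes to H.264 for playback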
            fourcc = cv2.VideoWriter_fourcc(*'mp4v')
            out = cv2.VideoWriter(video_path, fourcc, fps, (400, 400))

            for frame in frames:
                # Convert RGB to BGR for OpenCV
                frame_bgr = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
                out.write(frame_bgr)

            out.release()
            return video_path
        except Exception as e:
            print(f"Animation Error: {e}")
            return None
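
    def mux_audio_video(self, video_path, audio_path):
        """Mux the TTS speech into the silent OpenCV video (added helper).

        A minimal sketch, not part of the original flow: it shells out to
        the ffmpeg binary (assumed to be on PATH, as on typical Spaces
        images) and re-encodes to H.264 so browsers can play the result.
        If ffmpeg is missing or fails, the silent video is returned as-is.
        """
        import subprocess  # local import to keep the sketch self-contained

        output_path = os.path.join(self.temp_dir, "talking_character_av.mp4")
        cmd = [
            "ffmpeg", "-y",
            "-i", video_path,  # silent video from create_mouth_animation
            "-i", audio_path,  # speech track from generate_tts_audio
            "-c:v", "libx264", "-pix_fmt", "yuv420p",  # browser-friendly H.264
            "-c:a", "aac",
            "-shortest",  # stop at the shorter of the two streams
            output_path,
        ]
        try:
            subprocess.run(cmd, check=True, capture_output=True)
            return output_path
        except (subprocess.CalledProcessError, FileNotFoundError) as e:
            print(f"Muxing error (keeping silent video): {e}")
            return video_path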

    def generate_talking_character(self, text):
        """Main function to generate talking character video"""
        if not text:
            return None, "Please provide text."

        # Generate TTS audio
        audio_path, duration = self.generate_tts_audio(text)
        if not audio_path:
            return None, "Failed to generate speech audio. Please check if TTS models are loaded properly."

        # Create mouth animation
        video_path = self.create_mouth_animation(duration, text)
        if not video_path:
            return None, "Failed to create character animation."
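
        # Mux the speech audio into the video (see mux_audio_video above);
        # falls back to the silent video if ffmpeg is unavailable
        video_path = self.mux_audio_video(video_path, audio_path)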

        return video_path, f"Successfully generated talking character video! Duration: {duration:.2f}s"
# Initialize the generator
generator = TalkingCharacterGenerator()

# Create Gradio interface
def create_talking_character(text):
    """Gradio interface function"""
    try:
        video_path, message = generator.generate_talking_character(text)
        if video_path and os.path.exists(video_path):
            return video_path, message
        else:
            return None, message
    except Exception as e:
        return None, f"Error: {str(e)}"

# Create the Gradio app
with gr.Blocks(title="Talking Character Generator", theme=gr.themes.Soft()) as app:
    gr.Markdown("# 🎭 Talking Character Generator")
    gr.Markdown("Generate videos of a friendly robot character speaking your text with mouth movements!")

    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Enter your text",
                placeholder="Type what you want the character to say...",
                lines=3,
                max_lines=10
            )
            generate_btn = gr.Button("Generate Talking Character", variant="primary")
        with gr.Column():
            video_output = gr.Video(label="Generated Talking Character")
            status_output = gr.Textbox(label="Status", interactive=False)

    # Event handlers
    generate_btn.click(
        fn=create_talking_character,
        inputs=[text_input],
        outputs=[video_output, status_output]
    )

    # Examples
    gr.Examples(
        examples=[
            ["Hello! Welcome to the talking character generator. I'm excited to speak your text!"],
            ["Hi there! I'm a friendly robot and I love to chat with you!"],
            ["Beep boop! I'm ready to speak your words with animated mouth movements!"]
        ],
        inputs=[text_input]
    )

if __name__ == "__main__":
    app.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True
    )