import gradio as gr
import torch
import numpy as np
from PIL import Image, ImageDraw
import tempfile
import os
import subprocess  # used to mux the TTS audio into the video (assumes an ffmpeg binary on PATH)
import cv2
from transformers import pipeline
import soundfile as sf

# Initialize models
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Initialize TTS pipeline
try:
    tts_pipeline = pipeline(
        "text-to-speech",
        model="microsoft/speecht5_tts",
        device=device
    )
    print("TTS pipeline loaded successfully")
except Exception as e:
    print(f"Error loading TTS pipeline: {e}")
    tts_pipeline = None

# Initialize speaker embeddings for TTS
try:
    from datasets import load_dataset
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
    print("Speaker embeddings loaded successfully")
except Exception as e:
    print(f"Error loading speaker embeddings: {e}")
    speaker_embeddings = None

# Character image - create a simple colored background with basic shapes
def create_character_image():
    """Create a simple character image programmatically."""
    # Create a 400x400 image
    img = Image.new('RGB', (400, 400), color='white')
    draw = ImageDraw.Draw(img)

    # Friendly robot character
    draw.rectangle([0, 0, 400, 400], fill='#4a9eff')    # Blue background
    draw.ellipse([60, 60, 340, 340], fill='#ffffff')    # Face
    draw.ellipse([140, 140, 180, 180], fill='#333333')  # Left eye
    draw.ellipse([220, 140, 260, 180], fill='#333333')  # Right eye
    draw.ellipse([170, 220, 230, 250], fill='#333333')  # Mouth

    return img

class TalkingCharacterGenerator:
    def __init__(self):
        self.temp_dir = tempfile.mkdtemp()

    def generate_tts_audio(self, text):
        """Generate speech audio from text."""
        try:
            # Check if the TTS pipeline and speaker embeddings are available
            if tts_pipeline is None:
                print("TTS pipeline not available")
                return None, 0
            if speaker_embeddings is None:
                print("Speaker embeddings not available")
                return None, 0

            # Generate speech
            speech = tts_pipeline(text, forward_params={"speaker_embeddings": speaker_embeddings})

            # Save audio to a temporary file
            audio_path = os.path.join(self.temp_dir, "speech.wav")

            # Convert to a numpy array and save as WAV
            audio_data = speech["audio"]
            sample_rate = speech["sampling_rate"]

            # Normalize audio (guard against division by zero on an all-silent clip)
            peak = np.max(np.abs(audio_data))
            if peak > 0:
                audio_data = audio_data / peak

            # Save as a WAV file
            sf.write(audio_path, audio_data, sample_rate)

            return audio_path, len(audio_data) / sample_rate  # Return path and duration

        except Exception as e:
            print(f"TTS Error: {e}")
            return None, 0

    def create_mouth_animation(self, duration, text):
        """Create mouth movement animation based on text and duration."""
        try:
            # Create the character image programmatically
            image = create_character_image()
            image = image.resize((400, 400))  # Ensure correct size

            # Convert to a numpy array
            img_array = np.array(image)

            # Animation parameters
            fps = 24
            total_frames = int(duration * fps)
            if total_frames == 0:
                total_frames = 24  # Minimum 1 second

            frames = []

            # Simple mouth animation based on text analysis
            words = text.split()
            syllables_per_word = [max(1, len(word) // 2) for word in words]
            total_syllables = sum(syllables_per_word)
            if total_syllables == 0:
                total_syllables = 1

            for frame in range(total_frames):
                # Copy the original image
                frame_img = img_array.copy()

                # Calculate mouth opening based on syllables and time
                time_ratio = frame / total_frames
                syllable_position = time_ratio * total_syllables

                # Create mouth movement (simple animation)
                mouth_open = abs(np.sin(syllable_position * np.pi * 2)) * 0.5 + 0.2

                # Apply mouth animation (simple oval modification)
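                # NOTE: the coordinates below match the mouth ellipse drawn in
                # create_character_image(); darkening a rectangle around it is a
                # crude stand-in for lip sync, not a viseme-driven mouth model.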
                center_x, center_y = 200, 240  # Approximate mouth position
                mouth_width = int(30 * (1 + mouth_open))
                mouth_height = int(20 * mouth_open)

                # Draw mouth (simple approach)
                y_start = max(0, center_y - mouth_height // 2)
                y_end = min(400, center_y + mouth_height // 2)
                x_start = max(0, center_x - mouth_width // 2)
                x_end = min(400, center_x + mouth_width // 2)

                # Darken mouth area to simulate opening
                if y_end > y_start and x_end > x_start:
                    frame_img[y_start:y_end, x_start:x_end] = (
                        frame_img[y_start:y_end, x_start:x_end] * 0.7
                    ).astype(np.uint8)

                frames.append(frame_img)

            # Create video from frames
            video_path = os.path.join(self.temp_dir, "talking_character.mp4")

            # Use OpenCV to create the video
            fourcc = cv2.VideoWriter_fourcc(*'mp4v')
            out = cv2.VideoWriter(video_path, fourcc, fps, (400, 400))

            for frame in frames:
                # Convert RGB to BGR for OpenCV
                frame_bgr = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
                out.write(frame_bgr)

            out.release()
            return video_path

        except Exception as e:
            print(f"Animation Error: {e}")
            return None

    def mux_audio_video(self, video_path, audio_path):
        """Mux the TTS audio into the silent OpenCV video.

        Best-effort step: assumes an ffmpeg binary is on PATH and falls back
        to the silent video if the call fails. Re-encoding to H.264 also makes
        the clip playable inline in most browsers, unlike raw mp4v.
        """
        muxed_path = os.path.join(self.temp_dir, "talking_character_audio.mp4")
        try:
            subprocess.run(
                [
                    "ffmpeg", "-y",
                    "-i", video_path,
                    "-i", audio_path,
                    "-c:v", "libx264", "-pix_fmt", "yuv420p",  # browser-friendly video
                    "-c:a", "aac",
                    "-shortest",
                    muxed_path,
                ],
                check=True,
                capture_output=True,
            )
            return muxed_path
        except Exception as e:
            print(f"Mux Error (falling back to silent video): {e}")
            return video_path

    def generate_talking_character(self, text):
        """Main function to generate the talking character video."""
        if not text or not text.strip():
            return None, "Please provide text."

        # Generate TTS audio
        audio_path, duration = self.generate_tts_audio(text)
        if not audio_path:
            return None, "Failed to generate speech audio. Please check if TTS models are loaded properly."

        # Create mouth animation
        video_path = self.create_mouth_animation(duration, text)
        if not video_path:
            return None, "Failed to create character animation."

        # Combine the speech track with the silent animation (best effort)
        video_path = self.mux_audio_video(video_path, audio_path)

        return video_path, f"Successfully generated talking character video! Duration: {duration:.2f}s"

# Initialize the generator
generator = TalkingCharacterGenerator()

# Create Gradio interface
def create_talking_character(text):
    """Gradio interface function."""
    try:
        video_path, message = generator.generate_talking_character(text)
        if video_path and os.path.exists(video_path):
            return video_path, message
        else:
            return None, message
    except Exception as e:
        return None, f"Error: {str(e)}"

# Create the Gradio app
with gr.Blocks(title="Talking Character Generator", theme=gr.themes.Soft()) as app:
    gr.Markdown("# 🎭 Talking Character Generator")
    gr.Markdown("Generate videos of a friendly robot character speaking your text with mouth movements!")

    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Enter your text",
                placeholder="Type what you want the character to say...",
                lines=3,
                max_lines=10
            )
            generate_btn = gr.Button("Generate Talking Character", variant="primary")

        with gr.Column():
            video_output = gr.Video(label="Generated Talking Character")
            status_output = gr.Textbox(label="Status", interactive=False)

    # Event handlers
    generate_btn.click(
        fn=create_talking_character,
        inputs=[text_input],
        outputs=[video_output, status_output]
    )

    # Examples
    gr.Examples(
        examples=[
            ["Hello! Welcome to the talking character generator. I'm excited to speak your text!"],
            ["Hi there! I'm a friendly robot and I love to chat with you!"],
            ["Beep boop! I'm ready to speak your words with animated mouth movements!"]
        ],
        inputs=[text_input]
    )

if __name__ == "__main__":
    app.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True
    )
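# Notes:
# - mux_audio_video assumes an ffmpeg binary on PATH; without it the app falls
#   back to the silent mp4v video, which some browsers will not play inline.
# - Rough dependency sketch (usual PyPI names, unpinned):
#   pip install gradio torch transformers datasets soundfile opencv-python sentencepiece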