import gradio as gr
import torch
import numpy as np
from PIL import Image, ImageDraw
import tempfile
import os
import cv2
from transformers import pipeline
import soundfile as sf
# Initialize models
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
# Initialize TTS pipeline
try:
    tts_pipeline = pipeline(
        "text-to-speech",
        model="microsoft/speecht5_tts",
        device=device
    )
    print("TTS pipeline loaded successfully")
except Exception as e:
    print(f"Error loading TTS pipeline: {e}")
    tts_pipeline = None
# Initialize speaker embeddings for TTS
try:
    from datasets import load_dataset
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
    print("Speaker embeddings loaded successfully")
except Exception as e:
    print(f"Error loading speaker embeddings: {e}")
    speaker_embeddings = None
# Character image - create a simple colored background with basic shapes
def create_character_image():
    """Create a simple character image programmatically"""
    # Create a 400x400 image
    img = Image.new('RGB', (400, 400), color='white')
    draw = ImageDraw.Draw(img)
    # Friendly robot character
    draw.rectangle([0, 0, 400, 400], fill='#4a9eff')   # Blue background
    draw.ellipse([60, 60, 340, 340], fill='#ffffff')   # Face
    draw.ellipse([140, 140, 180, 180], fill='#333333') # Left eye
    draw.ellipse([220, 140, 260, 180], fill='#333333') # Right eye
    draw.ellipse([170, 220, 230, 250], fill='#333333') # Mouth
    return img
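# For a quick local preview of the character art (illustrative only, not part
# of the running app; the file name is arbitrary):
#   create_character_image().save("character_preview.png")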
class TalkingCharacterGenerator:
    def __init__(self):
        self.temp_dir = tempfile.mkdtemp()
    def generate_tts_audio(self, text):
        """Generate speech audio from text"""
        try:
            # Check if TTS pipeline and speaker embeddings are available
            if tts_pipeline is None:
                print("TTS pipeline not available")
                return None, 0
            if speaker_embeddings is None:
                print("Speaker embeddings not available")
                return None, 0
            # Generate speech
            speech = tts_pipeline(text, forward_params={"speaker_embeddings": speaker_embeddings})
            # Save audio to a temporary file
            audio_path = os.path.join(self.temp_dir, "speech.wav")
            # Extract the waveform and sampling rate
            audio_data = speech["audio"]
            sample_rate = speech["sampling_rate"]
            # Normalize audio (guard against division by zero on silent output)
            peak = np.max(np.abs(audio_data))
            if peak > 0:
                audio_data = audio_data / peak
            # Save as a WAV file
            sf.write(audio_path, audio_data, sample_rate)
            # Return path and duration in seconds
            return audio_path, len(audio_data) / sample_rate
        except Exception as e:
            print(f"TTS Error: {e}")
            return None, 0
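    # Illustrative smoke test for the TTS path (run manually after the module
    # loads; not part of the app, and the sample text is arbitrary):
    #   gen = TalkingCharacterGenerator()
    #   path, duration = gen.generate_tts_audio("Testing one two three")
    #   print(path, f"{duration:.2f}s")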
    def create_mouth_animation(self, duration, text):
        """Create mouth movement animation based on text and duration"""
        try:
            # Create character image programmatically
            image = create_character_image()
            image = image.resize((400, 400))  # Ensure correct size
            # Convert to numpy array
            img_array = np.array(image)
            # Animation parameters
            fps = 24
            total_frames = int(duration * fps)
            if total_frames == 0:
                total_frames = 24  # Minimum 1 second
            frames = []
            # Simple mouth animation based on text analysis
            words = text.split()
            syllables_per_word = [max(1, len(word) // 2) for word in words]
            total_syllables = sum(syllables_per_word)
            if total_syllables == 0:
                total_syllables = 1
            for frame in range(total_frames):
                # Copy the original image
                frame_img = img_array.copy()
                # Calculate mouth opening based on syllables and time
                time_ratio = frame / total_frames
                syllable_position = time_ratio * total_syllables
                # Create mouth movement (simple oscillation)
                mouth_open = abs(np.sin(syllable_position * np.pi * 2)) * 0.5 + 0.2
                # Apply mouth animation (simple oval modification)
                center_x, center_y = 200, 240  # Approximate mouth position
                mouth_width = int(30 * (1 + mouth_open))
                mouth_height = int(20 * mouth_open)
                # Clamp the mouth rectangle to the image bounds
                y_start = max(0, center_y - mouth_height // 2)
                y_end = min(400, center_y + mouth_height // 2)
                x_start = max(0, center_x - mouth_width // 2)
                x_end = min(400, center_x + mouth_width // 2)
                # Darken mouth area to simulate opening
                if y_end > y_start and x_end > x_start:
                    frame_img[y_start:y_end, x_start:x_end] = (
                        frame_img[y_start:y_end, x_start:x_end] * 0.7
                    ).astype(np.uint8)
                frames.append(frame_img)
            # Create video from frames
            video_path = os.path.join(self.temp_dir, "talking_character.mp4")
            # Use OpenCV to write the video
            fourcc = cv2.VideoWriter_fourcc(*'mp4v')
            out = cv2.VideoWriter(video_path, fourcc, fps, (400, 400))
            for frame in frames:
                # Convert RGB to BGR for OpenCV
                frame_bgr = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
                out.write(frame_bgr)
            out.release()
            return video_path
        except Exception as e:
            print(f"Animation Error: {e}")
            return None
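    # Note: cv2.VideoWriter produces a silent MP4, so the generated speech is
    # never heard in the video above. Below is a minimal muxing sketch, assuming
    # ffmpeg is available on PATH; this helper is hypothetical and is not wired
    # into the app's flow.
    def mux_audio_into_video(self, video_path, audio_path):
        """Combine the silent MP4 and the TTS WAV into one file via ffmpeg."""
        import subprocess
        output_path = os.path.join(self.temp_dir, "talking_character_with_audio.mp4")
        subprocess.run(
            ["ffmpeg", "-y", "-i", video_path, "-i", audio_path,
             "-c:v", "copy", "-c:a", "aac", "-shortest", output_path],
            check=True, capture_output=True
        )
        return output_path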
    def generate_talking_character(self, text):
        """Main function to generate talking character video"""
        if not text:
            return None, "Please provide text."
        # Generate TTS audio
        audio_path, duration = self.generate_tts_audio(text)
        if not audio_path:
            return None, "Failed to generate speech audio. Please check if TTS models are loaded properly."
        # Create mouth animation
        video_path = self.create_mouth_animation(duration, text)
        if not video_path:
            return None, "Failed to create character animation."
        return video_path, f"Successfully generated talking character video! Duration: {duration:.2f}s"
# Initialize the generator
generator = TalkingCharacterGenerator()

# Create Gradio interface
def create_talking_character(text):
    """Gradio interface function"""
    try:
        video_path, message = generator.generate_talking_character(text)
        if video_path and os.path.exists(video_path):
            return video_path, message
        else:
            return None, message
    except Exception as e:
        return None, f"Error: {str(e)}"
# Create the Gradio app
with gr.Blocks(title="Talking Character Generator", theme=gr.themes.Soft()) as app:
    gr.Markdown("# 🎭 Talking Character Generator")
    gr.Markdown("Generate videos of a friendly robot character speaking your text with mouth movements!")
    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Enter your text",
                placeholder="Type what you want the character to say...",
                lines=3,
                max_lines=10
            )
            generate_btn = gr.Button("Generate Talking Character", variant="primary")
        with gr.Column():
            video_output = gr.Video(label="Generated Talking Character")
            status_output = gr.Textbox(label="Status", interactive=False)
    # Event handlers
    generate_btn.click(
        fn=create_talking_character,
        inputs=[text_input],
        outputs=[video_output, status_output]
    )
    # Examples
    gr.Examples(
        examples=[
            ["Hello! Welcome to the talking character generator. I'm excited to speak your text!"],
            ["Hi there! I'm a friendly robot and I love to chat with you!"],
            ["Beep boop! I'm ready to speak your words with animated mouth movements!"]
        ],
        inputs=[text_input]
    )
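# Since each generation runs TTS plus frame rendering, enabling Gradio's request
# queue before launch is a common pattern (a suggestion, not in the source):
#   app.queue()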
if __name__ == "__main__":
    app.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True  # Note: share links are ignored when running on Hugging Face Spaces
    )
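# Suggested requirements.txt for this Space, inferred from the imports above
# (the list itself is an assumption, not taken from the source; sentencepiece
# is included because the SpeechT5 tokenizer depends on it):
#   gradio
#   torch
#   numpy
#   Pillow
#   opencv-python-headless
#   transformers
#   soundfile
#   datasets
#   sentencepiece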