# image_to_image/app.py
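"""Gradio Space that turns text into a short talking-robot video.

Pipeline: SpeechT5 text-to-speech -> procedurally drawn robot face ->
per-frame mouth animation rendered to MP4 with OpenCV.
"""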
import gradio as gr
import torch
import numpy as np
from PIL import Image, ImageDraw
import tempfile
import os
import cv2
from transformers import pipeline
import soundfile as sf
# Initialize models
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Initialize TTS pipeline
try:
    tts_pipeline = pipeline(
        "text-to-speech",
        model="microsoft/speecht5_tts",
        device=device
    )
    print("TTS pipeline loaded successfully")
except Exception as e:
    print(f"Error loading TTS pipeline: {e}")
    tts_pipeline = None

# Initialize speaker embeddings for TTS
try:
    from datasets import load_dataset
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
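    # Index 7306 is the x-vector used in the SpeechT5 docs examples
    # (a US English female voice from the CMU ARCTIC corpus)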
    speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
    print("Speaker embeddings loaded successfully")
except Exception as e:
    print(f"Error loading speaker embeddings: {e}")
    speaker_embeddings = None

# Character image - create simple colored background with basic shapes
def create_character_image():
    """Create a simple character image programmatically"""
    # Create a 400x400 image
    img = Image.new('RGB', (400, 400), color='white')
    draw = ImageDraw.Draw(img)

    # Friendly Robot character
    draw.rectangle([0, 0, 400, 400], fill='#4a9eff')   # Blue background
    draw.ellipse([60, 60, 340, 340], fill='#ffffff')   # Face
    draw.ellipse([140, 140, 180, 180], fill='#333333')  # Left eye
    draw.ellipse([220, 140, 260, 180], fill='#333333')  # Right eye
    draw.ellipse([170, 220, 230, 250], fill='#333333')  # Mouth
    return img

class TalkingCharacterGenerator:
    def __init__(self):
        self.temp_dir = tempfile.mkdtemp()

    def generate_tts_audio(self, text):
        """Generate speech audio from text"""
        try:
            # Check if TTS pipeline and speaker embeddings are available
            if tts_pipeline is None:
                print("TTS pipeline not available")
                return None, 0
            if speaker_embeddings is None:
                print("Speaker embeddings not available")
                return None, 0

            # Generate speech
            speech = tts_pipeline(text, forward_params={"speaker_embeddings": speaker_embeddings})

            # Save audio to temporary file
            audio_path = os.path.join(self.temp_dir, "speech.wav")

            # Convert to numpy array and save as WAV
            audio_data = speech["audio"]
            sample_rate = speech["sampling_rate"]

            # Normalize audio (guard against division by zero on silent output)
            peak = np.max(np.abs(audio_data))
            if peak > 0:
                audio_data = audio_data / peak

            # Save as WAV file
            sf.write(audio_path, audio_data, sample_rate)

            return audio_path, len(audio_data) / sample_rate  # Return path and duration
        except Exception as e:
            print(f"TTS Error: {e}")
            return None, 0

    def create_mouth_animation(self, duration, text):
        """Create mouth movement animation based on text and duration"""
        try:
            # Create character image programmatically
            image = create_character_image()
            image = image.resize((400, 400))  # Ensure correct size

            # Convert to numpy array
            img_array = np.array(image)

            # Animation parameters
            fps = 24
            total_frames = int(duration * fps)
            if total_frames == 0:
                total_frames = 24  # Minimum 1 second

            frames = []

            # Simple mouth animation based on text analysis
            words = text.split()
            syllables_per_word = [max(1, len(word) // 2) for word in words]
            total_syllables = sum(syllables_per_word)
            if total_syllables == 0:
                total_syllables = 1

            for frame in range(total_frames):
                # Copy the original image
                frame_img = img_array.copy()

                # Calculate mouth opening based on syllables and time
                time_ratio = frame / total_frames
                syllable_position = time_ratio * total_syllables

                # Create mouth movement (simple animation)
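                # abs(sin) peaks twice per 2*pi cycle, so the mouth opens and
                # closes roughly twice per syllable; the 0.2 floor keeps it
                # from vanishing entirely between syllables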
                mouth_open = abs(np.sin(syllable_position * np.pi * 2)) * 0.5 + 0.2

                # Apply mouth animation (simple oval modification)
                center_x, center_y = 200, 240  # Approximate mouth position
                mouth_width = int(30 * (1 + mouth_open))
                mouth_height = int(20 * mouth_open)

                # Draw mouth (simple approach)
                y_start = max(0, center_y - mouth_height // 2)
                y_end = min(400, center_y + mouth_height // 2)
                x_start = max(0, center_x - mouth_width // 2)
                x_end = min(400, center_x + mouth_width // 2)

                # Darken mouth area to simulate opening
                if y_end > y_start and x_end > x_start:
                    frame_img[y_start:y_end, x_start:x_end] = (frame_img[y_start:y_end, x_start:x_end] * 0.7).astype(np.uint8)

                frames.append(frame_img)

            # Create video from frames
            video_path = os.path.join(self.temp_dir, "talking_character.mp4")

            # Use OpenCV to create video
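            # Caveat: many browsers can't decode 'mp4v' in an HTML5 player;
            # mux_audio_video below re-encodes to H.264 for playback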
            fourcc = cv2.VideoWriter_fourcc(*'mp4v')
            out = cv2.VideoWriter(video_path, fourcc, fps, (400, 400))

            for frame in frames:
                # Convert RGB to BGR for OpenCV
                frame_bgr = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
                out.write(frame_bgr)

            out.release()
            return video_path
        except Exception as e:
            print(f"Animation Error: {e}")
            return None
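
    def mux_audio_video(self, video_path, audio_path):
        """Mux the TTS speech into the silent OpenCV video (added helper).

        A minimal sketch, not part of the original flow: it shells out to
        the ffmpeg binary (assumed to be on PATH, as on typical Spaces
        images) and re-encodes to H.264 so browsers can play the result.
        If ffmpeg is missing or fails, the silent video is returned as-is.
        """
        import subprocess  # local import to keep the sketch self-contained

        output_path = os.path.join(self.temp_dir, "talking_character_av.mp4")
        cmd = [
            "ffmpeg", "-y",
            "-i", video_path,  # silent video from create_mouth_animation
            "-i", audio_path,  # speech track from generate_tts_audio
            "-c:v", "libx264", "-pix_fmt", "yuv420p",  # browser-friendly H.264
            "-c:a", "aac",
            "-shortest",  # stop at the shorter of the two streams
            output_path,
        ]
        try:
            subprocess.run(cmd, check=True, capture_output=True)
            return output_path
        except (subprocess.CalledProcessError, FileNotFoundError) as e:
            print(f"Muxing error (keeping silent video): {e}")
            return video_path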

    def generate_talking_character(self, text):
        """Main function to generate talking character video"""
        if not text:
            return None, "Please provide text."

        # Generate TTS audio
        audio_path, duration = self.generate_tts_audio(text)
        if not audio_path:
            return None, "Failed to generate speech audio. Please check if TTS models are loaded properly."

        # Create mouth animation
        video_path = self.create_mouth_animation(duration, text)
        if not video_path:
            return None, "Failed to create character animation."
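
        # Mux the speech audio into the video (see mux_audio_video above);
        # falls back to the silent video if ffmpeg is unavailable
        video_path = self.mux_audio_video(video_path, audio_path)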

        return video_path, f"Successfully generated talking character video! Duration: {duration:.2f}s"
# Initialize the generator
generator = TalkingCharacterGenerator()

# Create Gradio interface
def create_talking_character(text):
    """Gradio interface function"""
    try:
        video_path, message = generator.generate_talking_character(text)
        if video_path and os.path.exists(video_path):
            return video_path, message
        else:
            return None, message
    except Exception as e:
        return None, f"Error: {str(e)}"

# Create the Gradio app
with gr.Blocks(title="Talking Character Generator", theme=gr.themes.Soft()) as app:
    gr.Markdown("# 🎭 Talking Character Generator")
    gr.Markdown("Generate videos of a friendly robot character speaking your text with mouth movements!")

    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Enter your text",
                placeholder="Type what you want the character to say...",
                lines=3,
                max_lines=10
            )
            generate_btn = gr.Button("Generate Talking Character", variant="primary")
        with gr.Column():
            video_output = gr.Video(label="Generated Talking Character")
            status_output = gr.Textbox(label="Status", interactive=False)

    # Event handlers
    generate_btn.click(
        fn=create_talking_character,
        inputs=[text_input],
        outputs=[video_output, status_output]
    )

    # Examples
    gr.Examples(
        examples=[
            ["Hello! Welcome to the talking character generator. I'm excited to speak your text!"],
            ["Hi there! I'm a friendly robot and I love to chat with you!"],
            ["Beep boop! I'm ready to speak your words with animated mouth movements!"]
        ],
        inputs=[text_input]
    )

if __name__ == "__main__":
    app.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True
    )