import gradio as gr
import torch
import numpy as np
from PIL import Image, ImageDraw
import tempfile
import os
import subprocess  # used to mux the TTS audio into the video (assumes an ffmpeg binary on PATH)
import cv2
from transformers import pipeline
import soundfile as sf

# Initialize models
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Initialize TTS pipeline
try:
    tts_pipeline = pipeline(
        "text-to-speech",
        model="microsoft/speecht5_tts",
        device=device
    )
    print("TTS pipeline loaded successfully")
except Exception as e:
    print(f"Error loading TTS pipeline: {e}")
    tts_pipeline = None

# Initialize speaker embeddings for TTS
try:
    from datasets import load_dataset
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
    print("Speaker embeddings loaded successfully")
except Exception as e:
    print(f"Error loading speaker embeddings: {e}")
    speaker_embeddings = None

# Character image - create a simple colored background with basic shapes
def create_character_image():
    """Create a simple character image programmatically."""
    # Create a 400x400 image
    img = Image.new('RGB', (400, 400), color='white')
    draw = ImageDraw.Draw(img)

    # Friendly robot character
    draw.rectangle([0, 0, 400, 400], fill='#4a9eff')    # Blue background
    draw.ellipse([60, 60, 340, 340], fill='#ffffff')    # Face
    draw.ellipse([140, 140, 180, 180], fill='#333333')  # Left eye
    draw.ellipse([220, 140, 260, 180], fill='#333333')  # Right eye
    draw.ellipse([170, 220, 230, 250], fill='#333333')  # Mouth

    return img

class TalkingCharacterGenerator:
    def __init__(self):
        self.temp_dir = tempfile.mkdtemp()

    def generate_tts_audio(self, text):
        """Generate speech audio from text."""
        try:
            # Check if the TTS pipeline and speaker embeddings are available
            if tts_pipeline is None:
                print("TTS pipeline not available")
                return None, 0
            if speaker_embeddings is None:
                print("Speaker embeddings not available")
                return None, 0

            # Generate speech
            speech = tts_pipeline(text, forward_params={"speaker_embeddings": speaker_embeddings})

            # Save audio to a temporary file
            audio_path = os.path.join(self.temp_dir, "speech.wav")

            # Convert to a numpy array and save as WAV
            audio_data = speech["audio"]
            sample_rate = speech["sampling_rate"]

            # Normalize audio (guard against division by zero on an all-silent clip)
            peak = np.max(np.abs(audio_data))
            if peak > 0:
                audio_data = audio_data / peak

            # Save as a WAV file
            sf.write(audio_path, audio_data, sample_rate)

            return audio_path, len(audio_data) / sample_rate  # Return path and duration

        except Exception as e:
            print(f"TTS Error: {e}")
            return None, 0

    def create_mouth_animation(self, duration, text):
        """Create mouth movement animation based on text and duration."""
        try:
            # Create the character image programmatically
            image = create_character_image()
            image = image.resize((400, 400))  # Ensure correct size

            # Convert to a numpy array
            img_array = np.array(image)

            # Animation parameters
            fps = 24
            total_frames = int(duration * fps)
            if total_frames == 0:
                total_frames = 24  # Minimum 1 second

            frames = []

            # Simple mouth animation based on text analysis
            words = text.split()
            syllables_per_word = [max(1, len(word) // 2) for word in words]
            total_syllables = sum(syllables_per_word)
            if total_syllables == 0:
                total_syllables = 1

            for frame in range(total_frames):
                # Copy the original image
                frame_img = img_array.copy()

                # Calculate mouth opening based on syllables and time
                time_ratio = frame / total_frames
                syllable_position = time_ratio * total_syllables

                # Create mouth movement (simple animation)
                mouth_open = abs(np.sin(syllable_position * np.pi * 2)) * 0.5 + 0.2

                # Apply mouth animation (simple oval modification)
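                # NOTE: the coordinates below match the mouth ellipse drawn in
                # create_character_image(); darkening a rectangle around it is a
                # crude stand-in for lip sync, not a viseme-driven mouth model.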
                center_x, center_y = 200, 240  # Approximate mouth position
                mouth_width = int(30 * (1 + mouth_open))
                mouth_height = int(20 * mouth_open)

                # Draw mouth (simple approach)
                y_start = max(0, center_y - mouth_height // 2)
                y_end = min(400, center_y + mouth_height // 2)
                x_start = max(0, center_x - mouth_width // 2)
                x_end = min(400, center_x + mouth_width // 2)

                # Darken mouth area to simulate opening
                if y_end > y_start and x_end > x_start:
                    frame_img[y_start:y_end, x_start:x_end] = (
                        frame_img[y_start:y_end, x_start:x_end] * 0.7
                    ).astype(np.uint8)

                frames.append(frame_img)

            # Create video from frames
            video_path = os.path.join(self.temp_dir, "talking_character.mp4")

            # Use OpenCV to create the video
            fourcc = cv2.VideoWriter_fourcc(*'mp4v')
            out = cv2.VideoWriter(video_path, fourcc, fps, (400, 400))

            for frame in frames:
                # Convert RGB to BGR for OpenCV
                frame_bgr = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
                out.write(frame_bgr)

            out.release()
            return video_path

        except Exception as e:
            print(f"Animation Error: {e}")
            return None

    def mux_audio_video(self, video_path, audio_path):
        """Mux the TTS audio into the silent OpenCV video.

        Best-effort step: assumes an ffmpeg binary is on PATH and falls back
        to the silent video if the call fails. Re-encoding to H.264 also makes
        the clip playable inline in most browsers, unlike raw mp4v.
        """
        muxed_path = os.path.join(self.temp_dir, "talking_character_audio.mp4")
        try:
            subprocess.run(
                [
                    "ffmpeg", "-y",
                    "-i", video_path,
                    "-i", audio_path,
                    "-c:v", "libx264", "-pix_fmt", "yuv420p",  # browser-friendly video
                    "-c:a", "aac",
                    "-shortest",
                    muxed_path,
                ],
                check=True,
                capture_output=True,
            )
            return muxed_path
        except Exception as e:
            print(f"Mux Error (falling back to silent video): {e}")
            return video_path

    def generate_talking_character(self, text):
        """Main function to generate the talking character video."""
        if not text or not text.strip():
            return None, "Please provide text."

        # Generate TTS audio
        audio_path, duration = self.generate_tts_audio(text)
        if not audio_path:
            return None, "Failed to generate speech audio. Please check if TTS models are loaded properly."

        # Create mouth animation
        video_path = self.create_mouth_animation(duration, text)
        if not video_path:
            return None, "Failed to create character animation."

        # Combine the speech track with the silent animation (best effort)
        video_path = self.mux_audio_video(video_path, audio_path)

        return video_path, f"Successfully generated talking character video! Duration: {duration:.2f}s"

# Initialize the generator
generator = TalkingCharacterGenerator()

# Create Gradio interface
def create_talking_character(text):
    """Gradio interface function."""
    try:
        video_path, message = generator.generate_talking_character(text)
        if video_path and os.path.exists(video_path):
            return video_path, message
        else:
            return None, message
    except Exception as e:
        return None, f"Error: {str(e)}"

# Create the Gradio app
with gr.Blocks(title="Talking Character Generator", theme=gr.themes.Soft()) as app:
    gr.Markdown("# 🎭 Talking Character Generator")
    gr.Markdown("Generate videos of a friendly robot character speaking your text with mouth movements!")

    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Enter your text",
                placeholder="Type what you want the character to say...",
                lines=3,
                max_lines=10
            )
            generate_btn = gr.Button("Generate Talking Character", variant="primary")

        with gr.Column():
            video_output = gr.Video(label="Generated Talking Character")
            status_output = gr.Textbox(label="Status", interactive=False)

    # Event handlers
    generate_btn.click(
        fn=create_talking_character,
        inputs=[text_input],
        outputs=[video_output, status_output]
    )

    # Examples
    gr.Examples(
        examples=[
            ["Hello! Welcome to the talking character generator. I'm excited to speak your text!"],
            ["Hi there! I'm a friendly robot and I love to chat with you!"],
            ["Beep boop! I'm ready to speak your words with animated mouth movements!"]
        ],
        inputs=[text_input]
    )

if __name__ == "__main__":
    app.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True
    )
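# Notes:
# - mux_audio_video assumes an ffmpeg binary on PATH; without it the app falls
#   back to the silent mp4v video, which some browsers will not play inline.
# - Rough dependency sketch (usual PyPI names, unpinned):
#   pip install gradio torch transformers datasets soundfile opencv-python sentencepiece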