Spaces:

rane777
/

image_to_image

Running

App Files Community

rane777 committed on Jul 6

Commit

e400519

verified ·

1 Parent(s): 01c4e90

Create app.py

Browse files

Files changed (1) hide show

app.py +229 -0

app.py ADDED Viewed

	@@ -0,0 +1,229 @@

+import gradio as gr
+import torch
+import numpy as np
+from PIL import Image
+import io
+import base64
+from transformers import pipeline
+import cv2
+import tempfile
+import os
+from pathlib import Path
+import requests
+import json
+import time
+# Pick the inference device: the GPU when torch reports one, otherwise the CPU.
+if torch.cuda.is_available():
+    device = "cuda"
+else:
+    device = "cpu"
+# Text-to-speech pipeline backed by the SpeechT5 checkpoint, placed on `device`.
+tts_pipeline = pipeline(task="text-to-speech", model="microsoft/speecht5_tts", device=device)
+# Speaker embedding handed to the TTS pipeline on every call: the "xvector"
+# field of entry 7306 in the validation split, with a leading axis of size 1.
+from datasets import load_dataset
+embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+speaker_embeddings = torch.unsqueeze(torch.tensor(embeddings_dataset[7306]["xvector"]), dim=0)
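+# Character images: base64-encoded SVG data URIs used as placeholders (in production, use actual character images).
+# NOTE: these are SVG documents, which Pillow cannot decode with Image.open; they must be rasterized before use.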
+CHARACTERS = {
+    "Character 1 - Friendly Robot": "data:image/svg+xml;base64,PHN2ZyB3aWR0aD0iMjAwIiBoZWlnaHQ9IjIwMCIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj4KICA8cmVjdCB3aWR0aD0iMjAwIiBoZWlnaHQ9IjIwMCIgZmlsbD0iIzRhOWVmZiIvPgogIDxjaXJjbGUgY3g9IjEwMCIgY3k9IjEwMCIgcj0iODAiIGZpbGw9IiNmZmZmZmYiLz4KICA8Y2lyY2xlIGN4PSI4MCIgY3k9IjgwIiByPSIxMCIgZmlsbD0iIzMzMzMzMyIvPgogIDxjaXJjbGUgY3g9IjEyMCIgY3k9IjgwIiByPSIxMCIgZmlsbD0iIzMzMzMzMyIvPgogIDxlbGxpcHNlIGN4PSIxMDAiIGN5PSIxMjAiIHJ4PSIzMCIgcnk9IjEwIiBmaWxsPSIjMzMzMzMzIi8+CiAgPHRleHQgeD0iMTAwIiB5PSIxODAiIHRleHQtYW5jaG9yPSJtaWRkbGUiIGZpbGw9IiMzMzMzMzMiIGZvbnQtZmFtaWx5PSJBcmlhbCIgZm9udC1zaXplPSIxNCIgZm9udC13ZWlnaHQ9ImJvbGQiPkZyaWVuZGx5IFJvYm90PC90ZXh0Pgo8L3N2Zz4=",
+    "Character 2 - Cartoon Person": "data:image/svg+xml;base64,PHN2ZyB3aWR0aD0iMjAwIiBoZWlnaHQ9IjIwMCIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj4KICA8cmVjdCB3aWR0aD0iMjAwIiBoZWlnaHQ9IjIwMCIgZmlsbD0iI2ZmOTk5OSIvPgogIDxjaXJjbGUgY3g9IjEwMCIgY3k9IjEwMCIgcj0iODAiIGZpbGw9IiNmZmRiYjMiLz4KICA8Y2lyY2xlIGN4PSI4MCIgY3k9IjgwIiByPSIxNSIgZmlsbD0iIzMzMzMzMyIvPgogIDxjaXJjbGUgY3g9IjEyMCIgY3k9IjgwIiByPSIxNSIgZmlsbD0iIzMzMzMzMyIvPgogIDxlbGxpcHNlIGN4PSIxMDAiIGN5PSIxMjAiIHJ4PSIyNSIgcnk9IjE1IiBmaWxsPSIjZmY2NjY2Ii8+CiAgPHRleHQgeD0iMTAwIiB5PSIxODAiIHRleHQtYW5jaG9yPSJtaWRkbGUiIGZpbGw9IiMzMzMzMzMiIGZvbnQtZmFtaWx5PSJBcmlhbCIgZm9udC1zaXplPSIxNCIgZm9udC13ZWlnaHQ9ImJvbGQiPkNhcnRvb24gUGVyc29uPC90ZXh0Pgo8L3N2Zz4=",
+    "Character 3 - Cute Animal": "data:image/svg+xml;base64,PHN2ZyB3aWR0aD0iMjAwIiBoZWlnaHQ9IjIwMCIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj4KICA8cmVjdCB3aWR0aD0iMjAwIiBoZWlnaHQ9IjIwMCIgZmlsbD0iIzk5ZmY5OSIvPgogIDxjaXJjbGUgY3g9IjEwMCIgY3k9IjEwMCIgcj0iODAiIGZpbGw9IiNmZmYiLz4KICA8Y2lyY2xlIGN4PSI2MCIgY3k9IjYwIiByPSIyMCIgZmlsbD0iI2ZmZiIvPgogIDxjaXJjbGUgY3g9IjE0MCIgY3k9IjYwIiByPSIyMCIgZmlsbD0iI2ZmZiIvPgogIDxjaXJjbGUgY3g9Ijc1IiBjeT0iNzUiIHI9IjEwIiBmaWxsPSIjMzMzMzMzIi8+CiAgPGNpcmNsZSBjeD0iMTI1IiBjeT0iNzUiIHI9IjEwIiBmaWxsPSIjMzMzMzMzIi8+CiAgPGVsbGlwc2UgY3g9IjEwMCIgY3k9IjEyMCIgcng9IjIwIiByeT0iMTAiIGZpbGw9IiNmZjY2NjYiLz4KICA8dGV4dCB4PSIxMDAiIHk9IjE4MCIgdGV4dC1hbmNob3I9Im1pZGRsZSIgZmlsbD0iIzMzMzMzMyIgZm9udC1mYW1pbHk9IkFyaWFsIiBmb250LXNpemU9IjE0IiBmb250LXdlaWdodD0iYm9sZCI+Q3V0ZSBBbmltYWw8L3RleHQ+Cjwvc3ZnPgo="
+}
+class TalkingCharacterGenerator:
+    """Generate a short video of a character "speaking" a piece of text.
+
+    The text is synthesized to speech with the module-level ``tts_pipeline``.
+    The character image is then animated by darkening a rectangle around the
+    mouth once per frame, and the speech is muxed into the clip when an
+    ``ffmpeg`` executable can be found.
+
+    Every request writes to uniquely named files inside ``self.temp_dir`` so
+    overlapping requests cannot overwrite each other's output. Finished
+    videos are left in that directory; nothing in this class removes them.
+    """
+
+    FRAME_SIZE = (400, 400)    # (width, height) of every video frame, in pixels
+    FPS = 24                   # frames per second of the generated clip
+    MOUTH_CENTER = (200, 240)  # approximate mouth position inside FRAME_SIZE
+
+    def __init__(self):
+        self.temp_dir = tempfile.mkdtemp()
+
+    def _unique_path(self, prefix, suffix):
+        """Reserve and return a fresh file path inside ``self.temp_dir``."""
+        handle, path = tempfile.mkstemp(prefix=prefix, suffix=suffix, dir=self.temp_dir)
+        os.close(handle)
+        return path
+
+    @staticmethod
+    def _discard(path):
+        """Delete ``path`` if possible; a failure to clean up is not an error."""
+        try:
+            os.remove(path)
+        except OSError:
+            pass
+
+    def generate_tts_audio(self, text):
+        """Synthesize ``text`` to a WAV file.
+
+        Returns:
+            ``(audio_path, duration_seconds)`` on success, or ``(None, 0)``
+            when synthesis or writing fails (the error is printed).
+        """
+        try:
+            speech = tts_pipeline(text, forward_params={"speaker_embeddings": speaker_embeddings})
+            sample_rate = speech["sampling_rate"]
+            # Drop singleton axes so len() counts samples even if the pipeline
+            # hands back a (1, n) array -- assumes mono output; TODO confirm.
+            audio_data = np.atleast_1d(np.squeeze(np.asarray(speech["audio"])))
+            if audio_data.size == 0:
+                raise ValueError("TTS pipeline returned no audio samples")
+            # Peak-normalize, but leave pure silence alone: dividing by a zero
+            # peak would fill the signal with NaNs.
+            peak = np.max(np.abs(audio_data))
+            if peak > 0:
+                audio_data = audio_data / peak
+            import soundfile as sf
+            audio_path = self._unique_path("speech_", ".wav")
+            sf.write(audio_path, audio_data, sample_rate)
+            return audio_path, len(audio_data) / sample_rate  # path and duration
+        except Exception as e:
+            print(f"TTS Error: {e}")
+            return None, 0
+
+    @staticmethod
+    def _rasterize_simple_svg(svg_bytes, size):
+        """Draw a basic SVG document onto a new RGB image of ``size``.
+
+        Pillow has no SVG decoder, so the placeholder characters are drawn by
+        hand. Only ``rect``, ``circle``, ``ellipse`` and ``text`` elements with
+        plain numeric attributes and a solid ``fill`` are handled; anything
+        else is ignored. Intended for trusted, built-in SVG only.
+        """
+        import xml.etree.ElementTree as ET
+        from PIL import ImageDraw
+
+        def number(node, name, default=0.0):
+            return float(node.get(name, default))
+
+        root = ET.fromstring(svg_bytes)
+        scale_x = size[0] / number(root, "width", size[0])
+        scale_y = size[1] / number(root, "height", size[1])
+        canvas = Image.new("RGB", size, "white")
+        draw = ImageDraw.Draw(canvas)
+        for node in root.iter():
+            tag = node.tag.rsplit("}", 1)[-1]  # strip the XML namespace
+            fill = node.get("fill", "black")
+            if fill == "none":
+                continue
+            if tag == "rect":
+                x, y = number(node, "x"), number(node, "y")
+                box = [x, y, x + number(node, "width"), y + number(node, "height")]
+            elif tag == "circle":
+                cx, cy, r = number(node, "cx"), number(node, "cy"), number(node, "r")
+                box = [cx - r, cy - r, cx + r, cy + r]
+            elif tag == "ellipse":
+                cx, cy = number(node, "cx"), number(node, "cy")
+                rx, ry = number(node, "rx"), number(node, "ry")
+                box = [cx - rx, cy - ry, cx + rx, cy + ry]
+            elif tag == "text" and node.text and node.text.strip():
+                label = node.text.strip()
+                x, y = number(node, "x") * scale_x, number(node, "y") * scale_y
+                if node.get("text-anchor") == "middle":
+                    x -= draw.textlength(label) / 2
+                # SVG positions text by its baseline while PIL positions it by
+                # its top edge; 10 px roughly matches PIL's built-in font.
+                draw.text((x, y - 10), label, fill=fill)
+                continue
+            else:
+                continue
+            scaled = [box[0] * scale_x, box[1] * scale_y, box[2] * scale_x, box[3] * scale_y]
+            if tag == "rect":
+                draw.rectangle(scaled, fill=fill)
+            else:
+                draw.ellipse(scaled, fill=fill)
+        return canvas
+
+    def _load_character_frame(self, character_image_data):
+        """Decode the character image into a BGR uint8 frame of ``FRAME_SIZE``."""
+        if character_image_data.startswith('data:image'):
+            header, _, payload = character_image_data.partition(',')
+        else:
+            header, payload = '', character_image_data
+        image_bytes = base64.b64decode(payload)
+        is_svg = 'svg' in header or image_bytes.lstrip().startswith((b'<svg', b'<?xml'))
+        if is_svg:
+            image = self._rasterize_simple_svg(image_bytes, self.FRAME_SIZE)
+        else:
+            image = Image.open(io.BytesIO(image_bytes)).convert('RGB').resize(self.FRAME_SIZE)
+        # OpenCV expects BGR; converting once here avoids a per-frame conversion.
+        return cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
+
+    def _draw_mouth(self, base_frame, syllable_position):
+        """Return a copy of ``base_frame`` with the mouth area darkened."""
+        frame = base_frame.copy()
+        # One open/close cycle per half syllable unit, never fully closed.
+        mouth_open = abs(np.sin(syllable_position * np.pi * 2)) * 0.5 + 0.2
+        frame_width, frame_height = self.FRAME_SIZE
+        center_x, center_y = self.MOUTH_CENTER
+        mouth_width = int(30 * (1 + mouth_open))
+        mouth_height = int(20 * mouth_open)
+        y_start = max(0, center_y - mouth_height // 2)
+        y_end = min(frame_height, center_y + mouth_height // 2)
+        x_start = max(0, center_x - mouth_width // 2)
+        x_end = min(frame_width, center_x + mouth_width // 2)
+        # Darken the mouth area to simulate opening.
+        region = frame[y_start:y_end, x_start:x_end]
+        frame[y_start:y_end, x_start:x_end] = (region * 0.7).astype(np.uint8)
+        return frame
+
+    def create_mouth_animation(self, character_image_data, duration, text):
+        """Render the mouth-flap animation as a silent MP4.
+
+        Args:
+            character_image_data: Base64 image data, optionally wrapped in a
+                ``data:image/...;base64,`` URI. SVG input is drawn by
+                ``_rasterize_simple_svg``; anything else is decoded by PIL.
+            duration: Clip length in seconds.
+            text: Spoken text; a rough syllable estimate (half the letters of
+                each word) sets how often the mouth opens over the clip.
+
+        Returns:
+            Path of the written video, or ``None`` on failure (the error is
+            printed).
+        """
+        video_path = None
+        try:
+            base_frame = self._load_character_frame(character_image_data)
+            fps = self.FPS
+            # Always emit at least one frame so very short clips are not empty.
+            total_frames = max(1, int(duration * fps))
+            total_syllables = sum(max(1, len(word) // 2) for word in text.split())
+            video_path = self._unique_path("talking_character_", ".mp4")
+            fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+            writer = cv2.VideoWriter(video_path, fourcc, fps, self.FRAME_SIZE)
+            try:
+                if not writer.isOpened():
+                    raise RuntimeError("OpenCV could not open the video writer")
+                # Frames are written as they are produced instead of being
+                # buffered, so memory use does not grow with clip length.
+                for frame_index in range(total_frames):
+                    syllable_position = frame_index / total_frames * total_syllables
+                    writer.write(self._draw_mouth(base_frame, syllable_position))
+            finally:
+                writer.release()
+            return video_path
+        except Exception as e:
+            print(f"Animation Error: {e}")
+            if video_path:
+                self._discard(video_path)
+            return None
+
+    def _mux_audio(self, video_path, audio_path):
+        """Combine the silent video with the speech track using ffmpeg.
+
+        Best effort: when ffmpeg is missing or fails, the silent
+        ``video_path`` is returned unchanged.
+        """
+        import shutil
+        import subprocess
+
+        ffmpeg = shutil.which("ffmpeg")
+        if ffmpeg is None:
+            return video_path
+        muxed_path = self._unique_path("talking_character_audio_", ".mp4")
+        command = [
+            ffmpeg, "-y", "-loglevel", "error",
+            "-i", video_path, "-i", audio_path,
+            "-c:v", "libx264", "-pix_fmt", "yuv420p", "-c:a", "aac",
+            muxed_path,
+        ]
+        try:
+            subprocess.run(command, check=True, capture_output=True, timeout=120)
+        except (OSError, subprocess.SubprocessError) as e:
+            print(f"Audio mux skipped: {e}")
+            self._discard(muxed_path)
+            return video_path
+        self._discard(video_path)
+        return muxed_path
+
+    def generate_talking_character(self, text, character_choice):
+        """Generate the talking-character video for ``text``.
+
+        Returns:
+            ``(video_path, status_message)``; ``video_path`` is ``None`` when
+            the input is invalid or any stage fails.
+        """
+        if not text or not text.strip() or not character_choice:
+            return None, "Please provide text and select a character."
+        # Validate the character before the expensive TTS call.
+        character_image = CHARACTERS.get(character_choice)
+        if not character_image:
+            return None, "Invalid character selection."
+        audio_path, duration = self.generate_tts_audio(text)
+        if not audio_path:
+            return None, "Failed to generate speech audio."
+        video_path = self.create_mouth_animation(character_image, duration, text)
+        if video_path:
+            video_path = self._mux_audio(video_path, audio_path)
+        # The WAV is only an intermediate artifact once muxing was attempted.
+        self._discard(audio_path)
+        if not video_path:
+            return None, "Failed to create character animation."
+        return video_path, f"Successfully generated talking character video! Duration: {duration:.2f}s"
+# Initialize the generator: one module-level instance used by the Gradio
+# callback below. Constructing it creates a temporary directory.
+generator = TalkingCharacterGenerator()
+# Gradio callback wired to the "Generate" button
+def create_talking_character(text, character):
+    """Run the generator and return ``(video_path_or_None, status_message)``.
+
+    The video path is passed through only when it is set and the file exists
+    on disk. Any exception is turned into an ``"Error: ..."`` status message.
+    """
+    try:
+        video_path, message = generator.generate_talking_character(text, character)
+        video_ready = bool(video_path) and os.path.exists(video_path)
+        return (video_path if video_ready else None), message
+    except Exception as exc:
+        return None, f"Error: {str(exc)}"
+# Build the Gradio UI: inputs on the left, generated video and status on the right
+with gr.Blocks(theme=gr.themes.Soft(), title="Talking Character Generator") as app:
+    gr.Markdown("# 🎭 Talking Character Generator")
+    gr.Markdown("Generate videos of characters speaking your text with mouth movements!")
+    with gr.Row():
+        # Input column: text, character choice and the trigger button
+        with gr.Column():
+            text_input = gr.Textbox(
+                lines=3,
+                max_lines=10,
+                label="Enter your text",
+                placeholder="Type what you want the character to say...",
+            )
+            character_dropdown = gr.Dropdown(
+                label="Select Character",
+                choices=list(CHARACTERS),
+                value=next(iter(CHARACTERS)),  # first character is the default
+            )
+            generate_btn = gr.Button("Generate Talking Character", variant="primary")
+        # Output column: resulting video and a read-only status line
+        with gr.Column():
+            video_output = gr.Video(label="Generated Talking Character")
+            status_output = gr.Textbox(label="Status", interactive=False)
+    # Clicking the button runs the callback on both inputs and fills both outputs
+    generate_btn.click(
+        inputs=[text_input, character_dropdown],
+        outputs=[video_output, status_output],
+        fn=create_talking_character,
+    )
+    # One ready-made example per character
+    gr.Examples(
+        inputs=[text_input, character_dropdown],
+        examples=[
+            ["Hello! Welcome to the talking character generator. I'm excited to speak your text!", "Character 1 - Friendly Robot"],
+            ["Hi there! I'm a cartoon character and I love to chat with you!", "Character 2 - Cartoon Person"],
+            ["Woof! I'm a cute animal character ready to speak your words!", "Character 3 - Cute Animal"],
+        ],
+    )
+if __name__ == "__main__":
+    # Hugging Face Spaces already serves the app publicly, and Gradio rejects
+    # share=True there with a warning. Only request a share link elsewhere.
+    running_on_spaces = bool(os.environ.get("SPACE_ID"))
+    app.launch(
+        server_name="0.0.0.0",  # listen on all interfaces
+        server_port=7860,
+        share=not running_on_spaces,
+    )