rane777 committed on
Commit
e400519
·
verified ·
1 Parent(s): 01c4e90

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +229 -0
app.py ADDED
@@ -0,0 +1,229 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ import numpy as np
4
+ from PIL import Image
5
+ import io
6
+ import base64
7
+ from transformers import pipeline
8
+ import cv2
9
+ import tempfile
10
+ import os
11
+ from pathlib import Path
12
+ import requests
13
+ import json
14
+ import time
15
+
16
# Model setup: run inference on the GPU when torch can see one, else on the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# SpeechT5 text-to-speech pipeline, placed on the selected device.
tts_pipeline = pipeline(
    task="text-to-speech",
    model="microsoft/speecht5_tts",
    device=device,
)

# Speaker x-vector for the TTS model: row 7306 of the validation split,
# with a leading axis of size 1 added in front.
from datasets import load_dataset

embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.unsqueeze(
    torch.tensor(embeddings_dataset[7306]["xvector"]), 0
)
30
+
31
+ # Character images: base64-encoded SVG data URIs used as placeholders (replace with real character art in production)
32
+ CHARACTERS = {
33
+ "Character 1 - Friendly Robot": "data:image/svg+xml;base64,PHN2ZyB3aWR0aD0iMjAwIiBoZWlnaHQ9IjIwMCIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj4KICA8cmVjdCB3aWR0aD0iMjAwIiBoZWlnaHQ9IjIwMCIgZmlsbD0iIzRhOWVmZiIvPgogIDxjaXJjbGUgY3g9IjEwMCIgY3k9IjEwMCIgcj0iODAiIGZpbGw9IiNmZmZmZmYiLz4KICA8Y2lyY2xlIGN4PSI4MCIgY3k9IjgwIiByPSIxMCIgZmlsbD0iIzMzMzMzMyIvPgogIDxjaXJjbGUgY3g9IjEyMCIgY3k9IjgwIiByPSIxMCIgZmlsbD0iIzMzMzMzMyIvPgogIDxlbGxpcHNlIGN4PSIxMDAiIGN5PSIxMjAiIHJ4PSIzMCIgcnk9IjEwIiBmaWxsPSIjMzMzMzMzIi8+CiAgPHRleHQgeD0iMTAwIiB5PSIxODAiIHRleHQtYW5jaG9yPSJtaWRkbGUiIGZpbGw9IiMzMzMzMzMiIGZvbnQtZmFtaWx5PSJBcmlhbCIgZm9udC1zaXplPSIxNCIgZm9udC13ZWlnaHQ9ImJvbGQiPkZyaWVuZGx5IFJvYm90PC90ZXh0Pgo8L3N2Zz4=",
34
+
35
+ "Character 2 - Cartoon Person": "data:image/svg+xml;base64,PHN2ZyB3aWR0aD0iMjAwIiBoZWlnaHQ9IjIwMCIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj4KICA8cmVjdCB3aWR0aD0iMjAwIiBoZWlnaHQ9IjIwMCIgZmlsbD0iI2ZmOTk5OSIvPgogIDxjaXJjbGUgY3g9IjEwMCIgY3k9IjEwMCIgcj0iODAiIGZpbGw9IiNmZmRiYjMiLz4KICA8Y2lyY2xlIGN4PSI4MCIgY3k9IjgwIiByPSIxNSIgZmlsbD0iIzMzMzMzMyIvPgogIDxjaXJjbGUgY3g9IjEyMCIgY3k9IjgwIiByPSIxNSIgZmlsbD0iIzMzMzMzMyIvPgogIDxlbGxpcHNlIGN4PSIxMDAiIGN5PSIxMjAiIHJ4PSIyNSIgcnk9IjE1IiBmaWxsPSIjZmY2NjY2Ii8+CiAgPHRleHQgeD0iMTAwIiB5PSIxODAiIHRleHQtYW5jaG9yPSJtaWRkbGUiIGZpbGw9IiMzMzMzMzMiIGZvbnQtZmFtaWx5PSJBcmlhbCIgZm9udC1zaXplPSIxNCIgZm9udC13ZWlnaHQ9ImJvbGQiPkNhcnRvb24gUGVyc29uPC90ZXh0Pgo8L3N2Zz4=",
36
+
37
+ "Character 3 - Cute Animal": "data:image/svg+xml;base64,PHN2ZyB3aWR0aD0iMjAwIiBoZWlnaHQ9IjIwMCIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj4KICA8cmVjdCB3aWR0aD0iMjAwIiBoZWlnaHQ9IjIwMCIgZmlsbD0iIzk5ZmY5OSIvPgogIDxjaXJjbGUgY3g9IjEwMCIgY3k9IjEwMCIgcj0iODAiIGZpbGw9IiNmZmYiLz4KICA8Y2lyY2xlIGN4PSI2MCIgY3k9IjYwIiByPSIyMCIgZmlsbD0iI2ZmZiIvPgogIDxjaXJjbGUgY3g9IjE0MCIgY3k9IjYwIiByPSIyMCIgZmlsbD0iI2ZmZiIvPgogIDxjaXJjbGUgY3g9Ijc1IiBjeT0iNzUiIHI9IjEwIiBmaWxsPSIjMzMzMzMzIi8+CiAgPGNpcmNsZSBjeD0iMTI1IiBjeT0iNzUiIHI9IjEwIiBmaWxsPSIjMzMzMzMzIi8+CiAgPGVsbGlwc2UgY3g9IjEwMCIgY3k9IjEyMCIgcng9IjIwIiByeT0iMTAiIGZpbGw9IiNmZjY2NjYiLz4KICA8dGV4dCB4PSIxMDAiIHk9IjE4MCIgdGV4dC1hbmNob3I9Im1pZGRsZSIgZmlsbD0iIzMzMzMzMyIgZm9udC1mYW1pbHk9IkFyaWFsIiBmb250LXNpemU9IjE0IiBmb250LXdlaWdodD0iYm9sZCI+Q3V0ZSBBbmltYWw8L3RleHQ+Cjwvc3ZnPgo="
38
+ }
39
+
40
class TalkingCharacterGenerator:
    """Build a short video of a character image "speaking" a piece of text.

    Steps: synthesise speech with the module-level ``tts_pipeline``, render a
    simple mouth animation over the chosen character image with OpenCV, then
    (best effort) attach the speech track with ``ffmpeg``.  Every generated
    file is written inside ``self.temp_dir``.
    """

    FRAME_SIZE = 400           # frames are FRAME_SIZE x FRAME_SIZE pixels
    FPS = 24                   # frames per second of the rendered clip
    MOUTH_CENTER = (200, 240)  # (x, y) of the approximate mouth position

    def __init__(self):
        # Scratch directory that receives the WAV and MP4 files.
        self.temp_dir = tempfile.mkdtemp()

    @staticmethod
    def _normalize_audio(audio_data):
        """Scale *audio_data* so that its loudest sample has magnitude 1.0.

        Empty input and input whose peak is zero (or not a number) are
        returned unscaled, which avoids a division by zero that would fill
        the signal with NaNs.
        """
        samples = np.asarray(audio_data)
        if samples.size == 0:
            return samples
        peak = np.max(np.abs(samples))
        if not peak > 0:
            return samples
        return samples / peak

    @staticmethod
    def _mouth_box(mouth_open, center, size):
        """Return ``(y_start, y_end, x_start, x_end)`` of the mouth rectangle.

        The rectangle grows with *mouth_open*, is centred on *center*
        (``(x, y)``) and is clamped to a ``size`` x ``size`` frame.
        """
        center_x, center_y = center
        half_width = int(30 * (1 + mouth_open)) // 2
        half_height = int(20 * mouth_open) // 2
        return (
            max(0, center_y - half_height),
            min(size, center_y + half_height),
            max(0, center_x - half_width),
            min(size, center_x + half_width),
        )

    @staticmethod
    def _decode_character_image(character_image_data, size):
        """Decode a base64 string or ``data:image`` URI into an RGB array.

        Returns a ``size`` x ``size`` x 3 numpy array.  Raises whatever
        ``base64`` or Pillow raise for malformed or unsupported input.
        """
        payload = character_image_data
        if payload.startswith('data:image'):
            # Drop the "data:image/...;base64," header and keep the payload.
            payload = payload.split(',', 1)[1]
        image_bytes = base64.b64decode(payload)

        # NOTE(review): Pillow does not appear to ship an SVG decoder, so the
        # ``data:image/svg+xml`` placeholders in CHARACTERS are expected to be
        # rejected here -- confirm, and rasterise them to PNG/JPEG if so.
        with Image.open(io.BytesIO(image_bytes)) as source:
            image = source.convert('RGB').resize((size, size))
        return np.array(image)

    def _mux_audio(self, video_path, audio_path):
        """Best effort: return a copy of *video_path* with *audio_path* attached.

        Uses the ``ffmpeg`` executable when one is on PATH.  If it is missing
        or fails, the silent *video_path* is returned unchanged so callers
        still get a playable clip.
        """
        import shutil
        import subprocess

        ffmpeg = shutil.which("ffmpeg")
        if ffmpeg is None:
            print("Mux Warning: ffmpeg not found; returning video without audio")
            return video_path

        muxed_path = os.path.join(self.temp_dir, "talking_character_with_audio.mp4")
        command = [
            ffmpeg, "-y", "-loglevel", "error",
            "-i", video_path,
            "-i", audio_path,
            "-map", "0:v:0",
            "-map", "1:a:0",
            "-c:v", "copy",   # keep the rendered frames as they are
            "-c:a", "aac",
            muxed_path,
        ]
        try:
            subprocess.run(command, check=True, capture_output=True, timeout=120)
        except (OSError, subprocess.SubprocessError) as e:
            print(f"Mux Warning: {e}")
            return video_path
        return muxed_path if os.path.exists(muxed_path) else video_path

    def generate_tts_audio(self, text):
        """Synthesise *text* to ``speech.wav`` inside ``self.temp_dir``.

        Returns:
            ``(audio_path, duration_in_seconds)`` on success, or ``(None, 0)``
            when anything fails (the error is printed).
        """
        try:
            speech = tts_pipeline(
                text, forward_params={"speaker_embeddings": speaker_embeddings}
            )
            sample_rate = speech["sampling_rate"]
            audio_data = self._normalize_audio(speech["audio"])
            if len(audio_data) == 0:
                raise ValueError("TTS produced no audio samples")

            # Imported lazily so the module can be loaded without soundfile.
            import soundfile as sf

            audio_path = os.path.join(self.temp_dir, "speech.wav")
            sf.write(audio_path, audio_data, sample_rate)
            return audio_path, len(audio_data) / sample_rate
        except Exception as e:
            print(f"TTS Error: {e}")
            return None, 0

    def create_mouth_animation(self, character_image_data, duration, text):
        """Render a silent MP4 in which a mouth rectangle pulses over the image.

        Args:
            character_image_data: base64 string or ``data:image`` URI.
            duration: clip length in seconds.
            text: spoken text; its rough syllable count sets the pulse rate.

        Returns:
            Path of the written video, or ``None`` when anything fails (the
            error is printed).
        """
        try:
            size = self.FRAME_SIZE
            fps = self.FPS

            base_rgb = self._decode_character_image(character_image_data, size)
            # OpenCV writes BGR.  Only the mouth rectangle differs between
            # frames, so the colour conversion is done once up front.
            base_bgr = cv2.cvtColor(base_rgb, cv2.COLOR_RGB2BGR)

            # Always emit at least one frame so very short clips are valid.
            total_frames = max(1, int(duration * fps))

            # Rough syllable estimate: one per two letters, minimum one a word.
            total_syllables = sum(max(1, len(word) // 2) for word in text.split())

            video_path = os.path.join(self.temp_dir, "talking_character.mp4")
            fourcc = cv2.VideoWriter_fourcc(*'mp4v')
            writer = cv2.VideoWriter(video_path, fourcc, fps, (size, size))
            try:
                if not writer.isOpened():
                    raise RuntimeError(f"could not open video writer for {video_path}")

                # Frames are streamed to the writer instead of being buffered.
                for frame_index in range(total_frames):
                    syllable_position = frame_index / total_frames * total_syllables
                    mouth_open = abs(np.sin(syllable_position * np.pi * 2)) * 0.5 + 0.2
                    y_start, y_end, x_start, x_end = self._mouth_box(
                        mouth_open, self.MOUTH_CENTER, size
                    )

                    frame = base_bgr.copy()
                    # Darken the mouth area to simulate it opening.
                    mouth = frame[y_start:y_end, x_start:x_end]
                    frame[y_start:y_end, x_start:x_end] = (mouth * 0.7).astype(np.uint8)
                    writer.write(frame)
            finally:
                writer.release()

            return video_path
        except Exception as e:
            print(f"Animation Error: {e}")
            return None

    def generate_talking_character(self, text, character_choice):
        """Generate the talking-character video for *text*.

        Returns:
            ``(video_path, status_message)``; ``video_path`` is ``None`` when
            the inputs are invalid or a step fails.
        """
        if not text or not str(text).strip() or not character_choice:
            return None, "Please provide text and select a character."

        # Validate the cheap lookup before running the expensive TTS model.
        character_image = CHARACTERS.get(character_choice)
        if not character_image:
            return None, "Invalid character selection."

        audio_path, duration = self.generate_tts_audio(text)
        if not audio_path:
            return None, "Failed to generate speech audio."

        video_path = self.create_mouth_animation(character_image, duration, text)
        if not video_path:
            return None, "Failed to create character animation."

        # Attach the speech track; falls back to the silent clip on failure.
        video_path = self._mux_audio(video_path, audio_path)

        return video_path, f"Successfully generated talking character video! Duration: {duration:.2f}s"
165
+
166
+ # Initialize the generator
167
+ generator = TalkingCharacterGenerator()
168
+
169
+ # Create Gradio interface
170
def create_talking_character(text, character):
    """Gradio callback: generate the clip and report the outcome.

    Args:
        text: text the character should speak.
        character: key of the selected entry in ``CHARACTERS``.

    Returns:
        ``(video_path, status_message)``.  ``video_path`` is ``None`` whenever
        no playable file is available, and the message then explains why.
    """
    try:
        video_path, message = generator.generate_talking_character(text, character)
    except Exception as e:
        # UI boundary: surface the failure in the status box instead of crashing.
        return None, f"Error: {str(e)}"

    if not video_path:
        return None, message
    if not os.path.exists(video_path):
        # A path came back but nothing is on disk (e.g. the video writer failed
        # silently); do not pair a missing file with a success message.
        return None, "Error: generated video file was not found."
    return video_path, message
180
+
181
+ # Create the Gradio app
182
with gr.Blocks(title="Talking Character Generator", theme=gr.themes.Soft()) as app:
    # Page heading and one-line description.
    gr.Markdown("# 🎭 Talking Character Generator")
    gr.Markdown("Generate videos of characters speaking your text with mouth movements!")

    with gr.Row():
        # Left column: what to say, who says it, and the trigger button.
        with gr.Column():
            text_input = gr.Textbox(
                label="Enter your text",
                placeholder="Type what you want the character to say...",
                lines=3,
                max_lines=10,
            )
            character_dropdown = gr.Dropdown(
                choices=list(CHARACTERS),
                label="Select Character",
                value=next(iter(CHARACTERS)),
            )
            generate_btn = gr.Button("Generate Talking Character", variant="primary")

        # Right column: the rendered clip and a read-only status line.
        with gr.Column():
            video_output = gr.Video(label="Generated Talking Character")
            status_output = gr.Textbox(label="Status", interactive=False)

    # Clicking the button runs the generator and fills both outputs.
    generate_btn.click(
        fn=create_talking_character,
        inputs=[text_input, character_dropdown],
        outputs=[video_output, status_output],
    )

    # Ready-made (text, character) pairs that pre-fill the two inputs.
    gr.Examples(
        examples=[
            [
                "Hello! Welcome to the talking character generator. I'm excited to speak your text!",
                "Character 1 - Friendly Robot",
            ],
            [
                "Hi there! I'm a cartoon character and I love to chat with you!",
                "Character 2 - Cartoon Person",
            ],
            [
                "Woof! I'm a cute animal character ready to speak your words!",
                "Character 3 - Cute Animal",
            ],
        ],
        inputs=[text_input, character_dropdown],
    )
223
+
224
if __name__ == "__main__":
    # Serve on every network interface, port 7860, and request a share link.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": True,
    }
    app.launch(**launch_options)