Switch to Modal backend with security
app.py
CHANGED

@@ -1,7 +1,6 @@
  """
- MCP Video Agent -
- Optimized for HF Space deployment with implicit caching
  """

  import os

@@ -9,172 +8,62 @@ import gradio as gr
  import time
  import hashlib
  import base64

  # ==========================================
- #
  # ==========================================
- """
-     return key
-     print(f"⚠️ {key_name} not found")
-     return None
-
- # ==========================================
- # Video Analysis with Implicit Caching
- # ==========================================
-
- # Cache for uploaded Gemini files
- gemini_files_cache = {}
-
- def analyze_video_with_gemini(query: str, video_path: str):
-     """
-     Analyze video using Gemini 2.5 Flash with implicit caching.
-
-     Args:
-         query: User's question
-         video_path: Local path to video file
-
-     Returns:
-         str: Analysis result
-     """
-     from google import genai
-     import hashlib
-
-     # Get API key
-     api_key = get_api_key("GOOGLE_API_KEY")
-     if not api_key:
-         return "❌ Error: GOOGLE_API_KEY not set. Please configure it in Space Settings → Secrets."
-
-     client = genai.Client(api_key=api_key)

-     cache_key = f"{video_path}_{video_hash}"
-
-     try:
-         # Check if we already uploaded this file
-         if cache_key in gemini_files_cache:
-             file_name = gemini_files_cache[cache_key]
-             print(f"♻️ Using cached file: {file_name}")
-
-             try:
-                 video_file = client.files.get(name=file_name)
-                 if video_file.state.name == 'ACTIVE':
-                     print(f"✅ Cached file is active")
-                 else:
-                     print(f"⚠️ Cached file state: {video_file.state.name}, re-uploading...")
-                     video_file = None
-             except Exception as e:
-                 print(f"⚠️ Cached file retrieval failed: {e}")
-                 video_file = None
-         else:
-             video_file = None
-
-         # Upload if needed
-         if video_file is None:
-             print(f"📤 Uploading video to Gemini...")
-             video_file = client.files.upload(file=video_path)
-
-             # Wait for processing
-             while video_file.state.name == 'PROCESSING':
-                 print('.', end='', flush=True)
-                 time.sleep(2)
-                 video_file = client.files.get(name=video_file.name)
-
-             if video_file.state.name == 'FAILED':
-                 return "❌ Video processing failed"
-
-             print(f"\n✅ Video uploaded: {video_file.uri}")
-
-             # Cache the file reference
-             gemini_files_cache[cache_key] = video_file.name
-
-         # Generate content (implicit caching happens automatically)
-         print(f"🧠 Analyzing with Gemini 2.5 Flash...")
-             ]
-         )
-         #
-         if

-     Returns:
-         str: Path to generated audio file or None
-     """
-     from elevenlabs.client import ElevenLabs
-
-     # Get API key
-     api_key = get_api_key("ELEVENLABS_API_KEY")
-     if not api_key:
-         print("⚠️ ELEVENLABS_API_KEY not set, skipping TTS")
-         return None
-
      try:
-         safe_text = text[:max_chars] if len(text) > max_chars else text
-
-         if len(text) > max_chars:
-             safe_text = safe_text.rstrip() + "..."
-             print(f"⚠️ Text truncated from {len(text)} to {max_chars} chars")
-
-         print(f"🗣️ Generating speech ({len(safe_text)} chars)...")
-         start_time = time.time()
-
-         client = ElevenLabs(api_key=api_key)
-
-         audio_generator = client.text_to_speech.convert(
-             voice_id="21m00Tcm4TlvDq8ikWAM",
-             output_format="mp3_44100_128",
-             text=safe_text,
-             model_id="eleven_multilingual_v2"
-         )
-
-         # Generate unique filename
-         timestamp = int(time.time())
-         output_path = f"response_{timestamp}.mp3"
-
-         with open(output_path, "wb") as f:
-             for chunk in audio_generator:
-                 f.write(chunk)
-
-         elapsed = time.time() - start_time
-         print(f"✅ Speech generated in {elapsed:.2f}s")
-         return output_path
-
      except Exception as e:
-         print(f"❌
          return None

  # ==========================================
  # Gradio Interface Logic
  # ==========================================
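
The helper these removed lines belonged to (`get_api_key`, called with "GOOGLE_API_KEY" and "ELEVENLABS_API_KEY" above) is mostly truncated in this view. Judging only from the surviving fragments (`return key`, the warning print, `return None`) and its call sites, a minimal version presumably looked roughly like the sketch below; treat it as a reconstruction, not the original code.

# Hypothetical reconstruction of the truncated helper, not copied from the repo.
import os

def get_api_key(key_name: str):
    """Read an API key from the environment (e.g. HF Space secrets); return None if missing."""
    key = os.environ.get(key_name)
    if key:
        return key
    print(f"⚠️ {key_name} not found")
    return None
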

@@ -182,15 +71,25 @@ def generate_speech(text: str):
  # Cache for uploaded videos
  uploaded_videos_cache = {}

- def process_interaction(user_message, history, video_file):
      """
-     Core chatbot logic
      """
      if history is None:
          history = []

-     #

      # 1. Check video upload
      if video_file is None:

@@ -205,30 +104,61 @@ def process_interaction(user_message, history, video_file):
          yield history + [{"role": "assistant", "content": f"❌ Video too large! Size: {file_size_mb:.1f}MB. Please upload a video smaller than 100MB."}]
          return

-     #
      with open(local_path, 'rb') as f:
          file_hash = hashlib.md5(f.read()).hexdigest()[:8]

      cache_key = f"{local_path}_{file_hash}"

      else:

-     #
-     history
-     history.append({"role": "assistant", "content": "🤖 Gemini is analyzing the video..."})
      yield history

-     # 3. Analyze video
      try:
      except Exception as e:
          text_response = f"❌ Analysis error: {str(e)}"

-     # Store full text
      full_text_response = text_response

      # 4. Generate audio if successful

@@ -237,30 +167,38 @@ def process_interaction(user_message, history, video_file):
          yield history

          try:
-             # Read audio and create response
-             with open(audio_path, 'rb') as f:
-                 audio_bytes = f.read()
-             audio_base64 = base64.b64encode(audio_bytes).decode()
-
-             # Create response with embedded audio
-             response_content = f"""🎙️ **Audio Response**

  <audio controls autoplay style="width: 100%; margin: 10px 0; background: #f0f0f0; border-radius: 5px;">
  <source src="data:audio/mpeg;base64,{audio_base64}" type="audio/mpeg">

@@ -271,57 +209,81 @@ def process_interaction(user_message, history, video_file):
  <div style="background-color: #000000; color: #00ff00; padding: 25px; border-radius: 10px; font-family: 'Courier New', monospace; line-height: 1.8; font-size: 14px; white-space: normal; word-wrap: break-word; overflow-wrap: break-word; max-width: 100%;">
  {full_text_response}
  </div>"""

-             else:
-                 # Audio file is empty
-                 history[-1] = {"role": "assistant", "content": f"⚠️ Audio generation produced empty file.\n\n<div style='background: black; color: lime; padding: 20px; border-radius: 10px; white-space: normal; word-wrap: break-word;'>{full_text_response}</div>"}
-                 yield history
          else:
-             history[-1] = {"role": "assistant", "content": f"⚠️ Audio generation skipped (API key not set).\n\n<div style='background: black; color: lime; padding: 20px; border-radius: 10px; white-space: normal; word-wrap: break-word;'>{full_text_response}</div>"}
              yield history

          except Exception as e:
-             # Audio error
              history[-1] = {"role": "assistant", "content": f"❌ Audio error: {str(e)}\n\n<div style='background: black; color: lime; padding: 20px; border-radius: 10px; white-space: normal; word-wrap: break-word;'>{full_text_response}</div>"}
              yield history
      else:
-         # Error in analysis
          history[-1] = {"role": "assistant", "content": text_response}
          yield history


  # ==========================================
- # Gradio Interface
  # ==========================================

      gr.Markdown("# 🎥 MCP Video Agent")
-     gr.Markdown("**

-     gr.Markdown("""
      ### 📋 How to Use
      """)

      with gr.Row():
          with gr.Column(scale=1):
              video_input = gr.Video(label="📹 Upload Video (MP4)", sources=["upload"])

@@ -346,25 +308,43 @@ with gr.Blocks(title="MCP Video Agent") as demo:
          inputs=msg
      )

      # Event handlers
      submit_btn.click(
          process_interaction,
-         inputs=[msg, chatbot, video_input],
          outputs=[chatbot]
      )

      msg.submit(
          process_interaction,
-         inputs=[msg, chatbot, video_input],
          outputs=[chatbot]
      )

  # ==========================================
- # Launch
  # ==========================================

  if __name__ == "__main__":
      demo.launch(
          show_error=True,
          share=False
      )

  """
+ MCP Video Agent - HF Space with Modal Backend + Security
+ Connects to Modal backend with authentication and rate limiting
  """

  import os
  import gradio as gr
  import time
  import hashlib
  import base64
+ from datetime import datetime, timedelta
+ from collections import defaultdict

  # ==========================================
+ # Security: Rate Limiting
  # ==========================================
+ class RateLimiter:
+     """Simple in-memory rate limiter"""
+     def __init__(self, max_requests_per_hour=10):
+         self.max_requests = max_requests_per_hour
+         self.requests = defaultdict(list)

+     def is_allowed(self, user_id):
+         """Check if user is within rate limit"""
+         now = datetime.now()
+         cutoff = now - timedelta(hours=1)

+         # Clean old requests
+         self.requests[user_id] = [
+             req_time for req_time in self.requests[user_id]
+             if req_time > cutoff
+         ]

+         # Check limit
+         if len(self.requests[user_id]) >= self.max_requests:
+             return False

+         # Record new request
+         self.requests[user_id].append(now)
+         return True
+
+     def get_remaining(self, user_id):
+         """Get remaining requests for user"""
+         now = datetime.now()
+         cutoff = now - timedelta(hours=1)
+         recent = [t for t in self.requests[user_id] if t > cutoff]
+         return max(0, self.max_requests - len(recent))

+ # Initialize rate limiter (configurable via environment)
+ MAX_REQUESTS_PER_HOUR = int(os.environ.get("MAX_REQUESTS_PER_HOUR", "10"))
+ rate_limiter = RateLimiter(max_requests_per_hour=MAX_REQUESTS_PER_HOUR)
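
As an aside, the limiter above keeps a sliding one-hour window of timestamps per user id and only records a request when it is allowed. A small illustrative snippet (not part of app.py; the user id is made up):

# Illustrative only: exercising the RateLimiter defined above.
limiter = RateLimiter(max_requests_per_hour=3)
for attempt in range(5):
    ok = limiter.is_allowed("demo-user")
    print(f"attempt {attempt + 1}: allowed={ok}, remaining={limiter.get_remaining('demo-user')}")
# The first three attempts pass; later attempts are rejected until the
# recorded timestamps age out of the one-hour window.
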

+ # ==========================================
+ # Modal Connection
+ # ==========================================
+ import modal
+
+ def get_modal_function(function_name):
+     """Connect to Modal function"""
      try:
+         func = modal.Function.from_name("mcp-video-agent", function_name)
+         return func
      except Exception as e:
+         print(f"❌ Failed to connect to Modal: {e}")
          return None

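For context, `modal.Function.from_name` only returns a handle to a function in an already-deployed Modal app; the work happens when `.remote()` is called, which is how the analysis and TTS steps below use it. A minimal sketch, assuming the "mcp-video-agent" app is deployed and exposes the function names referenced later in this file (the sample question and filename are made up):

# Sketch only — relies on the deployed Modal app and the naming conventions used in this file.
analyze = get_modal_function("_internal_analyze_video")
if analyze is not None:
    # .remote() runs the function in Modal's cloud and blocks until it returns.
    answer = analyze.remote("What happens in this video?", video_filename="video_1700000000_abcd1234.mp4")
    print(answer)
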
  # ==========================================
  # Gradio Interface Logic
  # ==========================================

  # Cache for uploaded videos
  uploaded_videos_cache = {}

+ def process_interaction(user_message, history, video_file, username, request: gr.Request):
      """
+     Core chatbot logic with Modal backend and security.
      """
      if history is None:
          history = []

+     # Get user identifier for rate limiting
+     user_id = username  # Use authenticated username
+
+     # Check rate limit
+     if not rate_limiter.is_allowed(user_id):
+         remaining = rate_limiter.get_remaining(user_id)
+         yield history + [{"role": "assistant", "content": f"⚠️ Rate limit exceeded. You have {remaining} requests remaining this hour. Please try again later."}]
+         return
+
+     # Show remaining requests
+     remaining = rate_limiter.get_remaining(user_id)
+     print(f"📡 User {user_id}: {remaining} requests remaining this hour")

      # 1. Check video upload
      if video_file is None:

          yield history + [{"role": "assistant", "content": f"❌ Video too large! Size: {file_size_mb:.1f}MB. Please upload a video smaller than 100MB."}]
          return

+     # Generate unique filename
      with open(local_path, 'rb') as f:
          file_hash = hashlib.md5(f.read()).hexdigest()[:8]

+     timestamp = int(time.time())
+     unique_filename = f"video_{timestamp}_{file_hash}.mp4"
      cache_key = f"{local_path}_{file_hash}"

+     # 2. Upload to Modal Volume if needed
+     if cache_key not in uploaded_videos_cache:
+         history.append({"role": "user", "content": user_message})
+         history.append({"role": "assistant", "content": f"📤 Uploading video ({file_size_mb:.1f}MB)..."})
+         yield history
+
+         try:
+             import subprocess
+             result = subprocess.run(
+                 ["modal", "volume", "put", "video-storage", local_path, f"/{unique_filename}", "--force"],
+                 capture_output=True,
+                 text=True,
+                 timeout=300
+             )
+
+             if result.returncode != 0:
+                 history[-1] = {"role": "assistant", "content": f"❌ Upload failed: {result.stderr}"}
+                 yield history
+                 return
+
+             uploaded_videos_cache[cache_key] = unique_filename
+             print(f"✅ Video uploaded: {unique_filename}")
+         except Exception as e:
+             history[-1] = {"role": "assistant", "content": f"❌ Upload error: {str(e)}"}
+             yield history
+             return
      else:
+         unique_filename = uploaded_videos_cache[cache_key]
+         history.append({"role": "user", "content": user_message})
+         history.append({"role": "assistant", "content": "♻️ Using cached video..."})
+         yield history

+     # 3. Analyze video via Modal
+     history[-1] = {"role": "assistant", "content": "🤖 Analyzing video with Gemini..."}
      yield history

      try:
+         analyze_fn = get_modal_function("_internal_analyze_video")
+         if analyze_fn is None:
+             history[-1] = {"role": "assistant", "content": "❌ Failed to connect to Modal backend. Please check deployment."}
+             yield history
+             return
+
+         text_response = analyze_fn.remote(user_message, video_filename=unique_filename)
      except Exception as e:
          text_response = f"❌ Analysis error: {str(e)}"

      full_text_response = text_response

      # 4. Generate audio if successful

          yield history

          try:
+             speak_fn = get_modal_function("_internal_speak_text")
+             if speak_fn is None:
+                 history[-1] = {"role": "assistant", "content": f"⚠️ TTS unavailable.\n\n<div style='background: black; color: lime; padding: 20px; border-radius: 10px; white-space: normal; word-wrap: break-word;'>{full_text_response}</div>"}
+                 yield history
+                 return
+
+             audio_filename = f"audio_{unique_filename.replace('.mp4', '.mp3')}"
+             speak_fn.remote(text_response, audio_filename=audio_filename)

+             # Download audio
+             time.sleep(2)
+             import subprocess
+             local_audio = f"/tmp/{audio_filename}"
+
+             max_retries = 3
+             for retry in range(max_retries):
+                 result = subprocess.run(
+                     ["modal", "volume", "get", "video-storage", f"/{audio_filename}", local_audio],
+                     capture_output=True,
+                     text=True
+                 )

+                 if result.returncode == 0 and os.path.exists(local_audio) and os.path.getsize(local_audio) > 1000:
+                     break
+                 time.sleep(2)
+
+             if os.path.exists(local_audio) and os.path.getsize(local_audio) > 1000:
+                 with open(local_audio, 'rb') as f:
+                     audio_bytes = f.read()
+                 audio_base64 = base64.b64encode(audio_bytes).decode()
+
+                 response_content = f"""🎙️ **Audio Response** ({remaining} requests remaining this hour)

  <audio controls autoplay style="width: 100%; margin: 10px 0; background: #f0f0f0; border-radius: 5px;">
  <source src="data:audio/mpeg;base64,{audio_base64}" type="audio/mpeg">

  <div style="background-color: #000000; color: #00ff00; padding: 25px; border-radius: 10px; font-family: 'Courier New', monospace; line-height: 1.8; font-size: 14px; white-space: normal; word-wrap: break-word; overflow-wrap: break-word; max-width: 100%;">
  {full_text_response}
  </div>"""
+
+                 history[-1] = {"role": "assistant", "content": response_content}
+                 yield history
              else:
+                 history[-1] = {"role": "assistant", "content": f"⚠️ Audio generation incomplete.\n\n<div style='background: black; color: lime; padding: 20px; border-radius: 10px; white-space: normal; word-wrap: break-word;'>{full_text_response}</div>"}
                  yield history

          except Exception as e:
              history[-1] = {"role": "assistant", "content": f"❌ Audio error: {str(e)}\n\n<div style='background: black; color: lime; padding: 20px; border-radius: 10px; white-space: normal; word-wrap: break-word;'>{full_text_response}</div>"}
              yield history
      else:
          history[-1] = {"role": "assistant", "content": text_response}
          yield history


  # ==========================================
+ # Gradio Interface with Authentication
  # ==========================================

+ # Get credentials from environment
+ GRADIO_USERNAME = os.environ.get("GRADIO_USERNAME", "admin")
+ GRADIO_PASSWORD = os.environ.get("GRADIO_PASSWORD")
+
+ # Authentication function (optional for Hackathon/Demo)
+ def authenticate(username, password):
+     """Authenticate users - only if password is set"""
+     if GRADIO_PASSWORD is None:
+         # No password set, allow anyone (good for Hackathon/Demo)
+         return True
+     return username == GRADIO_USERNAME and password == GRADIO_PASSWORD
+
+ with gr.Blocks(title="🎥 MCP Video Agent") as demo:
      gr.Markdown("# 🎥 MCP Video Agent")
+     gr.Markdown("**🎉 MCP 1st Birthday Hackathon** | Track: MCP in Action (Consumer & Creative)")
+
+     gr.Markdown(f"""
+     ### ⚡ Key Innovation: Smart Frame Caching
+
+     **First Query**: Video is analyzed deeply and cached (~8-12 seconds)
+     **Follow-up Queries**: Instant responses using cached context (~2-3 seconds, 90% cost reduction!)
+     **Cache Duration**: 1 hour - ask multiple questions without reprocessing
+
+     ---

      ### 📋 How to Use
+
+     1. **Upload** a video (MP4, max 100MB)
+     2. **Ask** your first question - video will be analyzed and cached
+     3. **Continue** asking follow-up questions - experience the speed boost!
+     4. **Listen** to voice responses (powered by ElevenLabs TTS)
+
+     **Pro Tip**: After your first question, try asking 2-3 more to see how fast cached responses are!
+
+     ---
+
+     ### 🛡️ Fair Usage Policy
+
+     - **Rate Limit**: {MAX_REQUESTS_PER_HOUR} requests per hour per user
+     - **Video Size**: Max 100MB
+     - **Shared Resources**: This is a Hackathon demo - please use responsibly
+
+     ---
+
+     ### 🔧 Tech Stack
+
+     - **Gemini 2.5 Flash**: Multimodal video analysis + Context Caching
+     - **Modal**: Serverless backend + Persistent storage
+     - **ElevenLabs**: Neural text-to-speech
+     - **Gradio 6.0**: Interactive UI
+
+     **Sponsor Tech Used**: ✅ Modal | ✅ Google Gemini | ✅ ElevenLabs
      """)

+     username_state = gr.State("")
+
      with gr.Row():
          with gr.Column(scale=1):
              video_input = gr.Video(label="📹 Upload Video (MP4)", sources=["upload"])

          inputs=msg
      )

+     # Get username from Gradio request
+     def set_username(request: gr.Request):
+         return request.username if hasattr(request, 'username') else "anonymous"
+
+     demo.load(set_username, None, username_state)
+
      # Event handlers
      submit_btn.click(
          process_interaction,
+         inputs=[msg, chatbot, video_input, username_state],
          outputs=[chatbot]
      )

      msg.submit(
          process_interaction,
+         inputs=[msg, chatbot, video_input, username_state],
          outputs=[chatbot]
      )

  # ==========================================
+ # Launch with Authentication
  # ==========================================

  if __name__ == "__main__":
+     # Optional authentication (for Hackathon, usually not needed)
+     auth_config = None
+     if GRADIO_PASSWORD:
+         auth_config = authenticate
+         print(f"🔐 Authentication enabled. Username: {GRADIO_USERNAME}")
+     else:
+         print("🌐 Public access enabled (no authentication required)")
+         print("   Rate limiting active to prevent abuse")
+         print(f"   Limit: {MAX_REQUESTS_PER_HOUR} requests/hour per user")
+
      demo.launch(
+         auth=auth_config,
          show_error=True,
          share=False
      )
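
The Modal side of this deployment (the "mcp-video-agent" app, the "video-storage" volume, and the `_internal_analyze_video` / `_internal_speak_text` functions invoked via `.remote()` above) is not part of this diff. Purely for orientation, a skeleton consistent with those names might look like the sketch below; the decorator options, secret names, mount path, and function bodies are assumptions, not the project's actual backend code.

# Hypothetical backend skeleton, not taken from the repository.
import modal

app = modal.App("mcp-video-agent")
volume = modal.Volume.from_name("video-storage", create_if_missing=True)

@app.function(volumes={"/data": volume}, secrets=[modal.Secret.from_name("google-api-key")])
def _internal_analyze_video(query: str, video_filename: str) -> str:
    """Read the uploaded video from the shared volume and return Gemini's text answer."""
    video_path = f"/data/{video_filename}"
    # ... call Gemini on video_path here and return its response ...
    return f"(analysis of {video_path} for: {query})"

@app.function(volumes={"/data": volume}, secrets=[modal.Secret.from_name("elevenlabs-api-key")])
def _internal_speak_text(text: str, audio_filename: str) -> None:
    """Synthesize speech for the text and write the MP3 back to the volume."""
    # ... generate audio bytes and write them to f"/data/{audio_filename}" ...
    volume.commit()  # persist writes so the Space can download the file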