import gradio as gr
import whisper
import yt_dlp
from transformers import pipeline
import tempfile
import os
import json

# Cache models at module level so repeated requests reuse loaded weights.
MODEL = None
CLASSIFIER = None


def load_models():
    """Lazily load and cache the Whisper ASR model and the zero-shot classifier.

    Returns:
        tuple: (whisper model, transformers zero-shot-classification pipeline).
    """
    global MODEL, CLASSIFIER
    if MODEL is None:
        print("Loading models...")
        MODEL = whisper.load_model("base")
        CLASSIFIER = pipeline(
            "zero-shot-classification", model="facebook/bart-large-mnli"
        )
    return MODEL, CLASSIFIER


def convert_cookies_to_single_line():
    """Utility function to convert cookies.txt to single-line format.

    Prints the escaped content so it can be pasted into a Hugging Face
    Space secret (YOUTUBE_COOKIES_TXT). Returns the single-line string,
    or None if cookies.txt does not exist.
    """
    try:
        with open("cookies.txt") as f:
            single_line = f.read().replace("\n", "\\n")
        print("Copy this to Hugging Face Secrets (YOUTUBE_COOKIES_TXT):")
        print(single_line)
        return single_line
    except FileNotFoundError:
        print("Error: cookies.txt file not found")
        return None


def setup_cookies():
    """Materialize cookies.txt from the YOUTUBE_COOKIES_TXT env var.

    Returns:
        bool: True if a cookies.txt file was written from the env var,
        False if the env var is unset (no file is created in that case).
    """
    cookies_txt = os.getenv('YOUTUBE_COOKIES_TXT')
    if not cookies_txt:
        return False
    with open('cookies.txt', 'w') as f:
        # The secret stores newlines escaped as "\n"; restore them here.
        f.write(cookies_txt.replace("\\n", "\n"))
    return True


def normalize_youtube_url(url):
    """Convert various YouTube URL formats to a standard watch URL.

    Returns the normalized URL, or None when the input is not a
    youtube.com/watch or youtu.be link.
    """
    url = url.strip()
    # Handle youtu.be short links: the video id is the last path segment.
    if 'youtu.be' in url.lower():
        video_id = url.split('/')[-1].split('?')[0]
        return f'https://www.youtube.com/watch?v={video_id}'
    # Ensure URL is in standard format.
    if 'youtube.com/watch' not in url.lower():
        return None
    return url.split('&')[0]  # Drop extra query parameters after v=


def analyze_video(yt_url):
    """Download a YouTube video's audio, transcribe it, and classify the topic.

    Args:
        yt_url: A youtube.com/watch or youtu.be URL.

    Returns:
        tuple: (transcription_or_error_message, top_label, confidence_score).
        On failure the first element starts with "Error:" and the other two
        are "" and 0, matching the Gradio output widgets.
    """
    try:
        # Normalize and validate URL first so we fail fast on bad input.
        normalized_url = normalize_youtube_url(yt_url)
        if not normalized_url:
            return "Error: Invalid YouTube URL. Must be from youtube.com or youtu.be", "", 0

        model, classifier = load_models()
        has_cookies = setup_cookies()

        # Use a temp directory and let yt-dlp pick the download extension.
        # BUG FIX: the old code pointed outtmpl at a path already ending in
        # ".mp3" while the FFmpegExtractAudio postprocessor appends ".mp3"
        # itself, so the converted file ended up at "<tmp>.mp3.mp3" and
        # Whisper was handed the wrong (empty) path, leaking the download.
        with tempfile.TemporaryDirectory() as tmp_dir:
            outtmpl = os.path.join(tmp_dir, "audio.%(ext)s")
            audio_path = os.path.join(tmp_dir, "audio.mp3")
            try:
                ydl_opts = {
                    'format': 'bestaudio/best',
                    'outtmpl': outtmpl,
                    'quiet': True,
                    'postprocessors': [{
                        'key': 'FFmpegExtractAudio',
                        'preferredcodec': 'mp3',
                        'preferredquality': '192',
                    }],
                    'http_headers': {
                        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
                        'Accept-Language': 'en-US,en;q=0.9',
                        'Referer': 'https://www.youtube.com/'
                    },
                    'socket_timeout': 30,
                    'noplaylist': True,
                    'verbose': False
                }

                if has_cookies:
                    # BUG FIX: do not set 'extract_flat' here — flat
                    # extraction returns playlist stubs and is wrong for
                    # downloading a single video.
                    ydl_opts['cookiefile'] = 'cookies.txt'

                with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                    try:
                        info = ydl.extract_info(normalized_url, download=False)
                        if not info.get('url') and not info.get('requested_downloads'):
                            return "Error: Failed to extract video info. Cookies may be invalid.", "", 0
                        ydl.download([normalized_url])
                    except yt_dlp.utils.DownloadError as e:
                        if "Sign in to confirm you're not a bot" in str(e):
                            return "Error: YouTube requires authentication. Please ensure cookies are fresh and valid.", "", 0
                        raise

                if not os.path.exists(audio_path):
                    return "Error: Audio download failed.", "", 0

                result = model.transcribe(audio_path)
                transcription = result["text"]

                labels = ["educational", "entertainment", "news",
                          "political", "religious", "technical"]
                # BART-MNLI has a bounded input length; feed the classifier a
                # prefix of long transcripts instead of letting it crash.
                classification = classifier(
                    transcription[:4000],
                    candidate_labels=labels,
                    hypothesis_template="This content is about {}."
                )

                return (
                    transcription,
                    classification["labels"][0],
                    round(classification["scores"][0], 3),
                )
            finally:
                # BUG FIX: only remove cookies.txt when WE wrote it from the
                # env var; never delete a user-provided cookies file.
                if has_cookies and os.path.exists('cookies.txt'):
                    os.remove('cookies.txt')

    except Exception as e:
        # Top-level boundary: surface the failure in the UI instead of a 500.
        return f"Error: {str(e)}", "", 0


with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🎬 YouTube Content Analyzer")
    with gr.Row():
        url = gr.Textbox(
            label="YouTube URL",
            placeholder="https://www.youtube.com/watch?v=... or https://youtu.be/...",
        )
        btn = gr.Button("Analyze", variant="primary")
    with gr.Row():
        transcription = gr.Textbox(label="Transcription", interactive=False, lines=5)
        with gr.Column():
            label = gr.Label(label="Category")
            # precision=3 matches the round(..., 3) returned by analyze_video.
            confidence = gr.Number(label="Confidence Score", precision=3)
    btn.click(analyze_video, inputs=url, outputs=[transcription, label, confidence])

if __name__ == "__main__":
    if os.path.exists("cookies.txt"):
        convert_cookies_to_single_line()
    demo.launch()