import os import torch import torchaudio import psutil import time import sys import numpy as np import gc import gradio as gr from pydub import AudioSegment from audiocraft.models import MusicGen from torch.cuda.amp import autocast # Set PYTORCH_CUDA_ALLOC_CONF to manage memory fragmentation os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:32" # Setup instructions: # 1. Create a virtual environment: python3 -m venv venv # 2. Activate the environment: source venv/bin/activate # 3. Install dependencies: # pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 --index-url https://download.pytorch.org/whl/cu121 # pip install -r requirements.txt # 4. Install ffmpeg for MP3 export: # sudo apt-get install ffmpeg # 5. Ensure model weights are in /home/ubuntu/ghostai_music_generator/models/musicgen-medium # 6. Log in to Hugging Face CLI: huggingface-cli login # 7. Request access to model if needed: https://huggingface.co/facebook/musicgen-medium # Check critical dependencies if np.__version__ != "1.23.5": print(f"ERROR: NumPy version {np.__version__} is not compatible. Please install numpy==1.23.5.") sys.exit(1) if not torch.__version__.startswith(("2.1.0", "2.3.1")): print(f"WARNING: PyTorch version {torch.__version__} may not be compatible. Expected torch==2.1.0 or 2.3.1.") # 1) DEVICE SETUP device = "cuda" if torch.cuda.is_available() else "cpu" if device != "cuda": print("ERROR: CUDA is required for GPU rendering. CPU rendering is disabled to avoid slow performance.") sys.exit(1) print(f"CUDA is available. Using GPU: {torch.cuda.get_device_name(0)}") # 2) LOAD MUSICGEN INTO VRAM try: print("Loading MusicGen model into VRAM...") local_model_path = "/home/ubuntu/ghostai_music_generator/models/musicgen-medium" if not os.path.exists(local_model_path): print(f"ERROR: Local model path {local_model_path} does not exist. Please ensure the model weights are downloaded.") sys.exit(1) musicgen_model = MusicGen.get_pretrained(local_model_path, device=device) except Exception as e: print(f"ERROR: Failed to load MusicGen model: {e}") print("Please ensure the model weights are in the correct path and dependencies are installed.") sys.exit(1) # 3) RESOURCE MONITORING FUNCTION def print_resource_usage(stage: str): print(f"--- {stage} ---") print(f"GPU Memory Allocated: {torch.cuda.memory_allocated() / (1024**3):.2f} GB") print(f"GPU Memory Reserved: {torch.cuda.memory_reserved() / (1024**3):.2f} GB") print("---------------") # 4) GENRE PROMPT FUNCTIONS (Updated for distinct song sections with balanced energy) def set_rock_prompt(): return "Hard rock with a dynamic intro, expressive verse, and powerful chorus, featuring electric guitars, steady heavy drums, and deep bass" def set_techno_prompt(): return "Techno with a pulsing intro, intense build-up, and energetic drop, featuring dark synths, driving bass, and rhythmic fast drums" def set_jazz_prompt(): return "Smooth jazz with a warm intro, expressive theme, and lively outro, featuring saxophone, piano, and soft rhythmic drums" def set_classical_prompt(): return "Classical orchestral piece with a gentle intro, dramatic development, and grand finale, featuring strings, piano, and subtle rhythmic percussion" def set_hiphop_prompt(): return "Hip-hop with a groovy intro, rhythmic verse, and catchy hook, featuring deep bass, tight crisp drums, and funky synths" # 5) AUDIO PROCESSING FUNCTIONS def apply_chorus(segment): # Enhanced chorus effect for richer sound delayed = segment - 4 # Slightly louder delayed copy at -4 dB (was -6 dB) delayed = delayed.set_frame_rate(segment.frame_rate) return segment.overlay(delayed, position=20) # Increased delay to 20ms for more noticeable effect def apply_eq(segment): # Apply EQ: Balanced filters for cleaner sound segment = segment.low_pass_filter(6000) # Loosen low-pass filter for brighter highs segment = segment.high_pass_filter(100) # Slightly lower cutoff for low rumble return segment def apply_limiter(segment, max_db=-8.0): # Apply a strict limiter to control peaks if segment.dBFS > max_db: segment = segment - (segment.dBFS - max_db) # Reduce gain to prevent clipping return segment # 6) GENERATION & I/O FUNCTIONS def generate_music(instrumental_prompt: str, cfg_scale: float, top_k: int, top_p: float, temperature: float, total_duration: int, crossfade_duration: int): global musicgen_model # Use global to modify the model instance if not instrumental_prompt.strip(): return None, "⚠️ Please enter a valid instrumental prompt!" try: # Record start time for total duration calculation start_time = time.time() # Ensure total duration is within reasonable bounds total_duration = min(max(total_duration, 10), 60) # Between 10 and 60 seconds chunk_duration = 15 # Duration of each chunk (e.g., 2 chunks of 15 seconds for a 30-second track) num_chunks = max(2, (total_duration + chunk_duration - 1) // chunk_duration) # Ensure at least 2 chunks chunk_duration = total_duration / num_chunks # Adjust chunk duration to fit total duration # Generate slightly longer chunks for overlap overlap_duration = min(1.0, crossfade_duration / 1000.0) # Convert ms to seconds, max 1 second generation_duration = chunk_duration + overlap_duration # Add overlap to each chunk # Initialize list to store audio chunks audio_chunks = [] sample_rate = musicgen_model.sample_rate # Generate audio in chunks with distinct prompts for song structure for i in range(num_chunks): # Vary the prompt for each chunk to create clear song sections if i == 0: chunk_prompt = instrumental_prompt + ", focusing on a dynamic intro and expressive verse" else: chunk_prompt = instrumental_prompt + ", focusing on a powerful chorus and energetic outro" print(f"Generating chunk {i+1}/{num_chunks} on GPU (prompt: {chunk_prompt})...") musicgen_model.set_generation_params( duration=generation_duration, use_sampling=True, top_k=top_k, top_p=top_p, temperature=temperature, cfg_coef=cfg_scale ) # Print resource usage before each chunk print_resource_usage(f"Before Chunk {i+1} Generation") with torch.no_grad(): with autocast(): # Use mixed precision for lower VRAM usage audio_chunk = musicgen_model.generate([chunk_prompt], progress=True)[0] # Shape: (2, samples) # Normalize to stereo audio_chunk = audio_chunk.cpu().to(dtype=torch.float32) if audio_chunk.dim() == 1: audio_chunk = torch.stack([audio_chunk, audio_chunk], dim=0) # Mono to stereo elif audio_chunk.dim() == 2 and audio_chunk.shape[0] == 1: audio_chunk = torch.cat([audio_chunk, audio_chunk], dim=0) elif audio_chunk.dim() == 2 and audio_chunk.shape[0] != 2: audio_chunk = audio_chunk[:1, :] audio_chunk = torch.cat([audio_chunk, audio_chunk], dim=0) elif audio_chunk.dim() > 2: audio_chunk = audio_chunk.view(2, -1) if audio_chunk.shape[0] != 2: raise ValueError(f"Expected stereo audio with shape (2, samples), got shape {audio_chunk.shape}") # Save chunk temporarily as WAV, then convert to MP3 temp_wav_path = f"temp_chunk_{i}.wav" chunk_path = f"chunk_{i}.mp3" torchaudio.save(temp_wav_path, audio_chunk, sample_rate, bits_per_sample=24) segment = AudioSegment.from_wav(temp_wav_path) segment = apply_limiter(segment, max_db=-8.0) # Apply limiter early to control peaks segment.export(chunk_path, format="mp3", bitrate="320k") os.remove(temp_wav_path) # Clean up temporary WAV file audio_chunks.append(chunk_path) # Free memory after generating each chunk torch.cuda.empty_cache() gc.collect() time.sleep(0.5) # Small delay to ensure memory is released print_resource_usage(f"After Chunk {i+1} Generation") # Combine chunks using pydub with crossfade to smooth transitions print("Combining audio chunks...") final_segment = AudioSegment.from_mp3(audio_chunks[0]) for i in range(1, len(audio_chunks)): next_segment = AudioSegment.from_mp3(audio_chunks[i]) # Apply a gentle gain boost to raise quieter sections next_segment = next_segment + 2 # Boost by 2 dB to avoid amplitude dips next_segment = apply_limiter(next_segment, max_db=-8.0) # Re-apply limiter after gain boost final_segment = final_segment.append(next_segment, crossfade=crossfade_duration) # Trim to exact total duration (in milliseconds) final_segment = final_segment[:total_duration * 1000] # Post-process to enhance audio quality print("Post-processing final track...") final_segment = apply_eq(final_segment) final_segment = apply_chorus(final_segment) # Apply final limiter and normalization with large headroom final_segment = apply_limiter(final_segment, max_db=-8.0) final_segment = final_segment.normalize(headroom=-10.0) # Large headroom to prevent clipping # Export as MP3 only mp3_path = "output_cleaned.mp3" final_segment.export( mp3_path, format="mp3", bitrate="320k", tags={"title": "GhostAI Instrumental", "artist": "GhostAI"} ) print(f"Saved final audio to {mp3_path}") # Clean up temporary chunk files for chunk_path in audio_chunks: os.remove(chunk_path) # Print resource usage after generation print_resource_usage("After Final Generation") print(f"Total Generation Time: {time.time() - start_time:.2f} seconds") return mp3_path, "✅ Done!" except Exception as e: return None, f"❌ Generation failed: {e}" finally: torch.cuda.empty_cache() gc.collect() def clear_inputs(): return "", 3.0, 300, 0.95, 1.0, 30, 500 # 7) CUSTOM CSS css = """ body { background: linear-gradient(135deg, #0A0A0A 0%, #1C2526 100%); color: #E0E0E0; font-family: 'Orbitron', sans-serif; margin: 0; padding: 0; } .header-container { text-align: center; padding: 15px 20px; background: rgba(0, 0, 0, 0.9); border-bottom: 1px solid #00FF9F; box-shadow: 0 0 10px rgba(161, 0, 255, 0.3); } #ghost-logo { font-size: 60px; display: block; margin: 0 auto; animation: glitch-ghost 1.5s infinite; text-shadow: 0 0 10px #A100FF, 0 0 20px #00FF9F; } h1 { color: #A100FF; font-size: 28px; margin: 5px 0; text-shadow: 0 0 5px #A100FF, 0 0 10px #00FF9F; animation: glitch-text 2s infinite; } p { color: #E0E0E0; font-size: 14px; margin: 5px 0; } .input-container { max-width: 1000px; margin: 20px auto; padding: 20px; background: rgba(28, 37, 38, 0.8); border-radius: 10px; box-shadow: 0 0 15px rgba(0, 255, 159, 0.3); } .textbox { background: #1A1A1A; border: 1px solid #A100FF; color: #E0E0E0; border-radius: 5px; padding: 10px; margin-bottom: 20px; } .genre-buttons { display: flex; justify-content: center; gap: 15px; margin-bottom: 20px; } .genre-btn { background: linear-gradient(45deg, #A100FF, #00FF9F); border: none; color: #0A0A0A; font-weight: bold; padding: 10px 20px; border-radius: 5px; transition: transform 0.3s ease, box-shadow 0.3s ease; } .genre-btn:hover { transform: scale(1.05); box-shadow: 0 0 15px #00FF9F; } .settings-container { max-width: 1000px; margin: 20px auto; padding: 20px; background: rgba(28, 37, 38, 0.8); border-radius: 10px; box-shadow: 0 0 15px rgba(0, 255, 159, 0.3); } .action-buttons { display: flex; justify-content: center; gap: 20px; margin-top: 20px; } button { background: linear-gradient(45deg, #A100FF, #00FF9F); border: none; color: #0A0A0A; font-weight: bold; padding: 12px 24px; border-radius: 5px; transition: transform 0.3s ease, box-shadow 0.3s ease; } button:hover { transform: scale(1.05); box-shadow: 0 0 15px #00FF9F; } .output-container { max-width: 1000px; margin: 20px auto; padding: 20px; background: rgba(28, 37, 38, 0.8); border-radius: 10px; box-shadow: 0 0 15px rgba(0, 255, 159, 0.3); text-align: center; } @keyframes glitch-ghost { 0% { transform: translate(0, 0); opacity: 1; } 20% { transform: translate(-5px, 2px); opacity: 0.8; } 40% { transform: translate(5px, -2px); opacity: 0.6; } 60% { transform: translate(-3px, 1px); opacity: 0.9; } 80% { transform: translate(3px, -1px); opacity: 0.7; } 100% { transform: translate(0, 0); opacity: 1; } } @keyframes glitch-text { 0% { transform: translate(0, 0); text-shadow: 0 0 10px #A100FF, 0 0 20px #00FF9F; } 20% { transform: translate(-2px, 1px); text-shadow: 0 0 15px #00FF9F, 0 0 25px #A100FF; } 40% { transform: translate(2px, -1px); text-shadow: 0 0 10px #A100FF, 0 0 30px #00FF9F; } 60% { transform: translate(-1px, 2px); text-shadow: 0 0 15px #00FF9F, 0 0 20px #A100FF; } 80% { transform: translate(1px, -2px); text-shadow: 0 0 10px #A100FF, 0 0 25px #00FF9F; } 100% { transform: translate(0, 0); text-shadow: 0 0 10px #A100FF, 0 0 20px #00FF9F; } } @font-face { font-family: 'Orbitron'; src: url('https://fonts.gstatic.com/s/orbitron/v29/yMJRMIlzdpvBhQQL_Qq7dy0.woff2') format('woff2'); } """ # 8) BUILD WITH BLOCKS with gr.Blocks(css=css) as demo: gr.Markdown("""
Summon the Sound of the Unknown