# scale_data.py - Scale up MAP-NEO Mini training data

import subprocess
import sys
import time
from pathlib import Path


def scale_training_data():
    print("šŸš€ MAP-NEO Mini Data Scaling")
    print("=" * 50)
    print("Target: 50,000 documents (10x current scale)")
    print("Expected result: ~25,000 training sequences")
    print("Estimated time: 45-60 minutes")
    print("=" * 50)

    # Check if we already have the large dataset
    large_data = Path("data/tokens/packed_1024_large.txt")
    if large_data.exists():
        print("āœ… Large dataset already exists!")
        print(f"Found: {large_data}")
        return str(large_data)

    # Check if the small dataset exists (back it up before reprocessing)
    small_data = Path("data/tokens/packed_1024.txt")
    if small_data.exists():
        backup_path = Path("data/tokens/packed_1024_small_backup.txt")
        print(f"šŸ“ Backing up current dataset to: {backup_path}")
        small_data.rename(backup_path)

    # Process 50k documents
    print("\nšŸ”„ Starting data processing...")
    print("This will download and process 50,000 English documents")

    cmd = [
        sys.executable, "data_prep.py",
        "--num_docs", "50000",
        "--seq_length", "1024"
    ]

    start_time = time.time()
    try:
        subprocess.run(cmd, check=True, capture_output=False, text=True)
        elapsed = time.time() - start_time
        print(f"\nāœ… Data scaling completed in {elapsed / 60:.1f} minutes!")

        # Rename for clarity
        old_path = Path("data/tokens/packed_1024.txt")
        new_path = Path("data/tokens/packed_1024_large.txt")
        if old_path.exists():
            old_path.rename(new_path)
            print(f"šŸ“Š Large dataset saved as: {new_path}")

            # Count sequences
            with open(new_path, 'r') as f:
                seq_count = sum(1 for _ in f)
            print(f"šŸ“ˆ Total sequences: {seq_count:,}")

            return str(new_path)
        else:
            print("āŒ Expected output file not found")
            return None

    except subprocess.CalledProcessError as e:
        print("āŒ Error in data processing:")
        print(f"Return code: {e.returncode}")
        return None
    except KeyboardInterrupt:
        print("\nā¹ļø Process interrupted by user")
        return None


def update_training_config():
    """Update train_neo.py to use the large dataset."""
    print("\nšŸ”§ Updating training configuration...")

    train_file = Path("train_neo.py")
    if not train_file.exists():
        print("āŒ train_neo.py not found")
        return

    # Read current file
    content = train_file.read_text(encoding='utf-8')

    # Update data path and training steps
    old_data_path = 'data_path: str = "data/tokens/packed_1024.txt"'
    new_data_path = 'data_path: str = "data/tokens/packed_1024_large.txt"'
    old_max_steps = 'max_steps: int = 50000'
    new_max_steps = 'max_steps: int = 100000'

    if old_data_path in content:
        content = content.replace(old_data_path, new_data_path)
        print("āœ… Updated data_path to use large dataset")

    if old_max_steps in content:
        content = content.replace(old_max_steps, new_max_steps)
        print("āœ… Updated max_steps to 100,000 for extended training")

    # Write back
    train_file.write_text(content, encoding='utf-8')
    print("šŸ’¾ Training configuration updated!")


def main():
    print("MAP-NEO Mini Data Scaling Pipeline")

    # Scale data
    result = scale_training_data()

    if result:
        # Update config
        update_training_config()

        print("\n" + "=" * 60)
        print("šŸŽ‰ DATA SCALING COMPLETE!")
        print("=" * 60)
        print("Next steps:")
        print("1. Your large dataset is ready for training")
        print("2. Training config updated for 100k steps")
        print("3. Run: python train_neo.py")
        print("4. Expected training time: ~3-4 hours")
        print("5. Expected quality: Much more coherent text!")
        print("=" * 60)
    else:
        print("\nāŒ Data scaling failed. Check the errors above.")


if __name__ == "__main__":
    main()