# scale_data.py - Scale up MAP-NEO Mini training data
import subprocess
import sys
import time
from pathlib import Path


def scale_training_data():
    """Scale the dataset to ~50,000 documents; return the packed-file path, or None on failure."""
    print("🚀 MAP-NEO Mini Data Scaling")
    print("=" * 50)
    print("Target: 50,000 documents (10x current scale)")
    print("Expected result: ~25,000 training sequences")
    print("Estimated time: 45-60 minutes")
    print("=" * 50)

    # Check if we already have the large dataset
    large_data = Path("data/tokens/packed_1024_large.txt")
    if large_data.exists():
        print("✅ Large dataset already exists!")
        print(f"Found: {large_data}")
        return str(large_data)
    # Back up the existing small dataset so it is not overwritten
    small_data = Path("data/tokens/packed_1024.txt")
    if small_data.exists():
        backup_path = Path("data/tokens/packed_1024_small_backup.txt")
        print(f"📁 Backing up current dataset to: {backup_path}")
        small_data.rename(backup_path)

    # Process 50k documents
    print("\n🔄 Starting data processing...")
    print("This will download and process 50,000 English documents")

    cmd = [
        sys.executable, "data_prep.py",
        "--num_docs", "50000",
        "--seq_length", "1024"
    ]
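    # NOTE: assumes data_prep.py lives alongside this script, accepts the
    # --num_docs / --seq_length flags above, and writes its output to
    # data/tokens/packed_1024.txt (renamed below once it finishes).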

    start_time = time.time()
    try:
        # Let data_prep.py stream its own progress output to the console
        subprocess.run(cmd, check=True)
        elapsed = time.time() - start_time
        print(f"\n✅ Data scaling completed in {elapsed / 60:.1f} minutes!")

        # Rename for clarity
        old_path = Path("data/tokens/packed_1024.txt")
        new_path = Path("data/tokens/packed_1024_large.txt")
        if old_path.exists():
            old_path.rename(new_path)
            print(f"📊 Large dataset saved as: {new_path}")

            # Count sequences (one packed sequence per line)
            with open(new_path, "r", encoding="utf-8") as f:
                seq_count = sum(1 for _ in f)
            print(f"📈 Total sequences: {seq_count:,}")

            return str(new_path)
        else:
            print("❌ Expected output file not found")
            return None
    except subprocess.CalledProcessError as e:
        print("❌ Error in data processing:")
        print(f"Return code: {e.returncode}")
        return None
    except KeyboardInterrupt:
        print("\n⏹️ Process interrupted by user")
        return None


def update_training_config():
    """Update train_neo.py to use large dataset"""
    print("\n🔧 Updating training configuration...")

    train_file = Path("train_neo.py")
    if not train_file.exists():
        print("❌ train_neo.py not found")
        return

    # Read current file
    content = train_file.read_text(encoding='utf-8')

    # Update data path and training steps
    old_data_path = 'data_path: str = "data/tokens/packed_1024.txt"'
    new_data_path = 'data_path: str = "data/tokens/packed_1024_large.txt"'
    old_max_steps = 'max_steps: int = 50000'
    new_max_steps = 'max_steps: int = 100000'

    if old_data_path in content:
        content = content.replace(old_data_path, new_data_path)
        print("✅ Updated data_path to use large dataset")
    if old_max_steps in content:
        content = content.replace(old_max_steps, new_max_steps)
        print("✅ Updated max_steps to 100,000 for extended training")

    # Write back
    train_file.write_text(content, encoding='utf-8')
    print("💾 Training configuration updated!")


def main():
    print("MAP-NEO Mini Data Scaling Pipeline")

    # Scale data
    result = scale_training_data()

    if result:
        # Update config
        update_training_config()

        print("\n" + "=" * 60)
        print("🎉 DATA SCALING COMPLETE!")
        print("=" * 60)
        print("Next steps:")
        print("1. Your large dataset is ready for training")
        print("2. Training config updated for 100k steps")
        print("3. Run: python train_neo.py")
        print("4. Expected training time: ~3-4 hours")
        print("5. Expected quality: Much more coherent text!")
        print("=" * 60)
    else:
        print("\n❌ Data scaling failed. Check the errors above.")


if __name__ == "__main__":
    main()
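
# Usage (assumed project layout): run from the project root, where
# data_prep.py and train_neo.py are expected to live:
#   python scale_data.py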