# scale_data.py - Scale up MAP-NEO Mini training data
import subprocess
import sys
import time
from pathlib import Path


def scale_training_data():
    """Scale the dataset to ~50,000 documents; return the packed-file path, or None on failure."""
    print("🚀 MAP-NEO Mini Data Scaling")
    print("=" * 50)
    print("Target: 50,000 documents (10x current scale)")
    print("Expected result: ~25,000 training sequences")
    print("Estimated time: 45-60 minutes")
    print("=" * 50)

    # Check if we already have the large dataset
    large_data = Path("data/tokens/packed_1024_large.txt")
    if large_data.exists():
        print("✅ Large dataset already exists!")
        print(f"Found: {large_data}")
        return str(large_data)
    # Back up the existing small dataset so it is not overwritten
    small_data = Path("data/tokens/packed_1024.txt")
    if small_data.exists():
        backup_path = Path("data/tokens/packed_1024_small_backup.txt")
        print(f"📁 Backing up current dataset to: {backup_path}")
        small_data.rename(backup_path)

    # Process 50k documents
    print("\n🔄 Starting data processing...")
    print("This will download and process 50,000 English documents")

    cmd = [
        sys.executable, "data_prep.py",
        "--num_docs", "50000",
        "--seq_length", "1024"
    ]
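    # NOTE: assumes data_prep.py lives alongside this script, accepts the
    # --num_docs / --seq_length flags above, and writes its output to
    # data/tokens/packed_1024.txt (renamed below once it finishes).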

    start_time = time.time()
    try:
        # Let data_prep.py stream its own progress output to the console
        subprocess.run(cmd, check=True)
        elapsed = time.time() - start_time
        print(f"\n✅ Data scaling completed in {elapsed / 60:.1f} minutes!")

        # Rename for clarity
        old_path = Path("data/tokens/packed_1024.txt")
        new_path = Path("data/tokens/packed_1024_large.txt")
        if old_path.exists():
            old_path.rename(new_path)
            print(f"📊 Large dataset saved as: {new_path}")

            # Count sequences (one packed sequence per line)
            with open(new_path, "r", encoding="utf-8") as f:
                seq_count = sum(1 for _ in f)
            print(f"📈 Total sequences: {seq_count:,}")

            return str(new_path)
        else:
            print("❌ Expected output file not found")
            return None
    except subprocess.CalledProcessError as e:
        print("❌ Error in data processing:")
        print(f"Return code: {e.returncode}")
        return None
    except KeyboardInterrupt:
        print("\n⏹️ Process interrupted by user")
        return None


def update_training_config():
    """Update train_neo.py to use large dataset"""
    print("\n🔧 Updating training configuration...")

    train_file = Path("train_neo.py")
    if not train_file.exists():
        print("❌ train_neo.py not found")
        return

    # Read current file
    content = train_file.read_text(encoding='utf-8')

    # Update data path and training steps
    old_data_path = 'data_path: str = "data/tokens/packed_1024.txt"'
    new_data_path = 'data_path: str = "data/tokens/packed_1024_large.txt"'
    old_max_steps = 'max_steps: int = 50000'
    new_max_steps = 'max_steps: int = 100000'

    if old_data_path in content:
        content = content.replace(old_data_path, new_data_path)
        print("✅ Updated data_path to use large dataset")
    if old_max_steps in content:
        content = content.replace(old_max_steps, new_max_steps)
        print("✅ Updated max_steps to 100,000 for extended training")

    # Write back
    train_file.write_text(content, encoding='utf-8')
    print("💾 Training configuration updated!")


def main():
    print("MAP-NEO Mini Data Scaling Pipeline")

    # Scale data
    result = scale_training_data()

    if result:
        # Update config
        update_training_config()

        print("\n" + "=" * 60)
        print("🎉 DATA SCALING COMPLETE!")
        print("=" * 60)
        print("Next steps:")
        print("1. Your large dataset is ready for training")
        print("2. Training config updated for 100k steps")
        print("3. Run: python train_neo.py")
        print("4. Expected training time: ~3-4 hours")
        print("5. Expected quality: Much more coherent text!")
        print("=" * 60)
    else:
        print("\n❌ Data scaling failed. Check the errors above.")


if __name__ == "__main__":
    main()
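
# Usage (assumed project layout): run from the project root, where
# data_prep.py and train_neo.py are expected to live:
#   python scale_data.py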