# Map-NEO / setup_project.py
# MAP-NEO Mini Configuration and Setup
# Configuration files and helper scripts
import json
from pathlib import Path
# Training configuration optimized for RTX 5070 8GB
TRAINING_CONFIG = {
    "model": {
        "vocab_size": 50257,
        "max_seq_len": 2048,
        "dim": 1024,
        "n_layers": 16,
        "n_heads": 16,
        "hidden_dim": 2736,
        "dropout": 0.0
    },
    "training": {
        "batch_size": 1,
        "gradient_accumulation_steps": 32,
        "max_steps": 50000,
        "warmup_steps": 2000,
        "learning_rate": 3e-4,
        "weight_decay": 0.01,
        "grad_clip": 1.0,
        "mixed_precision": "bf16",
        "gradient_checkpointing": True
    },
    "data": {
        "seq_length": 1024,
        "data_path": "data/tokens/packed_1024.txt"
    },
    "hardware": {
        "device": "cuda",
        "compile_model": False
    },
    "logging": {
        "log_interval": 10,
        "save_interval": 2000,
        "output_dir": "checkpoints"
    }
}
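
# With batch_size=1 and gradient_accumulation_steps=32, each optimizer step
# accumulates gradients over an effective batch of 32 sequences, i.e. roughly
# 32 * 1024 = 32,768 tokens at the configured seq_length of 1024.
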
# Data preprocessing configuration
DATA_CONFIG = {
    "num_docs": 20000,          # Start with 20k documents
    "seq_length": 1024,
    "tokenizer": "gpt2",        # Will switch to MAP-NEO tokenizer later
    "output_dir": "data",
    "min_text_length": 50,      # Filter out very short texts
    "max_text_length": 10000    # Filter out very long texts
}
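
# Illustrative sketch only: this shows how the length filters above might be
# applied per document. The actual filtering logic lives in data_prep.py,
# which is a separate script and may differ.
def passes_length_filter(text: str, cfg: dict = DATA_CONFIG) -> bool:
    """Return True if a document's character length is within the configured bounds."""
    return cfg["min_text_length"] <= len(text) <= cfg["max_text_length"]
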
def setup_project():
    """Create project directory structure"""
    directories = [
        "data/shards",
        "data/processed",
        "data/tokens",
        "checkpoints",
        "configs",
        "logs",
        "notebooks"
    ]
    for dir_path in directories:
        Path(dir_path).mkdir(parents=True, exist_ok=True)
        print(f"Created directory: {dir_path}")
def save_configs():
    """Save configuration files"""
    # Training config
    with open("configs/training_config.json", "w") as f:
        json.dump(TRAINING_CONFIG, f, indent=2)
    # Data config
    with open("configs/data_config.json", "w") as f:
        json.dump(DATA_CONFIG, f, indent=2)
    print("Configuration files saved to configs/")
def create_requirements_txt():
    """Create requirements.txt file"""
    requirements = [
        "torch>=2.0.0",
        "transformers>=4.35.0",
        "tokenizers>=0.14.0",
        "datasets>=2.14.0",
        "accelerate>=0.24.0",
        "sentencepiece>=0.1.99",
        "langdetect>=1.0.9",
        "zstandard>=0.21.0",
        "tqdm>=4.65.0",
        "numpy>=1.24.0",
        "matplotlib>=3.6.0",
        "tensorboard>=2.14.0"
    ]
    with open("requirements.txt", "w") as f:
        f.write("\n".join(requirements))
    print("Created requirements.txt")
def create_run_script():
    """Create a simple run script for training"""
    run_script = '''#!/usr/bin/env python3
# Run MAP-NEO Mini training pipeline
import subprocess
import sys
from pathlib import Path


def run_command(cmd, description):
    """Run a command and handle errors"""
    print(f"\\n{'='*50}")
    print(f"Running: {description}")
    print(f"Command: {cmd}")
    print(f"{'='*50}")
    result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    if result.returncode != 0:
        print(f"Error in {description}:")
        print(result.stderr)
        sys.exit(1)
    else:
        print(f"Success: {description}")
        if result.stdout:
            print(result.stdout)


def main():
    print("MAP-NEO Mini Training Pipeline")
    print("Optimized for RTX 5070 8GB VRAM")

    # Step 1: Data preprocessing
    if not Path("data/tokens/packed_1024.txt").exists():
        print("\\nStep 1: Data preprocessing")
        run_command(
            "python data_prep.py --num_docs 20000 --seq_length 1024",
            "Data preprocessing"
        )
    else:
        print("\\nSkipping data preprocessing (data exists)")

    # Step 2: Model training
    print("\\nStep 2: Starting model training")
    run_command(
        "python train_neo.py",
        "Model training"
    )

    print("\\n" + "="*50)
    print("Training pipeline completed!")
    print("Check checkpoints/ directory for saved models")
    print("="*50)


if __name__ == "__main__":
    main()
'''
    with open("run_training.py", "w") as f:
        f.write(run_script)
    print("Created run_training.py script")
if __name__ == "__main__":
    print("Setting up MAP-NEO Mini project...")
    setup_project()
    save_configs()
    create_requirements_txt()
    create_run_script()
    print("\nProject setup complete!")
    print("\nNext steps:")
    print("1. Run: python data_prep.py --num_docs 10000")
    print("2. Run: python train_neo.py")
    print("3. Or use: python run_training.py")