|
|
|
|
|
|
|
|
|
|
|
import json
|
|
|
from pathlib import Path
|
|
|
|
|
|
|
|
|
|
|
|
# Hyperparameters for the MAP-NEO Mini training run; serialized to
# configs/training_config.json by save_configs().
TRAINING_CONFIG = {
    "model": {
        "vocab_size": 50257,   # GPT-2 tokenizer vocabulary size (DATA_CONFIG uses "gpt2")
        "max_seq_len": 2048,   # model's maximum context; data is packed at 1024 (see "data")
        "dim": 1024,           # hidden / embedding width
        "n_layers": 16,
        "n_heads": 16,
        "hidden_dim": 2736,    # FFN inner width — presumably ~8/3*dim (SwiGLU sizing); TODO confirm
        "dropout": 0.0
    },
    "training": {
        "batch_size": 1,                    # micro-batch size; effective batch = 1 * 32 accumulation steps
        "gradient_accumulation_steps": 32,
        "max_steps": 50000,
        "warmup_steps": 2000,
        "learning_rate": 3e-4,
        "weight_decay": 0.01,
        "grad_clip": 1.0,                   # gradient-norm clipping threshold
        "mixed_precision": "bf16",
        "gradient_checkpointing": True      # trades recompute for lower VRAM usage
    },
    "data": {
        "seq_length": 1024,
        "data_path": "data/tokens/packed_1024.txt"
    },
    "hardware": {
        "device": "cuda",
        "compile_model": False
    },
    "logging": {
        "log_interval": 10,        # steps between log lines
        "save_interval": 2000,     # steps between checkpoint saves
        "output_dir": "checkpoints"
    }
}
|
|
|
|
|
|
|
|
|
# Settings for the data-preparation step; serialized to
# configs/data_config.json by save_configs().
DATA_CONFIG = {
    "num_docs": 20000,         # matches the --num_docs used in the generated run script
    "seq_length": 1024,        # must agree with TRAINING_CONFIG["data"]["seq_length"]
    "tokenizer": "gpt2",
    "output_dir": "data",
    "min_text_length": 50,     # document filter bounds — presumably characters; TODO confirm units
    "max_text_length": 10000
}
|
|
|
|
|
|
|
|
|
def setup_project():
    """Create the project's directory tree.

    Existing directories are left untouched (``exist_ok=True``); each
    created path is echoed to stdout.
    """
    layout = (
        "data/shards",
        "data/processed",
        "data/tokens",
        "checkpoints",
        "configs",
        "logs",
        "notebooks",
    )

    for directory in layout:
        Path(directory).mkdir(parents=True, exist_ok=True)
        print(f"Created directory: {directory}")
|
|
|
|
|
|
|
|
|
def save_configs():
    """Serialize TRAINING_CONFIG and DATA_CONFIG to JSON under configs/.

    Robustness fix: the original raised FileNotFoundError when the
    ``configs/`` directory was missing (it only exists after
    setup_project() has run). This version creates it on demand, so the
    function is safe to call standalone.
    """
    out_dir = Path("configs")
    out_dir.mkdir(parents=True, exist_ok=True)

    with open(out_dir / "training_config.json", "w") as f:
        json.dump(TRAINING_CONFIG, f, indent=2)

    with open(out_dir / "data_config.json", "w") as f:
        json.dump(DATA_CONFIG, f, indent=2)

    print("Configuration files saved to configs/")
|
|
|
|
|
|
|
|
|
def create_requirements_txt():
    """Write requirements.txt listing the project's Python dependencies."""
    packages = (
        "torch>=2.0.0",
        "transformers>=4.35.0",
        "tokenizers>=0.14.0",
        "datasets>=2.14.0",
        "accelerate>=0.24.0",
        "sentencepiece>=0.1.99",
        "langdetect>=1.0.9",
        "zstandard>=0.21.0",
        "tqdm>=4.65.0",
        "numpy>=1.24.0",
        "matplotlib>=3.6.0",
        "tensorboard>=2.14.0",
    )

    # Joined without a trailing newline, matching the original output exactly.
    content = "\n".join(packages)
    with open("requirements.txt", "w") as f:
        f.write(content)

    print("Created requirements.txt")
|
|
|
|
|
|
|
|
|
def create_run_script():
    """Generate run_training.py, a helper that drives the full pipeline.

    The generated script runs data_prep.py (skipped when the packed token
    file already exists) and then train_neo.py, exiting on the first step
    that returns a nonzero code.
    """
    # The script source is one ''' literal; the doubled backslashes (\\n)
    # become single \n escapes inside the generated file's print() calls.
    run_script = '''#!/usr/bin/env python3
# Run MAP-NEO Mini training pipeline

import subprocess
import sys
from pathlib import Path

def run_command(cmd, description):
    """Run a command and handle errors"""
    print(f"\\n{'='*50}")
    print(f"Running: {description}")
    print(f"Command: {cmd}")
    print(f"{'='*50}")

    result = subprocess.run(cmd, shell=True, capture_output=True, text=True)

    if result.returncode != 0:
        print(f"Error in {description}:")
        print(result.stderr)
        sys.exit(1)
    else:
        print(f"Success: {description}")
        if result.stdout:
            print(result.stdout)

def main():
    print("MAP-NEO Mini Training Pipeline")
    print("Optimized for RTX 5070 8GB VRAM")

    # Step 1: Data preprocessing
    if not Path("data/tokens/packed_1024.txt").exists():
        print("\\nStep 1: Data preprocessing")
        run_command(
            "python data_prep.py --num_docs 20000 --seq_length 1024",
            "Data preprocessing"
        )
    else:
        print("\\nSkipping data preprocessing (data exists)")

    # Step 2: Model training
    print("\\nStep 2: Starting model training")
    run_command(
        "python train_neo.py",
        "Model training"
    )

    print("\\n" + "="*50)
    print("Training pipeline completed!")
    print("Check checkpoints/ directory for saved models")
    print("="*50)

if __name__ == "__main__":
    main()
'''

    # Written to the current working directory, alongside this setup script.
    with open("run_training.py", "w") as f:
        f.write(run_script)

    print("Created run_training.py script")
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    print("Setting up MAP-NEO Mini project...")

    # Run every setup stage in order.
    for stage in (setup_project, save_configs, create_requirements_txt, create_run_script):
        stage()

    print("\nProject setup complete!")
    print("\nNext steps:")
    print("1. Run: python data_prep.py --num_docs 10000")
    print("2. Run: python train_neo.py")
    print("3. Or use: python run_training.py")