"""advanced_generate.py - Advanced text generation with instruction prompts, context window info, and GPU monitoring"""
import torch
from transformers import AutoTokenizer
from model_neo import NeoMini, NeoMiniConfig
import os
from pathlib import Path
import gc

def clear_gpu_cache():
    """Clear GPU memory cache to free up VRAM"""
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
        print("🧹 GPU cache cleared")

def force_garbage_collection():
    """Force garbage collection and clear caches"""
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
    print("🗑️ Garbage collection and cache clearing completed")

def reset_gpu():
    """Quick GPU reset function for interactive use"""
    force_garbage_collection()
    print(f"🔄 GPU reset: {get_gpu_memory_info()}")

def get_gpu_memory_info():
    """Get GPU memory usage information"""
    if not torch.cuda.is_available():
        return "CUDA not available"
    try:
        # Get current GPU memory usage
        allocated = torch.cuda.memory_allocated(0) / 1024**3  # Convert to GB
        cached = torch.cuda.memory_reserved(0) / 1024**3
        total = torch.cuda.get_device_properties(0).total_memory / 1024**3
        return f"GPU Memory: {allocated:.2f}GB allocated, {cached:.2f}GB cached, {total:.2f}GB total"
    except Exception as e:
        return f"Could not get GPU memory info: {e}"

def load_model(checkpoint_path="checkpoints/extended_context_model.pt"):
    print(f"Loading model from {checkpoint_path}...")
    # Clear cache before loading
    print("🧹 Clearing cache before model loading...")
    force_garbage_collection()
    if not os.path.exists(checkpoint_path):
        print(f"❌ Checkpoint {checkpoint_path} not found.")
        return None, None, None
    checkpoint = torch.load(checkpoint_path, map_location="cuda" if torch.cuda.is_available() else "cpu")
    # Get max_seq_len from the checkpoint config, or fall back to the default
    if 'config' in checkpoint:
        max_seq_len = checkpoint['config'].get('max_seq_len', 2048)
    else:
        max_seq_len = 2048  # fallback
    config = NeoMiniConfig()
    config.max_seq_len = max_seq_len
    model = NeoMini(config)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device)
    tokenizer_path = "data/tokenizer"
    if Path(tokenizer_path).exists():
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    else:
        print("Tokenizer path not found; falling back to the GPT-2 tokenizer.")
        tokenizer = AutoTokenizer.from_pretrained("gpt2")
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    print(f"✅ Model loaded on {device}")
    print(f"📊 Tokenizer vocab size: {tokenizer.vocab_size:,}")
    print(f"🧠 Model parameters: {model.get_num_params():,}")
    print(f"📏 Max context window: {max_seq_len:,} tokens")
    # Clear cache after model loading
    clear_gpu_cache()
    print(f"💾 After model load: {get_gpu_memory_info()}")
    return model, tokenizer, max_seq_len
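
# Usage sketch: load_model() is normally called from main() below, but it can
# also be used on its own, e.g. to load a different checkpoint:
#
#   model, tokenizer, max_ctx = load_model("checkpoints/extended_context_model.pt")
#   if model is not None:
#       print(f"Ready with a {max_ctx:,}-token context window")
#
# Note (assumption about your environment): recent PyTorch releases default
# torch.load() to weights_only=True. If loading this full checkpoint fails there
# and the file is trusted, passing weights_only=False is one option.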

def generate_text(model, tokenizer, max_context_length, prompt, max_length=100,
                  temperature=0.4, top_k=20, top_p=0.8, repetition_penalty=1.2):
    device = next(model.parameters()).device
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
    # Clear cache for long contexts
    if input_ids.size(1) > 1000:
        clear_gpu_cache()
    # Check initial prompt length
    prompt_length = input_ids.size(1)
    print(f"📏 Prompt length: {prompt_length:,} tokens")
    if prompt_length >= max_context_length:
        print(f"⚠️ Warning: Prompt ({prompt_length}) exceeds max context ({max_context_length})")
        return "Error: Prompt too long for context window"
    # Adjust max_length if needed
    available_tokens = max_context_length - prompt_length
    if max_length > available_tokens:
        print(f"⚠️ Adjusting max_length from {max_length} to {available_tokens} (context limit)")
        max_length = available_tokens
    print(f"🎯 Generating max {max_length} tokens (temp={temperature}, top_k={top_k}, top_p={top_p}, rep_penalty={repetition_penalty})")
    print(f"💾 Before generation: {get_gpu_memory_info()}")
    with torch.no_grad():
        generated = input_ids
        tokens_generated = 0
        for step in range(max_length):
            # Check memory and clear cache periodically for long generations
            if step % 100 == 0 and step > 0:
                current_length = generated.size(1)
                print(f"   📊 Step {step}: {current_length:,}/{max_context_length:,} tokens")
                if current_length > 2000:  # Clear cache for very long contexts
                    clear_gpu_cache()
                    print(f"   {get_gpu_memory_info()}")
            logits = model(generated)
            next_token_logits = logits[0, -1, :] / temperature
            # Repetition penalty: dampen tokens that already appear in the sequence
            # (divide positive logits, multiply negative ones, so both become less likely)
            if repetition_penalty != 1.0:
                for token_id in set(generated[0].tolist()):
                    if next_token_logits[token_id] < 0:
                        next_token_logits[token_id] *= repetition_penalty
                    else:
                        next_token_logits[token_id] /= repetition_penalty
            # Top-k filtering: keep only the k highest-scoring tokens
            if top_k > 0:
                top_k_logits, _ = torch.topk(next_token_logits, top_k)
                min_top_k = top_k_logits[-1]
                next_token_logits[next_token_logits < min_top_k] = float("-inf")
            # Top-p (nucleus) filtering: keep the smallest set of tokens whose
            # cumulative probability exceeds top_p
            if top_p < 1.0:
                sorted_logits, sorted_indices = torch.sort(next_token_logits, descending=True)
                cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
                sorted_indices_to_remove = cumulative_probs > top_p
                # Shift the mask right so the token that crosses the threshold is kept
                sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
                sorted_indices_to_remove[..., 0] = 0
                indices_to_remove = sorted_indices[sorted_indices_to_remove]
                next_token_logits[indices_to_remove] = float("-inf")
            probs = torch.softmax(next_token_logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            generated = torch.cat([generated, next_token.unsqueeze(0)], dim=1)
            tokens_generated += 1
            # Check stopping conditions
            if next_token.item() == tokenizer.eos_token_id:
                print(f"🛑 Stopped at EOS token (generated {tokens_generated} tokens)")
                break
            if generated.size(1) >= max_context_length:
                print(f"🛑 Stopped at max context length {max_context_length:,} (generated {tokens_generated} tokens)")
                break
    final_length = generated.size(1)
    print(f"✅ Generation complete: {final_length:,} total tokens ({tokens_generated} new tokens)")
    # Clear cache after long generations
    if final_length > 2000:
        clear_gpu_cache()
    print(f"💾 Final: {get_gpu_memory_info()}")
    return tokenizer.decode(generated[0], skip_special_tokens=True)
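
# A thin convenience wrapper (sketch, not called by the tests): generate_text()
# returns the prompt plus the continuation, so callers that only want the new
# text have to strip the prompt themselves. The helper name is illustrative.
def quick_generate(model, tokenizer, max_context_length, prompt, **sampling_kwargs):
    """Return only the newly generated continuation for a prompt."""
    full_text = generate_text(model, tokenizer, max_context_length, prompt, **sampling_kwargs)
    # Decoding may not reproduce the prompt byte-for-byte, so only strip it when it matches.
    if isinstance(full_text, str) and full_text.startswith(prompt):
        return full_text[len(prompt):]
    return full_text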

def test_context_window_limits(model, tokenizer, max_context_length):
    """Test how much context the model can actually handle"""
    print(f"\n🧪 Testing Context Window Limits (Max: {max_context_length:,} tokens)")
    print("="*60)
    # Create a long repetitive prompt to test limits
    base_text = "This is a test of the context window. " * 20  # roughly 140-180 tokens, depending on the tokenizer
    # Extended multipliers for better testing
    for multiplier in [1, 5, 10, 20, 50, 100, 150]:
        # Clear cache before each test
        print(f"\n🧹 Clearing GPU cache before test {multiplier}...")
        force_garbage_collection()
        test_prompt = base_text * multiplier
        token_count = len(tokenizer.encode(test_prompt))
        print(f"\n📏 Test prompt length: {token_count:,} tokens")
        if token_count > max_context_length:
            print(f"⚠️ Exceeds context limit ({max_context_length:,}), skipping...")
            continue
        print(f"💾 Before generation: {get_gpu_memory_info()}")
        try:
            result = generate_text(model, tokenizer, max_context_length,
                                   test_prompt + " In conclusion,", max_length=50, temperature=0.7)
            print(f"✅ Success at {token_count:,} tokens")
            print(f"💾 After generation: {get_gpu_memory_info()}")
            # Clear cache after successful test
            clear_gpu_cache()
            print(f"💾 After cache clear: {get_gpu_memory_info()}")
        except Exception as e:
            print(f"❌ Failed at {token_count:,} tokens: {e}")
            print("🧹 Cleaning up after failure...")
            force_garbage_collection()
            break
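
# Optional refinement (sketch, nothing below calls it): instead of fixed
# multipliers, a doubling probe narrows in on the largest prompt size that still
# generates without running out of memory. The function name, the step size,
# and the probe sentence are illustrative choices.
def find_max_usable_context(model, tokenizer, max_context_length, start_tokens=512):
    """Return the largest probed prompt length (in tokens) that generated successfully."""
    sentence = "This is a test of the context window. "
    tokens_per_sentence = max(1, len(tokenizer.encode(sentence)))
    best = 0
    target = start_tokens
    while target < max_context_length:
        # Build a prompt of roughly `target` tokens by repeating the probe sentence.
        prompt = sentence * (target // tokens_per_sentence + 1)
        force_garbage_collection()
        try:
            generate_text(model, tokenizer, max_context_length, prompt, max_length=20)
            best = len(tokenizer.encode(prompt))
            target *= 2  # double the probe size until generation fails or the window is reached
        except RuntimeError:
            # Most likely a CUDA out-of-memory error; report the last size that worked.
            force_garbage_collection()
            break
    return best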

def test_instruction_prompts(model, tokenizer, max_context_length):
    print(f"\n🎯 Testing Instruction Following")
    print("="*60)
    prompts = [
        "Complete this sentence in a helpful way: The weather today is",
        "Write a short explanation: Why is exercise important?",
        "Answer in 2-3 sentences: What is artificial intelligence?",
        "Continue this story logically: The scientist walked into the lab and saw"
    ]
    for idx, prompt in enumerate(prompts, 1):
        print(f"\n--- Instruction Prompt {idx} ---")
        print(f"Prompt: {prompt}")
        # Clear cache before each instruction test
        if idx > 1:  # Not needed for first test
            clear_gpu_cache()
        output = generate_text(model, tokenizer, max_context_length, prompt, max_length=100)
        print(f"Output: {output}")

def test_long_context(model, tokenizer, max_context_length):
    print(f"\n💬 Testing Long Context Conversation")
    print("="*60)
    # Clear cache before long context test
    clear_gpu_cache()
    prompt = """The following is a conversation between a human and an AI assistant. The AI assistant is helpful, harmless, and honest.
Human: Hello, who are you?
AI: I am a large language model trained to assist you.
Human: What can you do for me?
AI: """
    output = generate_text(model, tokenizer, max_context_length, prompt, max_length=200)
    print(f"Output: {output}")

def main():
    print("🚀 MAP-NEO Mini Advanced Text Generation with Context & VRAM Monitoring")
    print("="*80)
    # Force clear at startup
    print("🧹 Initial system cleanup...")
    force_garbage_collection()
    # Load model and get context info
    model, tokenizer, max_context_length = load_model()
    if model is None or tokenizer is None:
        print("❌ Failed to load model or tokenizer.")
        return
    print(f"\n🔥 Model ready! Context window: {max_context_length:,} tokens")
    # Run tests with cache management
    print("\n" + "="*40 + " TESTS " + "="*40)
    # Test 1: Instructions
    test_instruction_prompts(model, tokenizer, max_context_length)
    force_garbage_collection()
    print(f"💾 After instructions: {get_gpu_memory_info()}")
    # Test 2: Long context
    test_long_context(model, tokenizer, max_context_length)
    force_garbage_collection()
    print(f"💾 After long context: {get_gpu_memory_info()}")
    # Test 3: Context limits (most memory intensive)
    test_context_window_limits(model, tokenizer, max_context_length)
    print(f"\n🎉 All tests complete!")
    print(f"💾 Final GPU state: {get_gpu_memory_info()}")

if __name__ == "__main__":
    main()
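
# Usage note (sketch): run the script directly to execute all three tests in order:
#   python advanced_generate.py
# From an interactive session, reset_gpu() can be imported and called between
# experiments to reclaim VRAM.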