Spaces:

MCP-1st-Birthday
/

DS-STAR

Running

App Files Files Community

DS-STAR / src /agents /analyzer_agent.py

anurag-deo

Upload folder using huggingface_hub

8ff817c verified 18 days ago

raw

history blame contribute delete

5.06 kB

	"""
	Analyzer Agent: Analyzes data files and generates descriptions.

	This agent runs once at the beginning to understand available data.
	"""

	import os
	from pathlib import Path

	from langchain_core.messages import AIMessage

	from ..utils.code_execution import execute_with_debug
	from ..utils.formatters import extract_code, gemini_text
	from ..utils.state import DSStarState


	def _build_analysis_prompt(filepath: str, file_ext: str) -> str:
	    """Return the LLM prompt asking for a script that inspects one data file.

	    Args:
	        filepath: Path of the file, interpolated into the prompt as-is.
	        file_ext: Lower-cased file suffix (e.g. ".csv"), used as a format hint.
	    """
	    # The template lines are deliberately not indented with the code: their
	    # leading whitespace is part of the text that is sent to the model.
	    return f"""Generate a Python script to analyze the file: {filepath}

	File type: {file_ext}

	Requirements:
	- Load the file using appropriate method for {file_ext} format
	- Print essential information:
	* Data structure and types
	* Column names (for structured data like CSV, Excel)
	* First 3-5 rows/examples
	* Shape/size information
	- Handle common formats: CSV, JSON, Excel, TXT, MD
	- Use pandas for structured data
	- No try-except blocks
	- All files are in 'data/' directory
	- Print output clearly

	Provide ONLY the Python code in a markdown code block."""


	def _response_to_text(response):
	    """Normalise an LLM response object to the text handed to ``extract_code``.

	    Three shapes are handled:
	    - no ``content`` attribute: the object is stringified;
	    - ``content`` is a list (the Gemini-style format): delegated to
	      ``gemini_text``;
	    - any other ``content``: returned unchanged.
	    """
	    if not hasattr(response, "content"):
	        return str(response)
	    content = response.content
	    if isinstance(content, list):
	        return gemini_text(response)
	    return content


	def analyzer_node(state: DSStarState) -> dict:
	    """
	    Analyzer Agent Node: Analyzes all data files in the data/ directory.

	    For each regular file found directly inside ``data/`` (sub-directories
	    are skipped), asks the LLM in ``state["llm"]`` for a Python script that
	    loads the file and prints its structure, then runs that script through
	    ``execute_with_debug`` and stores whatever it returns.

	    Files are processed in sorted name order so that repeated runs over the
	    same directory produce ``data_descriptions`` in the same order.

	    A failure while analyzing one file does not abort the run: the exception
	    text is stored as that file's description and the loop continues.

	    Args:
	        state: Current DSStarState; only the ``"llm"`` entry is read here.

	    Returns:
	        Dictionary with updated state fields:
	        - data_descriptions: Dict mapping filename to analysis result
	          (or ``{"error": ...}`` when the directory is missing or empty)
	        - messages: Agent communication messages
	        - next: Next node to visit ("planner" or "__end__")
	    """
	    print("=" * 60)
	    print("DATA ANALYZER AGENT STARTING...")
	    print("=" * 60)

	    data_dir = "data/"

	    # isdir (rather than exists) so that a non-directory at this path takes
	    # the same error route instead of making os.listdir raise below.
	    if not os.path.isdir(data_dir):
	        print(f"Error: {data_dir} directory not found")
	        return {
	            "data_descriptions": {"error": "Data directory not found"},
	            "messages": [AIMessage(content="Error: data/ directory not found")],
	            "next": "__end__",
	        }

	    # os.listdir returns entries in arbitrary order; sort for reproducibility.
	    files = sorted(
	        f for f in os.listdir(data_dir) if os.path.isfile(os.path.join(data_dir, f))
	    )

	    if not files:
	        print(f"Error: No files found in {data_dir}")
	        return {
	            "data_descriptions": {"error": "No data files found"},
	            "messages": [AIMessage(content="Error: No files in data/ directory")],
	            "next": "__end__",
	        }

	    print(f"Found {len(files)} files to analyze")

	    descriptions = {}
	    for filename in files:
	        filepath = os.path.join(data_dir, filename)
	        file_ext = Path(filepath).suffix.lower()

	        print(f"\nAnalyzing: {filename}")

	        analysis_prompt = _build_analysis_prompt(filepath, file_ext)

	        # Broad catch is intentional: one bad file must not stop the others.
	        try:
	            response = state["llm"].invoke(analysis_prompt)
	            code = extract_code(_response_to_text(response))

	            # NOTE(review): execute_with_debug is defined in
	            # utils.code_execution; presumably it runs the script and uses
	            # the LLM to repair failures - confirm there.
	            result = execute_with_debug(code, state["llm"], is_analysis=True)

	            descriptions[filename] = result
	            print(f"✓ Successfully analyzed {filename}")

	        except Exception as e:
	            descriptions[filename] = f"Error analyzing file: {e}"
	            print(f"✗ Failed to analyze {filename}: {e}")

	    print("\n" + "=" * 60)
	    print(f"ANALYSIS COMPLETE: {len(files)} files processed")
	    print("=" * 60)

	    return {
	        "data_descriptions": descriptions,
	        "messages": [AIMessage(content=f"Analyzed {len(files)} data files")],
	        "next": "planner",
	    }


	# Standalone test function
	def test_analyzer(llm, data_dir: str = "data/"):
	    """
	    Run the analyzer node on its own and print what it produced.

	    Builds a throwaway state dictionary around ``llm``, passes it to
	    ``analyzer_node``, and prints every entry of the returned
	    ``data_descriptions`` mapping under a banner.

	    Args:
	        llm: LLM instance, stored under the ``"llm"`` key of the state
	        data_dir: Directory containing data files.
	            NOTE(review): this argument is accepted but never read in this
	            function and is not placed in the state, so it currently has
	            no effect on which directory gets analyzed - confirm intent.

	    Returns:
	        The dictionary returned by ``analyzer_node``
	    """
	    banner = "=" * 60
	    divider = "-" * 60

	    # Minimal state: only "llm" carries a real value, the rest are
	    # empty/default placeholders.
	    state = dict(
	        llm=llm,
	        query="Test query",
	        data_descriptions={},
	        plan=[],
	        current_code="",
	        execution_result="",
	        is_sufficient=False,
	        router_decision="",
	        iteration=0,
	        max_iterations=20,
	        messages=[],
	        next="analyzer",
	    )

	    outcome = analyzer_node(state)

	    print("\n" + banner)
	    print("ANALYZER TEST RESULTS")
	    print(banner)
	    for name, text in outcome["data_descriptions"].items():
	        print(f"\n{name}:")
	        print(divider)
	        print(text)

	    return outcome