# DS-STAR / src/agents/analyzer_agent.py
# Uploaded by anurag-deo using huggingface_hub (commit 8ff817c, verified).
"""
Analyzer Agent: Analyzes data files and generates descriptions.
This agent runs once at the beginning to understand available data.
"""
import os
from pathlib import Path
from langchain_core.messages import AIMessage
from ..utils.code_execution import execute_with_debug
from ..utils.formatters import extract_code, gemini_text
from ..utils.state import DSStarState
def analyzer_node(state: DSStarState, data_dir: str = "data/") -> dict:
    """
    Analyzer Agent Node: Analyzes every regular file directly inside ``data_dir``.

    For each file, asks the LLM for a Python script that should:
    - Load the file
    - Print structure, types, and sample data
    - Capture essential information
    The script is then passed to ``execute_with_debug`` and whatever that
    helper returns is stored as the file's description.

    Progress and errors are reported on stdout via ``print``.

    Args:
        state: Current DSStarState. Only ``state["llm"]`` is read here; it must
            expose ``invoke(prompt)``.
        data_dir: Directory to scan (non-recursive). Defaults to ``"data/"``,
            which preserves the behaviour of the single-argument call.

    Returns:
        Dictionary with updated state fields:
        - data_descriptions: Dict mapping filename to analysis result. A file
          whose analysis raised maps to an ``"Error analyzing file: ..."``
          string. When the directory is missing or has no files, this is
          ``{"error": <message>}`` instead.
        - messages: Agent communication messages
        - next: Next node to visit ("planner" or "__end__")
    """
    print("=" * 60)
    print("DATA ANALYZER AGENT STARTING...")
    print("=" * 60)

    descriptions = {}

    # Use isdir rather than exists: a regular file with the directory's name
    # would pass an existence check and then make os.listdir raise.
    if not os.path.isdir(data_dir):
        print(f"Error: {data_dir} directory not found")
        return {
            "data_descriptions": {"error": "Data directory not found"},
            "messages": [AIMessage(content=f"Error: {data_dir} directory not found")],
            "next": "__end__",
        }

    # os.listdir returns entries in arbitrary order; sort so the processing
    # order (and the key order of the result dict) is reproducible.
    files = sorted(
        f for f in os.listdir(data_dir) if os.path.isfile(os.path.join(data_dir, f))
    )
    if not files:
        print(f"Error: No files found in {data_dir}")
        return {
            "data_descriptions": {"error": "No data files found"},
            "messages": [AIMessage(content=f"Error: No files in {data_dir} directory")],
            "next": "__end__",
        }

    print(f"Found {len(files)} files to analyze")

    for filename in files:
        filepath = os.path.join(data_dir, filename)
        file_ext = Path(filepath).suffix.lower()
        print(f"\nAnalyzing: {filename}")

        # Prompt asking the LLM for a one-off inspection script for this file.
        analysis_prompt = f"""Generate a Python script to analyze the file: {filepath}
File type: {file_ext}
Requirements:
- Load the file using appropriate method for {file_ext} format
- Print essential information:
* Data structure and types
* Column names (for structured data like CSV, Excel)
* First 3-5 rows/examples
* Shape/size information
- Handle common formats: CSV, JSON, Excel, TXT, MD
- Use pandas for structured data
- No try-except blocks
- All files are in '{data_dir}' directory
- Print output clearly
Provide ONLY the Python code in a markdown code block."""

        # Deliberately broad: a failure on one file is recorded in its
        # description and must not abort the analysis of the remaining files.
        try:
            response = state["llm"].invoke(analysis_prompt)

            # Normalise the response to plain text (Gemini vs OpenAI shapes).
            if hasattr(response, "content") and isinstance(response.content, list):
                # Gemini format: content is a list of parts
                response_text = gemini_text(response)
            elif hasattr(response, "content"):
                response_text = response.content
            else:
                response_text = str(response)

            code = extract_code(response_text)

            # NOTE(review): presumably runs the script and uses the LLM to
            # repair failures -- confirm in utils.code_execution.
            result = execute_with_debug(code, state["llm"], is_analysis=True)
            descriptions[filename] = result
            print(f"✓ Successfully analyzed {filename}")
        except Exception as e:
            descriptions[filename] = f"Error analyzing file: {str(e)}"
            print(f"✗ Failed to analyze {filename}: {str(e)}")

    print("\n" + "=" * 60)
    print(f"ANALYSIS COMPLETE: {len(files)} files processed")
    print("=" * 60)

    return {
        "data_descriptions": descriptions,
        "messages": [AIMessage(content=f"Analyzed {len(files)} data files")],
        "next": "planner",
    }


# Standalone test function
def test_analyzer(llm, data_dir: str = "data/"):
    """
    Test the analyzer agent independently.

    Builds a minimal state dict, runs ``analyzer_node`` on ``data_dir`` and
    prints each entry of the resulting ``data_descriptions``.

    Args:
        llm: LLM instance
        data_dir: Directory containing data files; forwarded to
            ``analyzer_node``.

    Returns:
        Dictionary with analysis results (the dict returned by
        ``analyzer_node``)
    """
    # Create minimal test state
    test_state = {
        "llm": llm,
        "query": "Test query",
        "data_descriptions": {},
        "plan": [],
        "current_code": "",
        "execution_result": "",
        "is_sufficient": False,
        "router_decision": "",
        "iteration": 0,
        "max_iterations": 20,
        "messages": [],
        "next": "analyzer",
    }

    # Forward data_dir so the caller's choice of directory is actually honoured.
    result = analyzer_node(test_state, data_dir=data_dir)

    print("\n" + "=" * 60)
    print("ANALYZER TEST RESULTS")
    print("=" * 60)
    for filename, description in result["data_descriptions"].items():
        print(f"\n{filename}:")
        print("-" * 60)
        print(description)

    return result