""" Analyzer Agent: Analyzes data files and generates descriptions. This agent runs once at the beginning to understand available data. """ import os from pathlib import Path from langchain_core.messages import AIMessage from ..utils.code_execution import execute_with_debug from ..utils.formatters import extract_code, gemini_text from ..utils.state import DSStarState def analyzer_node(state: DSStarState) -> dict: """ Analyzer Agent Node: Analyzes all data files in the data/ directory. For each file, generates and executes Python code to: - Load the file - Print structure, types, and sample data - Capture essential information Args: state: Current DSStarState Returns: Dictionary with updated state fields: - data_descriptions: Dict mapping filename to analysis result - messages: Agent communication messages - next: Next node to visit ("planner" or "__end__") """ print("=" * 60) print("DATA ANALYZER AGENT STARTING...") print("=" * 60) data_dir = "data/" descriptions = {} # Check if data directory exists if not os.path.exists(data_dir): print(f"Error: {data_dir} directory not found") return { "data_descriptions": {"error": "Data directory not found"}, "messages": [AIMessage(content="Error: data/ directory not found")], "next": "__end__", } # Get list of files files = [ f for f in os.listdir(data_dir) if os.path.isfile(os.path.join(data_dir, f)) ] if not files: print(f"Error: No files found in {data_dir}") return { "data_descriptions": {"error": "No data files found"}, "messages": [AIMessage(content="Error: No files in data/ directory")], "next": "__end__", } print(f"Found {len(files)} files to analyze") # Analyze each file for filename in files: filepath = os.path.join(data_dir, filename) file_ext = Path(filepath).suffix.lower() print(f"\nAnalyzing: {filename}") # Generate analysis script analysis_prompt = f"""Generate a Python script to analyze the file: {filepath} File type: {file_ext} Requirements: - Load the file using appropriate method for {file_ext} format - Print essential information: * Data structure and types * Column names (for structured data like CSV, Excel) * First 3-5 rows/examples * Shape/size information - Handle common formats: CSV, JSON, Excel, TXT, MD - Use pandas for structured data - No try-except blocks - All files are in 'data/' directory - Print output clearly Provide ONLY the Python code in a markdown code block.""" try: # Get LLM response response = state["llm"].invoke(analysis_prompt) # Handle different response formats (Gemini vs OpenAI) if hasattr(response, "content") and isinstance(response.content, list): # Gemini format response_text = gemini_text(response) elif hasattr(response, "content"): response_text = response.content else: response_text = str(response) code = extract_code(response_text) # Execute with debugging result = execute_with_debug(code, state["llm"], is_analysis=True) descriptions[filename] = result print(f"✓ Successfully analyzed {filename}") except Exception as e: descriptions[filename] = f"Error analyzing file: {str(e)}" print(f"✗ Failed to analyze {filename}: {str(e)}") print("\n" + "=" * 60) print(f"ANALYSIS COMPLETE: {len(files)} files processed") print("=" * 60) return { "data_descriptions": descriptions, "messages": [AIMessage(content=f"Analyzed {len(files)} data files")], "next": "planner", } # Standalone test function def test_analyzer(llm, data_dir: str = "data/"): """ Test the analyzer agent independently. 
# Standalone test function
def test_analyzer(llm, data_dir: str = "data/"):
    """
    Test the analyzer agent independently.

    Args:
        llm: LLM instance
        data_dir: Directory containing data files

    Returns:
        Dictionary with analysis results
    """
    # Create a minimal test state
    test_state = {
        "llm": llm,
        "query": "Test query",
        "data_descriptions": {},
        "plan": [],
        "current_code": "",
        "execution_result": "",
        "is_sufficient": False,
        "router_decision": "",
        "iteration": 0,
        "max_iterations": 20,
        "messages": [],
        "next": "analyzer",
    }

    # Forward data_dir so the analyzer reads from the requested directory
    result = analyzer_node(test_state, data_dir=data_dir)

    print("\n" + "=" * 60)
    print("ANALYZER TEST RESULTS")
    print("=" * 60)

    for filename, description in result["data_descriptions"].items():
        print(f"\n{filename}:")
        print("-" * 60)
        print(description)

    return result
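
# Usage sketch (assumptions: the package root is importable, a populated
# data/ directory exists, and an OpenAI API key is set in the environment;
# any LangChain chat model with .invoke() would work in place of ChatOpenAI).
# Because of the relative imports above, run this as a module, e.g.:
#   python -m <package>.agents.analyzer
if __name__ == "__main__":
    from langchain_openai import ChatOpenAI

    test_analyzer(ChatOpenAI(model="gpt-4o-mini", temperature=0))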