File size: 5,057 Bytes
8ff817c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
"""

Analyzer Agent: Analyzes data files and generates descriptions.



This agent runs once at the beginning to understand available data.

"""

import os
from pathlib import Path

from langchain_core.messages import AIMessage

from ..utils.code_execution import execute_with_debug
from ..utils.formatters import extract_code, gemini_text
from ..utils.state import DSStarState


def analyzer_node(state: DSStarState) -> dict:
    """Analyzer Agent Node: Analyzes all data files in the data/ directory.

    For each file, generates and executes Python code to:
    - Load the file
    - Print structure, types, and sample data
    - Capture essential information

    Args:
        state: Current DSStarState. Must contain an "llm" entry whose
            ``invoke`` method accepts a prompt string.

    Returns:
        Dictionary with updated state fields:
        - data_descriptions: Dict mapping filename to analysis result
          (or an "error" key when no data is available)
        - messages: Agent communication messages
        - next: Next node to visit ("planner" or "__end__")
    """
    print("=" * 60)
    print("DATA ANALYZER AGENT STARTING...")
    print("=" * 60)

    data_dir = "data/"
    descriptions: dict = {}

    # Fail fast when the data directory is missing entirely.
    if not os.path.exists(data_dir):
        print(f"Error: {data_dir} directory not found")
        return {
            "data_descriptions": {"error": "Data directory not found"},
            "messages": [AIMessage(content="Error: data/ directory not found")],
            "next": "__end__",
        }

    # List regular files only; subdirectories are ignored.
    files = [
        f for f in os.listdir(data_dir) if os.path.isfile(os.path.join(data_dir, f))
    ]

    if not files:
        print(f"Error: No files found in {data_dir}")
        return {
            "data_descriptions": {"error": "No data files found"},
            "messages": [AIMessage(content="Error: No files in data/ directory")],
            "next": "__end__",
        }

    print(f"Found {len(files)} files to analyze")

    # Analyze each file independently; a failure on one file must not
    # abort the remaining files.
    for filename in files:
        filepath = os.path.join(data_dir, filename)
        file_ext = Path(filepath).suffix.lower()

        # BUG FIX: this f-string previously contained the literal text
        # "(unknown)" and no placeholder, so the filename was never shown.
        print(f"\nAnalyzing: {filename}")

        # Generate analysis script
        analysis_prompt = f"""Generate a Python script to analyze the file: {filepath}



File type: {file_ext}



Requirements:

- Load the file using appropriate method for {file_ext} format

- Print essential information:

  * Data structure and types

  * Column names (for structured data like CSV, Excel)

  * First 3-5 rows/examples

  * Shape/size information

- Handle common formats: CSV, JSON, Excel, TXT, MD

- Use pandas for structured data

- No try-except blocks

- All files are in 'data/' directory

- Print output clearly



Provide ONLY the Python code in a markdown code block."""

        try:
            # Get LLM response
            response = state["llm"].invoke(analysis_prompt)

            # Handle different response formats (Gemini vs OpenAI):
            # Gemini returns `content` as a list of parts, OpenAI as a string.
            if hasattr(response, "content") and isinstance(response.content, list):
                response_text = gemini_text(response)
            elif hasattr(response, "content"):
                response_text = response.content
            else:
                response_text = str(response)

            code = extract_code(response_text)

            # Execute the generated code through the debugging helper.
            result = execute_with_debug(code, state["llm"], is_analysis=True)

            descriptions[filename] = result
            # BUG FIX: interpolate the filename (was a literal "(unknown)").
            print(f"✓ Successfully analyzed {filename}")

        except Exception as e:
            # Broad catch is deliberate at this agent boundary: record the
            # per-file failure and continue with the remaining files.
            descriptions[filename] = f"Error analyzing file: {str(e)}"
            # BUG FIX: interpolate the filename (was a literal "(unknown)").
            print(f"✗ Failed to analyze {filename}: {str(e)}")

    print("\n" + "=" * 60)
    print(f"ANALYSIS COMPLETE: {len(files)} files processed")
    print("=" * 60)

    return {
        "data_descriptions": descriptions,
        "messages": [AIMessage(content=f"Analyzed {len(files)} data files")],
        "next": "planner",
    }


# Standalone test function
def test_analyzer(llm, data_dir: str = "data/"):
    """Test the analyzer agent independently.

    Args:
        llm: LLM instance
        data_dir: Directory containing data files.
            NOTE(review): currently unused — ``analyzer_node`` hardcodes
            "data/" internally; kept for backward compatibility.

    Returns:
        Dictionary with analysis results
    """
    # Create a minimal state mirroring the fields analyzer_node expects.
    test_state = {
        "llm": llm,
        "query": "Test query",
        "data_descriptions": {},
        "plan": [],
        "current_code": "",
        "execution_result": "",
        "is_sufficient": False,
        "router_decision": "",
        "iteration": 0,
        "max_iterations": 20,
        "messages": [],
        "next": "analyzer",
    }

    result = analyzer_node(test_state)

    print("\n" + "=" * 60)
    print("ANALYZER TEST RESULTS")
    print("=" * 60)
    for filename, description in result["data_descriptions"].items():
        # BUG FIX: this f-string previously printed the literal text
        # "(unknown)" instead of interpolating the filename.
        print(f"\n{filename}:")
        print("-" * 60)
        print(description)

    return result