Spaces:

mib-bench
/

leaderboard

Running

App Files Files Community

Aaron Mueller committed on Jan 28

Commit

2817fcb

1 Parent(s): 4493851

support all model/task combinations

Browse files

Files changed (6) hide show

app.py +17 -17
caulsal_metric.py +6 -6
src/about.py +6 -2
src/display/utils.py +1 -1
src/leaderboard/read_evals.py +42 -28
src/populate.py +10 -10

app.py CHANGED Viewed

@@ -45,7 +45,7 @@ def restart_space():
 ### Space initialisation
 try:
-    print(EVAL_REQUESTS_PATH)
     snapshot_download(
         repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
     )
@@ -54,7 +54,7 @@ except Exception:
 try:
-    print(RESULTS_REPO_MIB_SUBGRAPH)
     snapshot_download(
         repo_id=RESULTS_REPO_MIB_SUBGRAPH, local_dir=EVAL_RESULTS_MIB_SUBGRAPH_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
     )
@@ -63,7 +63,7 @@ except Exception:
 try:
-    print(RESULTS_REPO_MIB_CAUSALGRAPH)
     snapshot_download(
         repo_id=RESULTS_REPO_MIB_CAUSALGRAPH, local_dir=EVAL_RESULTS_MIB_CAUSALGRAPH_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
     )
@@ -95,7 +95,7 @@ LEADERBOARD_DF_MIB_CAUSALGRAPH_DETAILED, LEADERBOARD_DF_MIB_CAUSALGRAPH_AGGREGAT
 def init_leaderboard_mib_subgraph(dataframe, track):
-    print(f"init_leaderboard_mib: dataframe head before loc is {dataframe.head()}\n")
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
@@ -103,7 +103,7 @@ def init_leaderboard_mib_subgraph(dataframe, track):
     # filter for correct track
     # dataframe = dataframe.loc[dataframe["Track"] == track]
-    print(f"init_leaderboard_mib: dataframe head after loc is {dataframe.head()}\n")
     return Leaderboard(
         value=dataframe,
@@ -120,20 +120,20 @@ def init_leaderboard_mib_subgraph(dataframe, track):
     )
 def init_leaderboard_mib_causalgraph(dataframe, track):
-    print("Debugging column issues:")
-    print("\nActual DataFrame columns:")
-    print(dataframe.columns.tolist())
-    print("\nExpected columns for Leaderboard:")
     expected_cols = [c.name for c in fields(AutoEvalColumn_mib_causalgraph)]
-    print(expected_cols)
-    print("\nMissing columns:")
     missing_cols = [col for col in expected_cols if col not in dataframe.columns]
-    print(missing_cols)
-    print("\nSample of DataFrame content:")
-    print(dataframe.head().to_string())
     return Leaderboard(
         value=dataframe,
@@ -150,9 +150,9 @@ def init_leaderboard_mib_causalgraph(dataframe, track):
     )
 def init_leaderboard_mib_causalgraph(dataframe, track):
-    print("Debugging column issues:")
-    print("\nActual DataFrame columns:")
-    print(dataframe.columns.tolist())
     # Create only necessary columns
     return Leaderboard(

 ### Space initialisation
 try:
+    # print(EVAL_REQUESTS_PATH)
     snapshot_download(
         repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
     )
 try:
+    # print(RESULTS_REPO_MIB_SUBGRAPH)
     snapshot_download(
         repo_id=RESULTS_REPO_MIB_SUBGRAPH, local_dir=EVAL_RESULTS_MIB_SUBGRAPH_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
     )
 try:
+    # print(RESULTS_REPO_MIB_CAUSALGRAPH)
     snapshot_download(
         repo_id=RESULTS_REPO_MIB_CAUSALGRAPH, local_dir=EVAL_RESULTS_MIB_CAUSALGRAPH_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
     )
 def init_leaderboard_mib_subgraph(dataframe, track):
+    # print(f"init_leaderboard_mib: dataframe head before loc is {dataframe.head()}\n")
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
     # filter for correct track
     # dataframe = dataframe.loc[dataframe["Track"] == track]
+    # print(f"init_leaderboard_mib: dataframe head after loc is {dataframe.head()}\n")
     return Leaderboard(
         value=dataframe,
     )
 def init_leaderboard_mib_causalgraph(dataframe, track):
+    # print("Debugging column issues:")
+    # print("\nActual DataFrame columns:")
+    # print(dataframe.columns.tolist())
+    # print("\nExpected columns for Leaderboard:")
     expected_cols = [c.name for c in fields(AutoEvalColumn_mib_causalgraph)]
+    # print(expected_cols)
+    # print("\nMissing columns:")
     missing_cols = [col for col in expected_cols if col not in dataframe.columns]
+    # print(missing_cols)
+    # print("\nSample of DataFrame content:")
+    # print(dataframe.head().to_string())
     return Leaderboard(
         value=dataframe,
     )
 def init_leaderboard_mib_causalgraph(dataframe, track):
+    # print("Debugging column issues:")
+    # print("\nActual DataFrame columns:")
+    # print(dataframe.columns.tolist())
     # Create only necessary columns
     return Leaderboard(

caulsal_metric.py CHANGED Viewed

@@ -235,9 +235,9 @@ if __name__ == "__main__":
     folder_path = "./json_files"
     detailed_df, aggregated_df, intervention_averaged_df = process_json_folder(folder_path)
-    print("Detailed Results (including duplicates):")
-    print(detailed_df)
-    print("\nAggregated Results (max scores per method):")
-    print(aggregated_df)
-    print("\nIntervention-Averaged Results:")
-    print(intervention_averaged_df)

     folder_path = "./json_files"
     detailed_df, aggregated_df, intervention_averaged_df = process_json_folder(folder_path)
+    # print("Detailed Results (including duplicates):")
+    # print(detailed_df)
+    # print("\nAggregated Results (max scores per method):")
+    # print(aggregated_df)
+    # print("\nIntervention-Averaged Results:")
+    # print(intervention_averaged_df)

src/about.py CHANGED Viewed

@@ -40,8 +40,12 @@ class TaskMIB_Subgraph:
     metrics: list[str]  # metrics to store (edge_counts, faithfulness)
 class TasksMib_Subgraph(Enum):
-    task0 = TaskMIB_Subgraph("ioi", ["meta_llama", "qwen", "gpt2"], "ioi", ["edge_counts", "faithfulness"])
-    task1 = TaskMIB_Subgraph("mcqa", ["meta_llama", "qwen", "gpt2"], "mcqa", ["edge_counts", "faithfulness"])

     metrics: list[str]  # metrics to store (edge_counts, faithfulness)
 class TasksMib_Subgraph(Enum):
+    task0 = TaskMIB_Subgraph("ioi", ["gpt2", "qwen2_5", "gemma2", "llama3"], "IOI", ["edge_counts", "faithfulness"])
+    task1 = TaskMIB_Subgraph("mcqa", ["qwen2_5", "gemma2", "llama3"], "MCQA", ["edge_counts", "faithfulness"])
+    task2 = TaskMIB_Subgraph("arithmetic_addition", ["llama3"], "arithmetic_addition", ["edge_counts", "faithfulness"])
+    task3 = TaskMIB_Subgraph("arithmetic_subtraction", ["llama3"], "arithmetic_subtraction", ["edge_counts", "faithfulness"])
+    task4 = TaskMIB_Subgraph("arc_easy", ["gemma2", "llama3"], "arc_easy", ["edge_counts", "faithfulness"])
+    task5 = TaskMIB_Subgraph("arc_challenge", ["llama3"], "arc_challenge", ["edge_counts", "faithfulness"])

src/display/utils.py CHANGED Viewed

@@ -68,7 +68,7 @@ auto_eval_column_dict_mib_subgraph.append(["method", ColumnContent, ColumnConten
 # For each task and model combination
 for task in TasksMib_Subgraph:
     for model in task.value.models:
-        col_name = f"{task.value.benchmark}_{model}"  # ioi_meta_llama, mcqa_qwen, etc.
         auto_eval_column_dict_mib_subgraph.append([
             col_name,
             ColumnContent,

 # For each task and model combination
 for task in TasksMib_Subgraph:
     for model in task.value.models:
+        col_name = f"{task.value.benchmark}_{model}"  # ioi_gpt2, mcqa_qwen2_5, etc.
         auto_eval_column_dict_mib_subgraph.append([
             col_name,
             ColumnContent,

src/leaderboard/read_evals.py CHANGED Viewed

@@ -29,9 +29,9 @@ def compute_area(edge_counts, faithfulnesses, log_scale=True):
         x_1 = percentages[i_1]
         x_2 = percentages[i_2]
         # area from point to 100
-        if log_scale:
-            x_1 = math.log(x_1)
-            x_2 = math.log(x_2)
         trapezoidal = (percentages[i_2] - percentages[i_1]) * \
                         (((abs(1. - faithfulnesses[i_1])) + (abs(1. - faithfulnesses[i_2]))) / 2)
         area_from_100 += trapezoidal
@@ -58,7 +58,7 @@ class EvalResult_MIB_SUBGRAPH:
         # Initialize results dictionary with the exact structure from JSON
         results = {}
-        for task in ["ioi", "mcqa"]:  # Use exact task names from JSON
             results[task] = {}
         # Process each model's results maintaining original structure
@@ -67,17 +67,19 @@ class EvalResult_MIB_SUBGRAPH:
             if "/" in model_id:
                 org = model_id.split("/")[0]
                 if org == "meta-llama":
-                    model_name = "meta_llama"
                 elif org == "Qwen":
-                    model_name = "qwen"
                 elif "gpt" in model_id.lower():
                     model_name = "gpt2"
             else:
-                model_name = model_id
             # Keep exact scores structure from JSON
             scores = model_result.get("scores", {})
-            for task in ["ioi", "mcqa"]:
                 if task in scores:
                     results[task][model_name] = {
                         "edge_counts": scores[task]["edge_counts"],
@@ -100,10 +102,16 @@ class EvalResult_MIB_SUBGRAPH:
         }
         # Initialize all possible columns with '-'
-        expected_models = ["meta_llama", "qwen", "gpt2"]
-        expected_tasks = ["ioi", "mcqa"]
         for task in expected_tasks:
             for model in expected_models:
                 data_dict[f"{task}_{model}"] = '-'
         all_scores = []
@@ -117,24 +125,30 @@ class EvalResult_MIB_SUBGRAPH:
                 faithfulness = metrics["faithfulness"]
                 if isinstance(faithfulness[0], list):
                     faithfulness = faithfulness[0]
                 result = compute_area(metrics["edge_counts"], faithfulness)
                 if result is None or result[0] is None:
                     continue
                 area_under, _, _ = result
-                score = area_under * 100
                 data_dict[col_name] = round(score, 2)
                 all_scores.append(score)
         # All entries must be present for average
         required_entries = [
-            data_dict['ioi_meta_llama'] != '-',
-            data_dict['ioi_qwen'] != '-',
             data_dict['ioi_gpt2'] != '-',
-            data_dict['mcqa_meta_llama'] != '-',
-            data_dict['mcqa_qwen'] != '-',
-            data_dict['mcqa_gpt2'] != '-'
         ]
         data_dict["Average"] = round(np.mean(all_scores), 2) if all(required_entries) else '-'
@@ -145,10 +159,10 @@ def get_raw_eval_results_mib_subgraph(results_path: str, requests_path: str) ->
     """From the path of the results folder root, extract all needed info for MIB results"""
     model_result_filepaths = []
-    print(f"results_path is {results_path}")
     for root, dirnames, files in os.walk(results_path):
-        print(f"root is {root}, dirnames is {dirnames}, files is {files}")
         # We should only have json files in model results
         if len(files) == 0 or any([not f.endswith(".json") for f in files]):
             continue
@@ -162,14 +176,14 @@ def get_raw_eval_results_mib_subgraph(results_path: str, requests_path: str) ->
         for file in files:
             model_result_filepaths.append(os.path.join(root, file))
-    print(f"model_result_filepaths is {model_result_filepaths}")
     eval_results = []
     for model_result_filepath in model_result_filepaths:
         try:
             eval_result = EvalResult_MIB_SUBGRAPH("", "", {})  # Create empty instance
             result = eval_result.init_from_json_file(model_result_filepath)
-            print(f"eval_result.init_from_json_file(model_result_filepath) is {result}")
             # Verify the result can be converted to dict format
             result.to_dict()
             eval_results.append(result)
@@ -424,10 +438,10 @@ class EvalResult_MIB_CAUSALGRAPH:
 def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str) -> List[EvalResult_MIB_CAUSALGRAPH]:
     model_result_filepaths = []
-    print(f"Scanning directory: {results_path}")
     for root, dirnames, files in os.walk(results_path):
-        print(f"Current directory: {root}")
-        print(f"Found files: {files}")
         if len(files) == 0 or any([not f.endswith(".json") for f in files]):
             continue
@@ -439,21 +453,21 @@ def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str)
         for file in files:
             model_result_filepaths.append(os.path.join(root, file))
-    print(f"Found json files: {model_result_filepaths}")
     eval_results = []
     for filepath in model_result_filepaths:
         try:
             eval_result = EvalResult_MIB_CAUSALGRAPH("", "", {})
             result = eval_result.init_from_json_file(filepath)
-            print(f"Processed file {filepath}")
-            print(f"Got result: {result}")
             eval_results.append(result)
         except Exception as e:
             print(f"Error processing {filepath}: {e}")
             continue
-    print(f"Total results processed: {len(eval_results)}")
     return eval_results

         x_1 = percentages[i_1]
         x_2 = percentages[i_2]
         # area from point to 100
+        # if log_scale:
+        #     x_1 = math.log(x_1)
+        #     x_2 = math.log(x_2)
         trapezoidal = (percentages[i_2] - percentages[i_1]) * \
                         (((abs(1. - faithfulnesses[i_1])) + (abs(1. - faithfulnesses[i_2]))) / 2)
         area_from_100 += trapezoidal
         # Initialize results dictionary with the exact structure from JSON
         results = {}
+        for task in ["ioi", "mcqa", "arithmetic_addition", "arithmetic_subtraction", "arc_easy", "arc_challenge"]:  # Use exact task names from JSON
             results[task] = {}
         # Process each model's results maintaining original structure
             if "/" in model_id:
                 org = model_id.split("/")[0]
                 if org == "meta-llama":
+                    model_name = "llama3"
                 elif org == "Qwen":
+                    model_name = "qwen2_5"
                 elif "gpt" in model_id.lower():
                     model_name = "gpt2"
+                elif org == "google":
+                    model_name = "gemma2"
             else:
+                model_name = model_id.replace(".", "_")
             # Keep exact scores structure from JSON
             scores = model_result.get("scores", {})
+            for task in ["ioi", "mcqa", "arithmetic_addition", "arithmetic_subtraction", "arc_easy", "arc_challenge"]:
                 if task in scores:
                     results[task][model_name] = {
                         "edge_counts": scores[task]["edge_counts"],
         }
         # Initialize all possible columns with '-'
+        expected_models = ["llama3", "qwen2_5", "gpt2", "gemma2"]
+        expected_tasks = ["ioi", "mcqa", "arithmetic_addition", "arithmetic_subtraction", "arc_easy", "arc_challenge"]
         for task in expected_tasks:
             for model in expected_models:
+                if model == "gpt2" and task != "ioi":
+                    continue
+                if model == "qwen2_5" and task.startswith(("arithmetic", "arc")):
+                    continue
+                if model == "gemma2" and (task.startswith("arithmetic") or task == "arc_challenge"):
+                    continue
                 data_dict[f"{task}_{model}"] = '-'
         all_scores = []
                 faithfulness = metrics["faithfulness"]
                 if isinstance(faithfulness[0], list):
                     faithfulness = faithfulness[0]
                 result = compute_area(metrics["edge_counts"], faithfulness)
                 if result is None or result[0] is None:
                     continue
                 area_under, _, _ = result
+                score = area_under
                 data_dict[col_name] = round(score, 2)
                 all_scores.append(score)
         # All entries must be present for average
         required_entries = [
+            data_dict['ioi_llama3'] != '-',
+            data_dict['ioi_qwen2_5'] != '-',
             data_dict['ioi_gpt2'] != '-',
+            data_dict['ioi_gemma2'] != '-',
+            data_dict['mcqa_llama3'] != '-',
+            data_dict['mcqa_qwen2_5'] != '-',
+            data_dict['mcqa_gemma2'] != '-',
+            data_dict['arithmetic_addition_llama3'] != '-',
+            data_dict['arithmetic_subtraction_llama3'] != '-',
+            data_dict['arc_easy_gemma2'] != '-',
+            data_dict['arc_easy_llama3'] != '-',
+            data_dict['arc_challenge_llama3'] != '-'
         ]
         data_dict["Average"] = round(np.mean(all_scores), 2) if all(required_entries) else '-'
     """From the path of the results folder root, extract all needed info for MIB results"""
     model_result_filepaths = []
+    # print(f"results_path is {results_path}")
     for root, dirnames, files in os.walk(results_path):
+        # print(f"root is {root}, dirnames is {dirnames}, files is {files}")
         # We should only have json files in model results
         if len(files) == 0 or any([not f.endswith(".json") for f in files]):
             continue
         for file in files:
             model_result_filepaths.append(os.path.join(root, file))
+    # print(f"model_result_filepaths is {model_result_filepaths}")
     eval_results = []
     for model_result_filepath in model_result_filepaths:
         try:
             eval_result = EvalResult_MIB_SUBGRAPH("", "", {})  # Create empty instance
             result = eval_result.init_from_json_file(model_result_filepath)
+            # print(f"eval_result.init_from_json_file(model_result_filepath) is {result}")
             # Verify the result can be converted to dict format
             result.to_dict()
             eval_results.append(result)
 def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str) -> List[EvalResult_MIB_CAUSALGRAPH]:
     model_result_filepaths = []
+    # print(f"Scanning directory: {results_path}")
     for root, dirnames, files in os.walk(results_path):
+        # print(f"Current directory: {root}")
+        # print(f"Found files: {files}")
         if len(files) == 0 or any([not f.endswith(".json") for f in files]):
             continue
         for file in files:
             model_result_filepaths.append(os.path.join(root, file))
+    # print(f"Found json files: {model_result_filepaths}")
     eval_results = []
     for filepath in model_result_filepaths:
         try:
             eval_result = EvalResult_MIB_CAUSALGRAPH("", "", {})
             result = eval_result.init_from_json_file(filepath)
+            # print(f"Processed file {filepath}")
+            # print(f"Got result: {result}")
             eval_results.append(result)
         except Exception as e:
             print(f"Error processing {filepath}: {e}")
             continue
+    # print(f"Total results processed: {len(eval_results)}")
     return eval_results

src/populate.py CHANGED Viewed

@@ -10,11 +10,11 @@ from src.about import TasksMib_Causalgraph
 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
-    print(f"results_path is {results_path}, requests_path is {requests_path}")
     raw_data = get_raw_eval_results(results_path, requests_path)
-    print(f"raw_data is {raw_data}")
     all_data_json = [v.to_dict() for v in raw_data]
-    print(f"all_data_json is {pd.DataFrame.from_records(all_data_json)}")
     all_data_json_filtered = []
     for item in all_data_json:
         item["Track"] = item["eval_name"].split("_")[-1]
@@ -32,7 +32,7 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     # df = df.sort_values(by=[Tasks.task0.value.col_name], ascending=False)
     # df = df.sort_values(by=[AutoEvalColumn.track.name], ascending=False)
-    print(f"df is {df}")
     # df = df[cols].round(decimals=1)
@@ -44,13 +44,13 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
 def get_leaderboard_df_mib_subgraph(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the MIB experiment results"""
-    print(f"results_path is {results_path}, requests_path is {requests_path}")
     raw_data = get_raw_eval_results_mib_subgraph(results_path, requests_path)
-    print(f"raw_data is {raw_data}")
     # Convert each result to dict format
     all_data_json = [v.to_dict() for v in raw_data]
-    print(f"all_data_json is {pd.DataFrame.from_records(all_data_json)}")
     # Convert to dataframe
     df = pd.DataFrame.from_records(all_data_json)
@@ -242,7 +242,7 @@ def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
 #     return detailed_df, aggregated_df, intervention_averaged_df
 def get_leaderboard_df_mib_causalgraph(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
-    print(f"results_path is {results_path}, requests_path is {requests_path}")
     raw_data = get_raw_eval_results_mib_causalgraph(results_path, requests_path)
     # Convert each result to dict format for detailed df
@@ -250,7 +250,7 @@ def get_leaderboard_df_mib_causalgraph(results_path: str, requests_path: str, co
     detailed_df = pd.DataFrame.from_records(all_data_json)
     # Print the actual columns for debugging
-    print("Original columns:", detailed_df.columns.tolist())
     # Rename columns to match schema
     column_mapping = {}
@@ -271,7 +271,7 @@ def get_leaderboard_df_mib_causalgraph(results_path: str, requests_path: str, co
     # Create intervention-averaged df
     intervention_averaged_df = create_intervention_averaged_df(aggregated_df)
-    print("Transformed columns:", detailed_df.columns.tolist())
     return detailed_df, aggregated_df, intervention_averaged_df

 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
+    # print(f"results_path is {results_path}, requests_path is {requests_path}")
     raw_data = get_raw_eval_results(results_path, requests_path)
+    # print(f"raw_data is {raw_data}")
     all_data_json = [v.to_dict() for v in raw_data]
+    # print(f"all_data_json is {pd.DataFrame.from_records(all_data_json)}")
     all_data_json_filtered = []
     for item in all_data_json:
         item["Track"] = item["eval_name"].split("_")[-1]
     # df = df.sort_values(by=[Tasks.task0.value.col_name], ascending=False)
     # df = df.sort_values(by=[AutoEvalColumn.track.name], ascending=False)
+    # print(f"df is {df}")
     # df = df[cols].round(decimals=1)
 def get_leaderboard_df_mib_subgraph(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the MIB experiment results"""
+    # print(f"results_path is {results_path}, requests_path is {requests_path}")
     raw_data = get_raw_eval_results_mib_subgraph(results_path, requests_path)
+    # print(f"raw_data is {raw_data}")
     # Convert each result to dict format
     all_data_json = [v.to_dict() for v in raw_data]
+    # print(f"all_data_json is {pd.DataFrame.from_records(all_data_json)}")
     # Convert to dataframe
     df = pd.DataFrame.from_records(all_data_json)
 #     return detailed_df, aggregated_df, intervention_averaged_df
 def get_leaderboard_df_mib_causalgraph(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+    # print(f"results_path is {results_path}, requests_path is {requests_path}")
     raw_data = get_raw_eval_results_mib_causalgraph(results_path, requests_path)
     # Convert each result to dict format for detailed df
     detailed_df = pd.DataFrame.from_records(all_data_json)
     # Print the actual columns for debugging
+    # print("Original columns:", detailed_df.columns.tolist())
     # Rename columns to match schema
     column_mapping = {}
     # Create intervention-averaged df
     intervention_averaged_df = create_intervention_averaged_df(aggregated_df)
+    # print("Transformed columns:", detailed_df.columns.tolist())
     return detailed_df, aggregated_df, intervention_averaged_df