hbp5181
/

BindPred

@@ -1,93 +1,94 @@
-import os
-import pandas as pd
-import numpy as np
-from sklearn.model_selection import KFold
-from sklearn.metrics import mean_squared_error, r2_score
-from scipy.stats import pearsonr, ttest_ind
-from catboost import CatBoostRegressor
-# Load dataset
-data = pd.read_csv("embeddings/ESM2_interaction.csv")
-# Fill missing feature strings
-for col in ["Ligand Features", "Receptor Features", "Physical Features"]:
-    data[col] = data[col].fillna("")
-# Parse comma-separated floats
-for col in ["Ligand Features", "Receptor Features", "Physical Features"]:
-    data[col] = data[col].apply(
-        lambda s: [float(x) for x in str(s).split(",") if x.strip()]
-    )
-# Build feature arrays
-X_ligand = np.vstack(data["Ligand Features"].values)
-X_receptor = np.vstack(data["Receptor Features"].values)
-X_physical = np.vstack(data["Physical Features"].values)
-# Convert KD(M) into log10 scale
-raw_y = data["KD(M)"].values
-y = np.log10(raw_y)  # assumes all KD values are positive
-records = []
-# Repeat 5×5-fold CV, with and without physical features
-for repeat in range(1, 6):
-    kf = KFold(n_splits=5, shuffle=True, random_state=repeat)
-    for include_phys in (False, True):
-        X_base = np.hstack([X_ligand, X_receptor])
-        X_full = np.hstack([X_base, X_physical])
-        X_data = X_full if include_phys else X_base
-        for fold_idx, (train_idx, test_idx) in enumerate(kf.split(X_data), start=1):
-            X_train, X_test = X_data[train_idx], X_data[test_idx]
-            y_train, y_test = y[train_idx], y[test_idx]
-            # Initialize with your chosen hyperparameters and GPU support
-            model = CatBoostRegressor(
-                iterations=2000,
-                learning_rate=0.08,
-                depth=4,
-                verbose=500,
-                task_type="GPU",
-                devices="0"
-            )
-            # Train and time this fold
-            model.fit(X_train, y_train)
-            preds = model.predict(X_test)
-            rmse = np.sqrt(mean_squared_error(y_test, preds))
-            r2   = r2_score(y_test, preds)
-            pcc  = pearsonr(y_test, preds)[0]
-            records.append({
-                "repeat": repeat,
-                "fold": fold_idx,
-                "with_physical": include_phys,
-                "pearson_r": pcc,
-                "r2": r2,
-                "rmse": rmse
-            })
-# Aggregate metrics
-metrics_df = pd.DataFrame(records)
-# Save to CSV
-out_dir = "metrics"
-os.makedirs(out_dir, exist_ok=True)
-csv_path = os.path.join(out_dir, "InteractionMetrics.csv")
-metrics_df.to_csv(csv_path, index=False)
-print(f"All metrics saved to {csv_path}")
-# Conduct independent t tests for each metric
-results = {}
-for metric in ["pearson_r", "r2", "rmse"]:
-    grp_with = metrics_df.loc[metrics_df.with_physical, metric]
-    grp_without = metrics_df.loc[~metrics_df.with_physical, metric]
-    t_stat, p_val = ttest_ind(grp_with, grp_without, equal_var=False)
-    results[metric] = (t_stat, p_val)
-print("\nT test results comparing with vs without physical features:")
-for m, (t_stat, p_val) in results.items():
-    print(f"{m} → t = {t_stat:.3f}, p = {p_val:.3f}")

+import os
+import pandas as pd
+import numpy as np
+from sklearn.model_selection import KFold
+from sklearn.metrics import mean_squared_error, r2_score
+from scipy.stats import pearsonr, ttest_ind
+from catboost import CatBoostRegressor
+# Load dataset, this should be specified for which model will be trained(eg., embedding only or including physical terms)
+data = pd.read_csv("embeddings/ESM2_interaction.csv")
+# Fill missing feature strings (Features are chosen based on what kind of mdoel will be trained.
+# Ligand and Receptor Features are ESM2 embeddings and Physical Features are PyRosetta Features
+for col in ["Ligand Features", "Receptor Features", "Physical Features"]:
+    data[col] = data[col].fillna("")
+# Parse comma-separated floats
+for col in ["Ligand Features", "Receptor Features", "Physical Features"]:
+    data[col] = data[col].apply(
+        lambda s: [float(x) for x in str(s).split(",") if x.strip()]
+    )
+# Build feature arrays
+X_ligand = np.vstack(data["Ligand Features"].values)
+X_receptor = np.vstack(data["Receptor Features"].values)
+# optional: X_physical = np.vstack(data["Physical Features"].values)
+# Convert KD(M) into log10 scale
+raw_y = data["KD(M)"].values
+y = np.log10(raw_y)  # assumes all KD values are positive
+records = []
+# Repeat 5×5-fold CV, with and without physical features
+for repeat in range(1, 6):
+    kf = KFold(n_splits=5, shuffle=True, random_state=repeat)
+    for include_phys in (False, True):
+        X_base = np.hstack([X_ligand, X_receptor])
+        X_full = np.hstack([X_base, X_physical])
+        X_data = X_full if include_phys else X_base
+        for fold_idx, (train_idx, test_idx) in enumerate(kf.split(X_data), start=1):
+            X_train, X_test = X_data[train_idx], X_data[test_idx]
+            y_train, y_test = y[train_idx], y[test_idx]
+            # Initialize with your chosen hyperparameters and GPU support
+            model = CatBoostRegressor(
+                iterations=2000,
+                learning_rate=0.08,
+                depth=4,
+                verbose=500,
+                task_type="GPU",
+                devices="0"
+            )
+            # Train and time this fold
+            model.fit(X_train, y_train)
+            preds = model.predict(X_test)
+            rmse = np.sqrt(mean_squared_error(y_test, preds))
+            r2   = r2_score(y_test, preds)
+            pcc  = pearsonr(y_test, preds)[0]
+            records.append({
+                "repeat": repeat,
+                "fold": fold_idx,
+                "with_physical": include_phys,
+                "pearson_r": pcc,
+                "r2": r2,
+                "rmse": rmse
+            })
+# Aggregate metrics
+metrics_df = pd.DataFrame(records)
+# Save to CSV
+out_dir = "metrics"
+os.makedirs(out_dir, exist_ok=True)
+csv_path = os.path.join(out_dir, "InteractionMetrics.csv")
+metrics_df.to_csv(csv_path, index=False)
+print(f"All metrics saved to {csv_path}")
+# Conduct independent t tests for each metric
+results = {}
+for metric in ["pearson_r", "r2", "rmse"]:
+    grp_with = metrics_df.loc[metrics_df.with_physical, metric]
+    grp_without = metrics_df.loc[~metrics_df.with_physical, metric]
+    t_stat, p_val = ttest_ind(grp_with, grp_without, equal_var=False)
+    results[metric] = (t_stat, p_val)
+print("\nT test results comparing with vs without physical features:")
+for m, (t_stat, p_val) in results.items():
+    print(f"{m} → t = {t_stat:.3f}, p = {p_val:.3f}")