Update train.py
Browse files
train.py
CHANGED
|
@@ -1,93 +1,94 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import pandas as pd
|
| 3 |
-
import numpy as np
|
| 4 |
-
from sklearn.model_selection import KFold
|
| 5 |
-
from sklearn.metrics import mean_squared_error, r2_score
|
| 6 |
-
from scipy.stats import pearsonr, ttest_ind
|
| 7 |
-
from catboost import CatBoostRegressor
|
| 8 |
-
|
| 9 |
-
# Load dataset
|
| 10 |
-
data = pd.read_csv("embeddings/ESM2_interaction.csv")
|
| 11 |
-
|
| 12 |
-
# Fill missing feature strings
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
"
|
| 67 |
-
"
|
| 68 |
-
"
|
| 69 |
-
"
|
| 70 |
-
"
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import numpy as np
|
| 4 |
+
from sklearn.model_selection import KFold
|
| 5 |
+
from sklearn.metrics import mean_squared_error, r2_score
|
| 6 |
+
from scipy.stats import pearsonr, ttest_ind
|
| 7 |
+
from catboost import CatBoostRegressor
|
| 8 |
+
|
| 9 |
+
# Load the dataset. Point this at the CSV that matches the model being
# trained (e.g., embeddings only, or embeddings plus physical terms).
data = pd.read_csv("embeddings/ESM2_interaction.csv")

# Ligand and Receptor Features are ESM2 embeddings; Physical Features are
# PyRosetta features. Each cell holds a comma-separated string of floats.
# Which of these are used depends on the kind of model being trained.
FEATURE_COLUMNS = ["Ligand Features", "Receptor Features", "Physical Features"]


def _parse_float_list(cell):
    """Return the floats in a comma-separated string, skipping blank tokens."""
    return [float(token) for token in str(cell).split(",") if token.strip()]


# Missing cells are replaced by "" first, so they parse to an empty list
# instead of failing on float("nan")-style strings.
for col in FEATURE_COLUMNS:
    data[col] = data[col].fillna("").apply(_parse_float_list)

# Build one 2-D array per feature group (one row per sample).
X_ligand = np.vstack(data["Ligand Features"].values)
X_receptor = np.vstack(data["Receptor Features"].values)
# The with/without-physical comparison stacks X_physical onto the embeddings,
# so it has to be built here; leaving it undefined raises NameError later.
X_physical = np.vstack(data["Physical Features"].values)

# Convert KD(M) into log10 scale. log10 is undefined for non-positive or
# missing values, so fail early with a clear message rather than handing
# -inf/NaN targets to the model.
raw_y = data["KD(M)"].values
if not (raw_y > 0).all():
    raise ValueError("KD(M) must contain only positive values to take log10")
y = np.log10(raw_y)
|
| 31 |
+
|
| 32 |
+
# One dict per (repeat, fold, feature setting) with the held-out metrics.
records = []

# The feature matrices do not change between repeats or folds, so build them
# once instead of re-stacking inside the loops.
X_base = np.hstack([X_ligand, X_receptor])
X_full = np.hstack([X_base, X_physical])

# Repeat 5×5-fold CV, with and without physical features
for repeat in range(1, 6):
    # The repeat index seeds the shuffle, so each repeat uses a different
    # partition while both feature settings within a repeat are scored on
    # the same folds.
    kf = KFold(n_splits=5, shuffle=True, random_state=repeat)

    for include_phys in (False, True):
        X_data = X_full if include_phys else X_base

        for fold_idx, (train_idx, test_idx) in enumerate(kf.split(X_data), start=1):
            X_train, X_test = X_data[train_idx], X_data[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            # A fresh model per fold with the chosen hyperparameters,
            # trained on GPU device 0.
            model = CatBoostRegressor(
                iterations=2000,
                learning_rate=0.08,
                depth=4,
                verbose=500,
                task_type="GPU",
                devices="0",
            )

            # Train on this fold's training split.
            model.fit(X_train, y_train)

            # Score on the held-out split (targets are log10 KD).
            preds = model.predict(X_test)
            rmse = np.sqrt(mean_squared_error(y_test, preds))
            r2 = r2_score(y_test, preds)
            pcc = pearsonr(y_test, preds)[0]

            records.append({
                "repeat": repeat,
                "fold": fold_idx,
                "with_physical": include_phys,
                "pearson_r": pcc,
                "r2": r2,
                "rmse": rmse,
            })
|
| 73 |
+
|
| 74 |
+
# Gather every per-fold record into a single table.
metrics_df = pd.DataFrame(data=records)

# Persist the table as CSV, creating the output folder when it is missing.
out_dir = "metrics"
csv_path = os.path.join(out_dir, "InteractionMetrics.csv")
os.makedirs(out_dir, exist_ok=True)
metrics_df.to_csv(csv_path, index=False)
print(f"All metrics saved to {csv_path}")
|
| 83 |
+
|
| 84 |
+
# Compare each metric between the two feature settings with an
# unequal-variance (Welch) independent t test.
# NOTE(review): both settings are scored on the same splits, so the rows are
# paired by (repeat, fold); a paired test may be more appropriate - confirm.
uses_physical = metrics_df["with_physical"]
results = {}
for metric in ("pearson_r", "r2", "rmse"):
    scores_with = metrics_df.loc[uses_physical, metric]
    scores_without = metrics_df.loc[~uses_physical, metric]
    t_stat, p_val = ttest_ind(scores_with, scores_without, equal_var=False)
    results[metric] = (t_stat, p_val)

# Report the statistic and p-value for every metric, three decimals each.
print("\nT test results comparing with vs without physical features:")
for m, (t_stat, p_val) in results.items():
    print(f"{m} → t = {t_stat:.3f}, p = {p_val:.3f}")
|