hbp5181 committed on
Commit
4f89f53
·
verified ·
1 Parent(s): 2e7b09e

Update train.py

Browse files
Files changed (1) hide show
  1. train.py +94 -93
train.py CHANGED
@@ -1,93 +1,94 @@
1
- import os
2
- import pandas as pd
3
- import numpy as np
4
- from sklearn.model_selection import KFold
5
- from sklearn.metrics import mean_squared_error, r2_score
6
- from scipy.stats import pearsonr, ttest_ind
7
- from catboost import CatBoostRegressor
8
-
9
- # Load dataset
10
- data = pd.read_csv("embeddings/ESM2_interaction.csv")
11
-
12
- # Fill missing feature strings
13
- for col in ["Ligand Features", "Receptor Features", "Physical Features"]:
14
- data[col] = data[col].fillna("")
15
-
16
- # Parse comma-separated floats
17
- for col in ["Ligand Features", "Receptor Features", "Physical Features"]:
18
- data[col] = data[col].apply(
19
- lambda s: [float(x) for x in str(s).split(",") if x.strip()]
20
- )
21
-
22
- # Build feature arrays
23
- X_ligand = np.vstack(data["Ligand Features"].values)
24
- X_receptor = np.vstack(data["Receptor Features"].values)
25
- X_physical = np.vstack(data["Physical Features"].values)
26
-
27
- # Convert KD(M) into log10 scale
28
- raw_y = data["KD(M)"].values
29
- y = np.log10(raw_y) # assumes all KD values are positive
30
-
31
- records = []
32
-
33
- # Repeat 5×5-fold CV, with and without physical features
34
- for repeat in range(1, 6):
35
- kf = KFold(n_splits=5, shuffle=True, random_state=repeat)
36
-
37
- for include_phys in (False, True):
38
- X_base = np.hstack([X_ligand, X_receptor])
39
- X_full = np.hstack([X_base, X_physical])
40
- X_data = X_full if include_phys else X_base
41
-
42
- for fold_idx, (train_idx, test_idx) in enumerate(kf.split(X_data), start=1):
43
- X_train, X_test = X_data[train_idx], X_data[test_idx]
44
- y_train, y_test = y[train_idx], y[test_idx]
45
-
46
- # Initialize with your chosen hyperparameters and GPU support
47
- model = CatBoostRegressor(
48
- iterations=2000,
49
- learning_rate=0.08,
50
- depth=4,
51
- verbose=500,
52
- task_type="GPU",
53
- devices="0"
54
- )
55
-
56
- # Train and time this fold
57
- model.fit(X_train, y_train)
58
-
59
- preds = model.predict(X_test)
60
- rmse = np.sqrt(mean_squared_error(y_test, preds))
61
- r2 = r2_score(y_test, preds)
62
- pcc = pearsonr(y_test, preds)[0]
63
-
64
- records.append({
65
- "repeat": repeat,
66
- "fold": fold_idx,
67
- "with_physical": include_phys,
68
- "pearson_r": pcc,
69
- "r2": r2,
70
- "rmse": rmse
71
- })
72
-
73
- # Aggregate metrics
74
- metrics_df = pd.DataFrame(records)
75
-
76
- # Save to CSV
77
- out_dir = "metrics"
78
- os.makedirs(out_dir, exist_ok=True)
79
- csv_path = os.path.join(out_dir, "InteractionMetrics.csv")
80
- metrics_df.to_csv(csv_path, index=False)
81
- print(f"All metrics saved to {csv_path}")
82
-
83
- # Conduct independent t tests for each metric
84
- results = {}
85
- for metric in ["pearson_r", "r2", "rmse"]:
86
- grp_with = metrics_df.loc[metrics_df.with_physical, metric]
87
- grp_without = metrics_df.loc[~metrics_df.with_physical, metric]
88
- t_stat, p_val = ttest_ind(grp_with, grp_without, equal_var=False)
89
- results[metric] = (t_stat, p_val)
90
-
91
- print("\nT test results comparing with vs without physical features:")
92
- for m, (t_stat, p_val) in results.items():
93
- print(f"{m} → t = {t_stat:.3f}, p = {p_val:.3f}")
 
 
1
+ import os
2
+ import pandas as pd
3
+ import numpy as np
4
+ from sklearn.model_selection import KFold
5
+ from sklearn.metrics import mean_squared_error, r2_score
6
+ from scipy.stats import pearsonr, ttest_ind
7
+ from catboost import CatBoostRegressor
8
+
9
+ # Load the dataset; point this at the CSV that matches the model being trained (e.g., embeddings only, or embeddings plus physical terms)
10
+ data = pd.read_csv("embeddings/ESM2_interaction.csv")
11
+
12
+ # Fill missing feature strings. (Features are chosen based on what kind of model will be trained.
13
+ # Ligand and Receptor Features are ESM2 embeddings and Physical Features are PyRosetta features.)
14
+ for col in ["Ligand Features", "Receptor Features", "Physical Features"]:
15
+ data[col] = data[col].fillna("")
16
+
17
+ # Parse comma-separated floats
18
+ for col in ["Ligand Features", "Receptor Features", "Physical Features"]:
19
+ data[col] = data[col].apply(
20
+ lambda s: [float(x) for x in str(s).split(",") if x.strip()]
21
+ )
22
+
23
+ # Build feature arrays
24
+ X_ligand = np.vstack(data["Ligand Features"].values)
25
+ X_receptor = np.vstack(data["Receptor Features"].values)
26
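+ # optional: X_physical = np.vstack(data["Physical Features"].values)  # NOTE: must be uncommented to run as written; the CV loop below references X_physical unconditionally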
27
+
28
+ # Convert KD(M) into log10 scale
29
+ raw_y = data["KD(M)"].values
30
+ y = np.log10(raw_y) # assumes all KD values are positive
31
+
32
+ records = []
33
+
34
+ # Repeat 5×5-fold CV, with and without physical features
35
+ for repeat in range(1, 6):
36
+ kf = KFold(n_splits=5, shuffle=True, random_state=repeat)
37
+
38
+ for include_phys in (False, True):
39
+ X_base = np.hstack([X_ligand, X_receptor])
40
+ X_full = np.hstack([X_base, X_physical])
41
+ X_data = X_full if include_phys else X_base
42
+
43
+ for fold_idx, (train_idx, test_idx) in enumerate(kf.split(X_data), start=1):
44
+ X_train, X_test = X_data[train_idx], X_data[test_idx]
45
+ y_train, y_test = y[train_idx], y[test_idx]
46
+
47
+ # Initialize with your chosen hyperparameters and GPU support
48
+ model = CatBoostRegressor(
49
+ iterations=2000,
50
+ learning_rate=0.08,
51
+ depth=4,
52
+ verbose=500,
53
+ task_type="GPU",
54
+ devices="0"
55
+ )
56
+
57
+ # Train the model on this fold's training split
58
+ model.fit(X_train, y_train)
59
+
60
+ preds = model.predict(X_test)
61
+ rmse = np.sqrt(mean_squared_error(y_test, preds))
62
+ r2 = r2_score(y_test, preds)
63
+ pcc = pearsonr(y_test, preds)[0]
64
+
65
+ records.append({
66
+ "repeat": repeat,
67
+ "fold": fold_idx,
68
+ "with_physical": include_phys,
69
+ "pearson_r": pcc,
70
+ "r2": r2,
71
+ "rmse": rmse
72
+ })
73
+
74
+ # Aggregate metrics
75
+ metrics_df = pd.DataFrame(records)
76
+
77
+ # Save to CSV
78
+ out_dir = "metrics"
79
+ os.makedirs(out_dir, exist_ok=True)
80
+ csv_path = os.path.join(out_dir, "InteractionMetrics.csv")
81
+ metrics_df.to_csv(csv_path, index=False)
82
+ print(f"All metrics saved to {csv_path}")
83
+
84
+ # Conduct independent two-sample t-tests (Welch's, unequal variances) for each metric
85
+ results = {}
86
+ for metric in ["pearson_r", "r2", "rmse"]:
87
+ grp_with = metrics_df.loc[metrics_df.with_physical, metric]
88
+ grp_without = metrics_df.loc[~metrics_df.with_physical, metric]
89
+ t_stat, p_val = ttest_ind(grp_with, grp_without, equal_var=False)
90
+ results[metric] = (t_stat, p_val)
91
+
92
+ print("\nT test results comparing with vs without physical features:")
93
+ for m, (t_stat, p_val) in results.items():
94
+ print(f"{m} → t = {t_stat:.3f}, p = {p_val:.3f}")