Spaces:
Build error
Build error
| #import the necessary dependencies | |
| import pandas as pd | |
| import numpy as np | |
| import lightgbm as lgb | |
| from lightgbm.callback import early_stopping | |
| import shap | |
| import streamlit as st | |
| from sklearn.ensemble import GradientBoostingRegressor | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.metrics import mean_squared_error | |
| from sklearn.metrics import r2_score | |
| from sklearn.metrics import accuracy_score | |
# Load the training and test tables from CSV files given by relative path.
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv")
)
# Clean the data: integer codes for every categorical column, keyed by column
# name and then by the category label as it appears in the CSV.
# Several columns use an identical scale; those scales are declared once here
# and copied per column so each column still owns an independent dict.
_QUALITY_SCALE = {"Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 0}
_QUALITY_SCALE_WITH_NA = {"Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5, "NA": 0}
_CONDITION_SCALE = {
    "Artery": 1, "Feedr": 2, "Norm": 3, "RRNn": 4, "RRAn": 5,
    "PosN": 6, "PosA": 7, "RRNe": 8, "RRAe": 0,
}
_BSMT_FINISH_SCALE = {
    "GLQ": 1, "ALQ": 2, "BLQ": 3, "Rec": 4, "LwQ": 5, "Unf": 6, "NA": 0,
}

cleanup_cats = {
    "MSZoning": {
        "A": 1, "C (all)": 2, "FV": 3, "I": 4, "RH": 5, "RL": 6, "RP": 7,
        "RM": 0,
    },
    "Street": {"Grvl": 1, "Pave": 0},
    "Alley": {"NA": 1, "Grvl": 2, "Pave": 0},
    "LotShape": {"Reg": 1, "IR1": 2, "IR2": 3, "IR3": 0},
    "LandContour": {"Lvl": 1, "Bnk": 2, "HLS": 3, "Low": 0},
    "Utilities": {"AllPub": 0, "NoSewr": 3, "NoSeWa": 2, "ELO": 1},
    "LotConfig": {"Inside": 1, "Corner": 2, "CulDSac": 3, "FR2": 4, "FR3": 0},
    "LandSlope": {"Gtl": 1, "Mod": 2, "Sev": 0},
    "Neighborhood": {
        "Blmngtn": 1, "Blueste": 2, "BrDale": 3, "BrkSide": 4, "ClearCr": 5,
        "CollgCr": 6, "Crawfor": 7, "Edwards": 8, "Gilbert": 9, "IDOTRR": 10,
        "MeadowV": 11, "Mitchel": 12, "NAmes": 13, "NoRidge": 14,
        "NPkVill": 15, "NridgHt": 16, "NWAmes": 17, "OldTown": 18,
        "SWISU": 19, "Sawyer": 20, "SawyerW": 21, "Somerst": 22,
        "StoneBr": 23, "Timber": 24, "Veenker": 0,
    },
    "Condition1": dict(_CONDITION_SCALE),
    "Condition2": dict(_CONDITION_SCALE),
    "BldgType": {
        "1Fam": 1, "2fmCon": 2, "Duplex": 3, "TwnhsE": 4, "TwnhsI": 5,
        "Twnhs": 0,
    },
    "HouseStyle": {
        "1Story": 1, "1.5Fin": 2, "1.5Unf": 3, "2Story": 4, "2.5Fin": 5,
        "2.5Unf": 6, "SFoyer": 7, "SLvl": 0,
    },
    "RoofStyle": {
        "Flat": 1, "Gable": 2, "Gambrel": 3, "Hip": 4, "Mansard": 5,
        "Shed": 0,
    },
    "RoofMatl": {
        "ClyTile": 1, "CompShg": 2, "Membran": 3, "Metal": 4, "Roll": 5,
        "Tar&Grv": 6, "WdShake": 7, "WdShngl": 0,
    },
    "Exterior1st": {
        "AsbShng": 1, "AsphShn": 2, "BrkComm": 3, "BrkFace": 4, "CBlock": 5,
        "CemntBd": 6, "HdBoard": 7, "ImStucc": 8, "MetalSd": 9, "Other": 10,
        "Plywood": 11, "Precast": 12, "Stone": 13, "Stucco": 14,
        "VinylSd": 15, "WdShing": 16, "Wd Sdng": 0,
    },
    "Exterior2nd": {
        "AsbShng": 1, "AsphShn": 2, "Brk Cmn": 3, "BrkFace": 4, "CBlock": 5,
        "CmentBd": 6, "HdBoard": 7, "ImStucc": 8, "MetalSd": 9, "Other": 10,
        "Plywood": 11, "Precast": 12, "Stone": 13, "Stucco": 14,
        "VinylSd": 15, "Wd Shng": 16, "Wd Sdng": 0,
    },
    "MasVnrType": {"None": 1, "BrkCmn": 2, "BrkFace": 3, "CBlock": 4, "Stone": 0},
    "ExterQual": dict(_QUALITY_SCALE),
    "ExterCond": dict(_QUALITY_SCALE),
    "Foundation": {
        "BrkTil": 1, "CBlock": 2, "PConc": 3, "Slab": 4, "Stone": 5,
        "Wood": 0,
    },
    "BsmtQual": dict(_QUALITY_SCALE_WITH_NA),
    "BsmtCond": dict(_QUALITY_SCALE_WITH_NA),
    "BsmtExposure": {"NA": 1, "No": 2, "Mn": 3, "Av": 4, "Gd": 0},
    "BsmtFinType1": dict(_BSMT_FINISH_SCALE),
    "BsmtFinType2": dict(_BSMT_FINISH_SCALE),
    "Heating": {
        "Floor": 1, "GasA": 2, "GasW": 3, "Grav": 4, "OthW": 5, "Wall": 0,
    },
    "HeatingQC": dict(_QUALITY_SCALE),
    "CentralAir": {"N": 0, "Y": 1},
    "Electrical": {"SBrkr": 1, "FuseA": 2, "FuseF": 3, "FuseP": 4, "Mix": 0},
    "KitchenQual": dict(_QUALITY_SCALE),
    "Functional": {
        "Typ": 1, "Min1": 2, "Min2": 3, "Mod": 4, "Maj1": 5, "Maj2": 6,
        "Sev": 7, "Sal": 0,
    },
    "FireplaceQu": dict(_QUALITY_SCALE_WITH_NA),
    "GarageType": {
        "2Types": 1, "Attchd": 2, "Basment": 3, "BuiltIn": 4, "CarPort": 5,
        "Detchd": 6, "NA": 0,
    },
    "GarageFinish": {"NA": 1, "Unf": 2, "RFn": 3, "Fin": 0},
    "GarageQual": dict(_QUALITY_SCALE_WITH_NA),
    "GarageCond": dict(_QUALITY_SCALE_WITH_NA),
    "PavedDrive": {"N": 0, "P": 1, "Y": 2},
    "PoolQC": dict(_QUALITY_SCALE),
    "Fence": {"NA": 1, "MnWw": 2, "GdWo": 3, "MnPrv": 4, "GdPrv": 0},
    "MiscFeature": {
        "Elev": 1, "Gar2": 2, "Othr": 3, "Shed": 4, "TenC": 5, "NA": 0,
    },
    "SaleType": {
        "WD": 1, "CWD": 2, "VWD": 3, "New": 4, "COD": 5, "Con": 6,
        "ConLw": 7, "ConLI": 8, "ConLD": 9, "Oth": 0,
    },
    "SaleCondition": {
        "Normal": 1, "Abnorml": 2, "AdjLand": 3, "Alloca": 4, "Family": 5,
        "Partial": 0,
    },
}
# Drop the "Id" column from both frames.
train_data = train_data.drop(columns="Id")
test_data = test_data.drop(columns="Id")

# Encode each categorical label with its integer code from cleanup_cats.
train_data = train_data.replace(cleanup_cats)
test_data = test_data.replace(cleanup_cats)

# Collect every column that still has at least one missing value in the
# TRAINING frame and drop that same set of columns from both frames.
# (Original author's rationale: the affected columns are mostly missing --
# not verified here.)
# Note the capitalised names: the filtered frames are bound to Train_data /
# Test_data, while train_data / test_data keep the unfiltered versions.
removals = train_data.columns[train_data.isna().any(axis=0)]
Train_data = train_data.drop(columns=removals)
Test_data = test_data.drop(columns=removals)
# Split the filtered training frame: 80% for fitting, 20% held out, shuffled
# with a fixed seed. 'SalePrice' is the target; every other column is a feature.
X_train, X_test, y_train, y_test = train_test_split(
    Train_data.drop(columns='SalePrice'),
    Train_data['SalePrice'],
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

# Wrap both partitions as LightGBM datasets for lgb.train.
training_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_test, label=y_test)
# Hyper-parameters for the tuned model (per the original author, these values
# came out of an Optuna search).
params = dict(
    objective="regression",
    n_estimators=10000,
    learning_rate=0.08828308704850689,
    num_leaves=256,
    max_depth=11,
    min_data_in_leaf=100,
    lambda_l1=55,
    lambda_l2=60,
    min_gain_to_split=13.827512822883651,
    bagging_fraction=1.0,
    bagging_freq=1,
    feature_fraction=0.4,
)

# Train the tuned model, evaluating on the held-out split and using the
# early-stopping callback with a 300-round window.
model = lgb.train(
    params,
    training_data,
    valid_sets=valid_data,
    callbacks=[early_stopping(stopping_rounds=300)],
)
# Unoptimized (hand-picked) hyper-parameters for the baseline model.
# NOTE: this rebinds the module-level name `params`.
params = {
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'max_depth': 2,
    'n_estimators': 10000,
    'num_leaves': 4,
    'verbose': 1,
    'learning_rate': 0.01,
    'min_data_in_leaf': 100,
}

# Train the baseline model on the same train/validation datasets.
# Early stopping is supplied through the callback API: the
# `early_stopping_rounds` keyword of lgb.train() was removed in LightGBM 4.0,
# and the callback form also matches how the tuned model is trained.
lgbm = lgb.train(
    params,
    training_data,
    valid_sets=valid_data,
    callbacks=[early_stopping(3000)],
)
# Function to call for predictions.
def make_a_prediction(X):
    """Predict with both trained models.

    Args:
        X: Feature rows, passed unchanged to each model's ``predict``.

    Returns:
        A two-element list: the result of ``model.predict(X)`` (tuned
        parameters) followed by the result of ``lgbm.predict(X)``
        (hand-picked parameters).
    """
    return [model.predict(X), lgbm.predict(X)]


# Run both models on the held-out split and print the raw predictions.
predictions = make_a_prediction(X_test)
print(predictions)
def _render_shap_report(booster, data):
    """Render the SHAP report for one trained model into the Streamlit page.

    Shared implementation behind ``create_opt_shap_models`` and
    ``create_unopt_shap_models``, which previously duplicated this code.

    Steps, in order:
      1. build a ``shap.TreeExplainer`` for ``booster`` and compute SHAP
         values for ``data``;
      2. build a force plot (returned to the caller, not rendered here);
      3. draw a decision plot and a summary plot, sending each to Streamlit
         with ``st.pyplot``;
      4. compute SHAP interaction values and write the first entry, rounded
         to 2 decimals and limited to 60 rows, with ``st.write``.

    Args:
        booster: The trained model to explain.
        data: Feature frame; its ``columns`` supply the feature names.

    Returns:
        The object returned by ``shap.force_plot``.
    """
    explainer = shap.TreeExplainer(booster)
    shap_values = explainer.shap_values(data)
    feature_names = data.columns

    # The original called initjs() before every plot; it is called once here,
    # before the force plot is built.
    shap.initjs()
    force_plot = shap.force_plot(
        explainer.expected_value,
        shap_values=shap_values,
        feature_names=feature_names,
    )

    shap.decision_plot(
        explainer.expected_value,
        shap_values,
        feature_names=np.array(feature_names),
    )
    st.pyplot(bbox_inches='tight')

    shap.summary_plot(shap_values=shap_values, feature_names=feature_names)
    st.pyplot(bbox_inches='tight')

    interaction_values = explainer.shap_interaction_values(data)
    # Only the first entry is displayed (presumably the interaction matrix of
    # the first row of `data` -- TODO confirm). The original also evaluated
    # this rounding once as a bare statement and discarded the result; that
    # dead statement is gone.
    first_entry = pd.DataFrame(interaction_values[0].round(2))
    st.write(first_entry.head(60))

    return force_plot


# To be called when needed for optimized results.
def create_opt_shap_models(data):
    """Render the SHAP report for the tuned model (``model``).

    Args:
        data: Feature frame to explain.

    Returns:
        The force-plot object for the tuned model.
    """
    return _render_shap_report(model, data)


# To be called when needed for unoptimized (hand-picked) results.
def create_unopt_shap_models(data):
    """Render the SHAP report for the hand-picked baseline model (``lgbm``).

    Args:
        data: Feature frame to explain.

    Returns:
        The force-plot object for the baseline model.
    """
    return _render_shap_report(lgbm, data)