File size: 3,212 Bytes
f7f3a00
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# streamlit_app.py
import streamlit as st
import pandas as pd
import numpy as np
import joblib
import tensorflow as tf

# ─── Caching loaders so they only run once ───────────────────────────────────
@st.cache_resource
def load_preprocessor(path: str = "preprocessor.pkl"):
    """Load the serialised preprocessor from disk and cache the result.

    ``st.cache_resource`` is the documented replacement for the deprecated
    ``st.cache(allow_output_mutation=True)``: the loaded object is stored
    as-is (not copied) and reused on later calls with the same ``path``.

    Args:
        path: Location of the joblib file to load.

    Returns:
        The object that ``joblib.load`` deserialises from ``path``.
    """
    # NOTE: joblib.load unpickles the file, which can execute arbitrary code;
    # only point this at artefacts you trust.
    return joblib.load(path)

@st.cache_resource
def load_label_encoder(path: str = "label_encoder.pkl"):
    """Load the serialised label encoder from disk and cache the result.

    ``st.cache_resource`` is the documented replacement for the deprecated
    ``st.cache(allow_output_mutation=True)``: the loaded object is stored
    as-is (not copied) and reused on later calls with the same ``path``.

    Args:
        path: Location of the joblib file to load.

    Returns:
        The object that ``joblib.load`` deserialises from ``path``.
    """
    # NOTE: joblib.load unpickles the file, which can execute arbitrary code;
    # only point this at artefacts you trust.
    return joblib.load(path)

@st.cache_resource
def load_model(path: str = "keystroke_dnn.h5"):
    """Load the saved Keras model from disk and cache the result.

    ``st.cache_resource`` is the documented replacement for the deprecated
    ``st.cache(allow_output_mutation=True)``: the loaded model is stored
    as-is (not copied) and reused on later calls with the same ``path``.

    Args:
        path: Location of the saved model file.

    Returns:
        The model returned by ``tf.keras.models.load_model(path)``.
    """
    return tf.keras.models.load_model(path)

# ─── Prediction function ────────────────────────────────────────────────────
def predict_subjects(df_raw):
    """Predict a subject label for every row of a keystroke-feature table.

    The ``subject``, ``sessionIndex`` and ``rep`` columns are removed when
    present. The remaining data is then selected in the column order stored
    on the preprocessor, transformed, and passed to the model.

    Args:
        df_raw: DataFrame with one sample per row.

    Returns:
        DataFrame whose first column is ``predicted_subject``, followed by
        one ``prob_<class>`` column per entry of
        ``label_encoder.categories_[0]``.

    Raises:
        KeyError: from the column selection when a column listed by the
            preprocessor is absent from ``df_raw``.
    """
    scaler = load_preprocessor()
    encoder = load_label_encoder()
    network = load_model()

    # Bookkeeping columns are optional in the input; remove whichever exist.
    bookkeeping = [
        name for name in ("subject", "sessionIndex", "rep")
        if name in df_raw.columns
    ]
    features = df_raw.drop(columns=bookkeeping)

    # Assumes a ColumnTransformer-style ``transformers_`` list whose first
    # entry's third element is the training-time column selection - confirm
    # against how the preprocessor was fitted.
    expected_cols = scaler.transformers_[0][2]
    ordered = features[expected_cols]

    probabilities = network.predict(scaler.transform(ordered))
    winners = np.argmax(probabilities, axis=1)

    # ``categories_[0]`` is indexed by class position to map the argmax back
    # to the original labels (presumably a fitted OneHotEncoder - verify).
    classes = encoder.categories_[0]

    columns = {"predicted_subject": classes[winners]}
    for position, cls in enumerate(classes):
        columns[f"prob_{cls}"] = probabilities[:, position]

    return pd.DataFrame(columns)

# ─── Streamlit UI ──────────────────────────────────────────────────────────
def main():
    """Render the page: upload a CSV, preview it, and show the predictions.

    Returns early (rendering nothing further) while no file has been
    uploaded or when the upload cannot be parsed as CSV. Failures while
    reading or predicting are reported through ``st.error`` rather than
    raised.
    """
    st.set_page_config(page_title="Keystroke Dynamics Auth", layout="wide")
    # Key emoji (U+1F511) written as an escape: the literal had been corrupted
    # by a bad re-encoding and rendered as garbage in the title.
    st.title("\U0001F511 Keystroke Dynamics Authentication")
    st.markdown(
        "Upload a CSV of raw keystroke‐feature vectors (one row per sample). "
        "The app will drop any `subject`/`sessionIndex`/`rep` columns, scale, "
        "run through the DNN, and return predicted subject IDs + confidence scores."
    )

    uploaded = st.file_uploader("Choose CSV file", type="csv")
    if uploaded is None:
        return

    # An empty or malformed upload would otherwise surface as a raw traceback.
    try:
        df = pd.read_csv(uploaded)
    except (
        pd.errors.EmptyDataError,
        pd.errors.ParserError,
        UnicodeDecodeError,
    ) as e:
        st.error(f"Could not read the uploaded CSV: {e}")
        return

    st.write("### Raw feature preview (first 5 rows)")
    st.dataframe(df.head(), use_container_width=True)

    try:
        df_preds = predict_subjects(df)
        st.write("### Predictions")
        st.dataframe(df_preds, use_container_width=True)
    except KeyError as e:
        st.error(f"Missing expected feature column: {e}")
    except Exception as e:
        # UI boundary: show any other failure to the user instead of crashing.
        st.error(f"Error during prediction: {e}")

# Script entry point: build the page only when this file is executed as the
# main module, not when it is imported.
if __name__ == "__main__":
    main()