# app.py
import streamlit as st
import joblib
import nltk
import torch
import torch.nn.functional as F
import numpy as np

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

# ——— 1) NLTK setup ———
# This module-level code runs every time the script is executed, so only
# contact the NLTK download server when the stopwords corpus is missing.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords', quiet=True)

# Set membership keeps the per-token stop-word test O(1).
_STOP_WORDS = set(stopwords.words('english'))
# Tokens are maximal runs of word characters; punctuation is dropped.
_TOKENIZER = RegexpTokenizer(r'\w+')

def preprocess_text(text: str) -> str:
    """Return *text* lowercased, tokenized, and stripped of stop words.

    The lowercased text is split with the module-level ``_TOKENIZER``;
    tokens found in ``_STOP_WORDS`` are discarded and the remaining tokens
    are joined with single spaces, preserving their original order.
    """
    words = _TOKENIZER.tokenize(text.lower())
    kept = (word for word in words if word not in _STOP_WORDS)
    return " ".join(kept)

# ——— 2) Load heavy resources once ———
@st.cache_resource
def load_resources():
    """Load the TF-IDF vectorizer and the model from their pickle files.

    Both paths are relative, so they resolve against the current working
    directory. The function is wrapped in ``st.cache_resource`` so the
    loaded objects are cached rather than re-read on each call.

    Returns:
        A ``(vectorizer, model)`` tuple: the object stored in
        ``tfidf_vectorizer.pkl`` and the object stored in
        ``sage_model.pkl`` after ``eval()`` has been called on it.
    """
    # NOTE: joblib.load unpickles its input, which can execute arbitrary
    # code; these files must come from a trusted source.
    vectorizer: TfidfVectorizer = joblib.load("tfidf_vectorizer.pkl")
    model: torch.nn.Module = joblib.load("sage_model.pkl")
    # Switch to inference mode before handing the model to callers.
    model.eval()
    return vectorizer, model

# load_resources() is wrapped in st.cache_resource, so repeated executions of
# this script reuse the already-loaded vectorizer and model.
tfidf, sage_model = load_resources()

# ——— 3) Streamlit UI ———
st.title("Disinformation Detection")
st.write(
    """
    Paste in some text and click **Predict**.
    The model will output the probability it’s **True Information** vs. **Disinformation**.
    """
)

user_input = st.text_area("Your text here", height=200)

if st.button("Predict"):
    if not user_input.strip():
        st.warning("Please enter some text first.")
    else:
        # Preprocess & vectorize
        cleaned = preprocess_text(user_input)
        vec = tfidf.transform([cleaned]).toarray()

        if not vec.any():
            # An all-zero feature vector means the text contained only stop
            # words, punctuation, or words outside the vectorizer's
            # vocabulary. The model output would not depend on the input,
            # so refuse to show a verdict instead of a misleading one.
            st.warning(
                "None of the words in your text are known to the model "
                "(after removing stop words), so no prediction can be made. "
                "Please try a longer or more specific text."
            )
        else:
            x = torch.from_numpy(vec).float()   # one row for the single input text

            # Empty graph so GraphSAGE layers still run
            edge_index = torch.empty((2, 0), dtype=torch.long)

            # Inference
            with torch.no_grad():
                # assumed shape [1, 2] (one row, two classes) — TODO confirm
                logits = sage_model(x, edge_index)
                # softmax over the class axis yields probabilities whether the
                # model returns raw logits or log-softmax scores: applied to
                # log-probabilities it returns the same probabilities that
                # exp() would, and applied to raw logits it normalizes them.
                probs = F.softmax(logits, dim=-1).numpy()[0]

            # Display
            # NOTE(review): index 1 is treated as "true information" and index 0
            # as "disinformation" — confirm against the training label encoding.
            st.markdown("### Prediction probabilities")
            st.write(f"• 🔵 True information:  {probs[1]:.2%}")
            st.write(f"• 🔴 Disinformation:    {probs[0]:.2%}")

            # Ties fall through to the disinformation verdict.
            verdict = "✅ Likely TRUE" if probs[1] > probs[0] else "❌ Likely DISINFORMATION"
            st.markdown(f"## **{verdict}**")