# app.py
"""Streamlit front end for a TF-IDF + GraphSAGE disinformation classifier.

The script loads a fitted TF-IDF vectorizer and a pickled torch model from the
working directory, lets the user paste text, and shows the two class
probabilities returned by the model.
"""
import streamlit as st
import joblib
import nltk
import torch
import torch.nn.functional as F
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer


# ——— 1) NLTK setup ———
@st.cache_resource
def _load_stop_words() -> frozenset:
    """Return the English stop-word set, downloading the corpus only if absent.

    Streamlit re-executes this script on every widget interaction, so the
    corpus is fetched lazily (only after a failed lookup) and the resulting
    set is cached for the lifetime of the server process.
    """
    try:
        words = stopwords.words('english')
    except LookupError:
        # Corpus not installed locally yet: fetch it once, then retry.
        nltk.download('stopwords')
        words = stopwords.words('english')
    return frozenset(words)


_STOP_WORDS = _load_stop_words()
_TOKENIZER = RegexpTokenizer(r'\w+')


def preprocess_text(text: str) -> str:
    """Lower-case *text*, split it on ``\\w+`` runs, and drop English stop words.

    Returns the surviving tokens joined by single spaces (an empty string when
    nothing survives).
    """
    tokens = _TOKENIZER.tokenize(text.lower())
    return " ".join(t for t in tokens if t not in _STOP_WORDS)


# ——— 2) Load heavy resources once ———
@st.cache_resource
def load_resources():
    """Load the TF-IDF vectorizer and the model from disk and return them.

    Returns:
        A ``(tfidf, sage_model)`` tuple; the model is switched to eval mode.

    SECURITY NOTE: ``joblib.load`` unpickles arbitrary objects. Only load
    ``.pkl`` files that come from a trusted source.
    """
    tfidf: TfidfVectorizer = joblib.load("tfidf_vectorizer.pkl")
    sage_model: torch.nn.Module = joblib.load("sage_model.pkl")
    sage_model.eval()
    return tfidf, sage_model


tfidf, sage_model = load_resources()

# ——— 3) Streamlit UI ———
st.title("Disinformation Detection")
st.write(
    " Paste in some text and click **Predict**. "
    "The model will output the probability it’s **True Information** "
    "vs. **Disinformation**. "
)

user_input = st.text_area("Your text here", height=200)

if st.button("Predict"):
    if not user_input.strip():
        st.warning("Please enter some text first.")
    else:
        # Preprocess & vectorize
        cleaned = preprocess_text(user_input)
        vec = tfidf.transform([cleaned]).toarray()
        x = torch.from_numpy(vec).float()  # one row per input document

        # Empty graph (no edges) so the model can still be called with an
        # edge_index argument for a single isolated node.
        edge_index = torch.empty((2, 0), dtype=torch.long)

        # Inference
        with torch.no_grad():
            logits = sage_model(x, edge_index)
            # softmax over the class axis yields the same values as exp() when
            # the model already emits log-softmax scores, and additionally
            # normalises correctly if it emits raw logits.
            # NOTE(review): index 1 is treated as "true", index 0 as
            # "disinformation" — confirm against the training label encoding.
            probs = F.softmax(logits, dim=-1).numpy()[0]

        # Display
        st.markdown("### Prediction probabilities")
        st.write(f"• 🔵 True information: {probs[1]:.2%}")
        st.write(f"• 🔴 Disinformation: {probs[0]:.2%}")

        verdict = "✅ Likely TRUE" if probs[1] > probs[0] else "❌ Likely DISINFORMATION"
        st.markdown(f"## **{verdict}**")