Spaces:
Running
Running
Upload app.py
Browse files
app.py
CHANGED
|
@@ -11,6 +11,13 @@ import json
|
|
| 11 |
import sqlite3
|
| 12 |
from datetime import datetime
|
| 13 |
import hashlib
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
# SECURITY: the previous revision committed a literal Google API key here.
# A leaked key must be rotated; never hard-code credentials in source.
# Read both values from the environment (Hugging Face Spaces secrets).
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY", "")
SEARCH_ENGINE_ID = os.getenv("SEARCH_ENGINE_ID", "")
|
|
@@ -21,8 +28,23 @@ genai.configure(api_key=GEMINI_API_KEY)
|
|
| 21 |
|
| 22 |
# Knowledge Base Configuration
|
| 23 |
KNOWLEDGE_BASE_DB = "knowledge_base.db"
|
| 24 |
-
CONFIDENCE_THRESHOLD = 0.95 # 95% threshold for
|
| 25 |
-
ENABLE_KNOWLEDGE_BASE_SEARCH =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
print("Loading the DistilBERT model we trained...")
|
| 28 |
try:
|
|
@@ -53,6 +75,141 @@ except Exception as e:
|
|
| 53 |
tokenizer = None
|
| 54 |
model = None
|
| 55 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
# --- KNOWLEDGE BASE MANAGEMENT ---
|
| 57 |
def init_knowledge_base():
|
| 58 |
"""Initialize the SQLite knowledge base"""
|
|
@@ -84,37 +241,73 @@ def add_to_knowledge_base(news_text, prediction, confidence, search_results, gem
|
|
| 84 |
# Create content hash for deduplication
|
| 85 |
content_hash = hashlib.md5(news_text.encode('utf-8')).hexdigest()
|
| 86 |
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
conn.close()
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
(content_hash, news_text, prediction, confidence, search_results, gemini_analysis)
|
| 101 |
-
VALUES (?, ?, ?, ?, ?, ?)
|
| 102 |
-
''', (
|
| 103 |
-
content_hash,
|
| 104 |
-
news_text,
|
| 105 |
-
prediction,
|
| 106 |
-
confidence,
|
| 107 |
-
json.dumps(search_results, ensure_ascii=False),
|
| 108 |
-
gemini_analysis
|
| 109 |
-
))
|
| 110 |
-
|
| 111 |
-
conn.commit()
|
| 112 |
-
conn.close()
|
| 113 |
-
|
| 114 |
-
print(f"✅ Added high-confidence result to knowledge base (confidence: {confidence:.1%})")
|
| 115 |
-
print(f" Hash: {content_hash[:8]}...")
|
| 116 |
-
print(f" Prediction: {prediction}")
|
| 117 |
-
return True
|
| 118 |
|
| 119 |
except Exception as e:
|
| 120 |
print(f"Error adding to knowledge base: {e}")
|
|
@@ -123,37 +316,71 @@ def add_to_knowledge_base(news_text, prediction, confidence, search_results, gem
|
|
| 123 |
def search_knowledge_base(query_text, limit=5):
|
| 124 |
"""Search the knowledge base for similar entries"""
|
| 125 |
try:
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
|
|
|
|
|
|
|
|
|
| 155 |
else:
|
| 156 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
|
| 158 |
except Exception as e:
|
| 159 |
print(f"Error searching knowledge base: {e}")
|
|
@@ -213,6 +440,92 @@ def get_knowledge_base_stats():
|
|
| 213 |
# Initialize knowledge base on startup
|
| 214 |
init_knowledge_base()
|
| 215 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 216 |
CREDIBLE_SOURCES = {
|
| 217 |
'vnexpress.net': 0.95,
|
| 218 |
'tuoitre.vn': 0.95,
|
|
@@ -520,10 +833,10 @@ def analyze_source_support(news_text, search_results):
|
|
| 520 |
def analyze_with_gemini(news_text, search_results, distilbert_prediction, distilbert_confidence):
|
| 521 |
"""Use Gemini AI to analyze the news and compare with our model results"""
|
| 522 |
try:
|
| 523 |
-
# Knowledge base search
|
| 524 |
if ENABLE_KNOWLEDGE_BASE_SEARCH:
|
| 525 |
print("🔍 Searching knowledge base for similar entries...")
|
| 526 |
-
knowledge_results = search_knowledge_base(news_text, limit=
|
| 527 |
knowledge_context = format_knowledge_for_rag(knowledge_results)
|
| 528 |
else:
|
| 529 |
knowledge_context = ""
|
|
@@ -993,17 +1306,19 @@ def analyze_news(news_text):
|
|
| 993 |
real_confidence = combined_confidence
|
| 994 |
fake_confidence = 1 - combined_confidence
|
| 995 |
|
| 996 |
-
# Step 7: Check if result should be added to knowledge base
|
| 997 |
-
|
| 998 |
-
|
| 999 |
-
|
| 1000 |
-
|
|
|
|
|
|
|
| 1001 |
|
| 1002 |
# Add to knowledge base
|
| 1003 |
success = add_to_knowledge_base(
|
| 1004 |
news_text=news_text,
|
| 1005 |
prediction=final_prediction,
|
| 1006 |
-
confidence=
|
| 1007 |
search_results=search_results,
|
| 1008 |
gemini_analysis=gemini_analysis
|
| 1009 |
)
|
|
|
|
| 11 |
import sqlite3
|
| 12 |
from datetime import datetime
|
| 13 |
import hashlib
|
| 14 |
+
import io
|
| 15 |
+
import os
|
| 16 |
+
from google.oauth2.credentials import Credentials
|
| 17 |
+
from google_auth_oauthlib.flow import InstalledAppFlow
|
| 18 |
+
from google.auth.transport.requests import Request
|
| 19 |
+
from googleapiclient.discovery import build
|
| 20 |
+
from googleapiclient.http import MediaIoBaseDownload, MediaIoBaseUpload
|
| 21 |
|
| 22 |
# SECURITY: the previous revision committed a literal Google API key here.
# A leaked key must be rotated; never hard-code credentials in source.
# Read both values from the environment (Hugging Face Spaces secrets).
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY", "")
SEARCH_ENGINE_ID = os.getenv("SEARCH_ENGINE_ID", "")
|
|
|
|
| 28 |
|
| 29 |
# Knowledge Base Configuration
|
| 30 |
KNOWLEDGE_BASE_DB = "knowledge_base.db"
|
| 31 |
+
# --- Knowledge base / RAG configuration ---
CONFIDENCE_THRESHOLD = 0.95  # 95% Gemini confidence threshold for RAG knowledge base
ENABLE_KNOWLEDGE_BASE_SEARCH = True  # Enable knowledge base search with training data

# --- Cloud storage configuration ---
USE_CLOUD_STORAGE = True  # Set to True to use cloud storage instead of local DB
CLOUD_STORAGE_TYPE = "google_drive"  # Options: "google_drive", "google_cloud", "local"
GOOGLE_DRIVE_FILE_ID = None  # Will be set when file is created

# Restore the Google Drive file ID persisted by a previous run, if any.
try:
    if os.path.exists('google_drive_file_id.txt'):
        with open('google_drive_file_id.txt', 'r') as f:
            GOOGLE_DRIVE_FILE_ID = f.read().strip()
        print(f"📁 Loaded Google Drive file ID: {GOOGLE_DRIVE_FILE_ID}")
except Exception as e:
    print(f"Could not load Google Drive file ID: {e}")

GOOGLE_CLOUD_BUCKET = "your-bucket-name"  # For Google Cloud Storage
|
| 48 |
|
| 49 |
print("Loading the DistilBERT model we trained...")
|
| 50 |
try:
|
|
|
|
| 75 |
tokenizer = None
|
| 76 |
model = None
|
| 77 |
|
| 78 |
+
# --- CLOUD STORAGE FUNCTIONS ---
def get_google_drive_service():
    """Build an authenticated Google Drive v3 service, or None on failure.

    Two credential paths:
      * Hugging Face Spaces (detected via the SPACE_ID env var): credentials
        are assembled from GOOGLE_CLIENT_ID / GOOGLE_CLIENT_SECRET /
        GOOGLE_REFRESH_TOKEN secrets.
      * Local development: token.json / credentials.json files, with an
        interactive OAuth flow and token refresh when needed.

    Returns:
        A googleapiclient Drive service object, or None if credentials are
        missing or setup fails (errors are printed, never raised).
    """
    try:
        SCOPES = ['https://www.googleapis.com/auth/drive.file']
        creds = None

        # Check if running on Hugging Face Spaces.
        # (fix: dropped the redundant function-local `import os`; os is
        # already imported at module level)
        is_hf_space = os.getenv('SPACE_ID') is not None

        if is_hf_space:
            # For Hugging Face Spaces, use environment variables (secrets).
            client_id = os.getenv('GOOGLE_CLIENT_ID')
            client_secret = os.getenv('GOOGLE_CLIENT_SECRET')
            refresh_token = os.getenv('GOOGLE_REFRESH_TOKEN')

            if client_id and client_secret and refresh_token:
                creds = Credentials.from_authorized_user_info({
                    'client_id': client_id,
                    'client_secret': client_secret,
                    'refresh_token': refresh_token,
                    'token_uri': 'https://oauth2.googleapis.com/token'
                }, SCOPES)
            else:
                print("⚠️ Google Drive credentials not found in Hugging Face secrets")
                return None
        else:
            # For local development, use files.
            if os.path.exists('token.json'):
                creds = Credentials.from_authorized_user_file('token.json', SCOPES)

            # If no valid credentials, request authorization.
            # NOTE(review): indentation was lost in the pasted diff — this
            # refresh/persist flow is reconstructed as local-dev-only; on
            # Spaces the refresh token is used lazily via token_uri. Confirm
            # against the original file.
            if not creds or not creds.valid:
                if creds and creds.expired and creds.refresh_token:
                    creds.refresh(Request())
                else:
                    if os.path.exists('credentials.json'):
                        flow = InstalledAppFlow.from_client_secrets_file(
                            'credentials.json', SCOPES)
                        creds = flow.run_local_server(port=0)
                    else:
                        print("⚠️ credentials.json not found for local development")
                        return None

            # Save credentials for next run
            with open('token.json', 'w') as token:
                token.write(creds.to_json())

        return build('drive', 'v3', credentials=creds)
    except Exception as e:
        print(f"Error setting up Google Drive: {e}")
        return None
|
| 131 |
+
|
| 132 |
+
def upload_to_google_drive(data, filename="knowledge_base.json"):
    """Serialize *data* to JSON and upload it to Google Drive.

    Args:
        data: Any json.dumps-serializable object (the knowledge base list).
        filename: Name given to the file on Drive.

    Returns:
        The new Drive file ID (str), or None if the service is unavailable
        or the upload fails (errors are printed, never raised).

    NOTE(review): files().create always makes a brand-new file, so repeated
    saves accumulate copies on Drive; consider files().update with the
    stored file ID — confirm intended behavior before changing.
    """
    try:
        service = get_google_drive_service()
        if not service:
            return None

        # Convert data to JSON
        json_data = json.dumps(data, ensure_ascii=False, indent=2)
        file_metadata = {
            'name': filename,
            'parents': []  # Root folder
        }

        media = MediaIoBaseUpload(
            io.BytesIO(json_data.encode('utf-8')),
            mimetype='application/json'
        )

        file = service.files().create(
            body=file_metadata,
            media_body=media,
            fields='id'
        ).execute()

        # fix: the message printed the literal "(unknown)" instead of the
        # actual filename being uploaded.
        print(f"✅ Uploaded {filename} to Google Drive (ID: {file.get('id')})")
        return file.get('id')

    except Exception as e:
        print(f"Error uploading to Google Drive: {e}")
        return None
|
| 163 |
+
|
| 164 |
+
def download_from_google_drive(file_id):
    """Fetch the knowledge-base JSON file from Google Drive.

    Args:
        file_id: Drive file ID of the JSON knowledge base.

    Returns:
        The parsed JSON payload, or an empty list when the service is
        unavailable or the download/parse fails (errors are printed).
    """
    try:
        service = get_google_drive_service()
        if not service:
            return []

        # Stream the file contents into an in-memory buffer chunk by chunk.
        media_request = service.files().get_media(fileId=file_id)
        buffer = io.BytesIO()
        downloader = MediaIoBaseDownload(buffer, media_request)

        done = False
        while not done:
            _progress, done = downloader.next_chunk()

        buffer.seek(0)
        data = json.loads(buffer.read().decode('utf-8'))

        print(f"✅ Downloaded knowledge base from Google Drive")
        return data

    except Exception as e:
        print(f"Error downloading from Google Drive: {e}")
        return []
|
| 188 |
+
|
| 189 |
+
def save_knowledge_base_cloud(data):
    """Save the knowledge base to the configured cloud backend.

    Args:
        data: json-serializable knowledge base (list of entry dicts).

    Returns:
        True if the save succeeded, False otherwise.
    """
    if CLOUD_STORAGE_TYPE == "google_drive":
        file_id = upload_to_google_drive(data)
        if file_id:
            global GOOGLE_DRIVE_FILE_ID
            GOOGLE_DRIVE_FILE_ID = file_id
            # fix: persist the file ID to disk — startup reads
            # 'google_drive_file_id.txt' to locate the knowledge base, but
            # the ID was previously kept only in this process global, so
            # every restart lost track of the Drive file.
            try:
                with open('google_drive_file_id.txt', 'w') as f:
                    f.write(file_id)
            except Exception as e:
                print(f"Could not persist Google Drive file ID: {e}")
        return file_id is not None
    elif CLOUD_STORAGE_TYPE == "google_cloud":
        # TODO: Implement Google Cloud Storage
        print("Google Cloud Storage not implemented yet")
        return False
    return False
|
| 202 |
+
|
| 203 |
+
def load_knowledge_base_cloud():
    """Load the knowledge base from the configured cloud backend.

    Returns:
        The list of knowledge-base entries, or [] when no backend/file ID
        is available (or the backend is not implemented).
    """
    if CLOUD_STORAGE_TYPE == "google_drive" and GOOGLE_DRIVE_FILE_ID:
        return download_from_google_drive(GOOGLE_DRIVE_FILE_ID)

    if CLOUD_STORAGE_TYPE == "google_cloud":
        # TODO: Implement Google Cloud Storage
        print("Google Cloud Storage not implemented yet")
        return []

    return []
|
| 212 |
+
|
| 213 |
# --- KNOWLEDGE BASE MANAGEMENT ---
|
| 214 |
def init_knowledge_base():
|
| 215 |
"""Initialize the SQLite knowledge base"""
|
|
|
|
| 241 |
# Create content hash for deduplication
|
| 242 |
content_hash = hashlib.md5(news_text.encode('utf-8')).hexdigest()
|
| 243 |
|
| 244 |
+
if USE_CLOUD_STORAGE:
|
| 245 |
+
# Add to cloud storage
|
| 246 |
+
data = load_knowledge_base_cloud()
|
| 247 |
+
|
| 248 |
+
# Check if entry already exists
|
| 249 |
+
for entry in data:
|
| 250 |
+
if entry.get('content_hash') == content_hash:
|
| 251 |
+
print(f"Entry already exists in cloud knowledge base (hash: {content_hash[:8]}...)")
|
| 252 |
+
return False
|
| 253 |
+
|
| 254 |
+
# Create new entry
|
| 255 |
+
new_entry = {
|
| 256 |
+
'content_hash': content_hash,
|
| 257 |
+
'news_text': news_text,
|
| 258 |
+
'prediction': prediction,
|
| 259 |
+
'confidence': confidence,
|
| 260 |
+
'search_results': search_results,
|
| 261 |
+
'gemini_analysis': gemini_analysis,
|
| 262 |
+
'created_at': datetime.now().isoformat(),
|
| 263 |
+
'last_accessed': datetime.now().isoformat(),
|
| 264 |
+
'access_count': 1
|
| 265 |
+
}
|
| 266 |
+
|
| 267 |
+
# Add to data and save to cloud
|
| 268 |
+
data.append(new_entry)
|
| 269 |
+
success = save_knowledge_base_cloud(data)
|
| 270 |
+
|
| 271 |
+
if success:
|
| 272 |
+
print(f"✅ Added high-confidence result to cloud knowledge base (confidence: {confidence:.1%})")
|
| 273 |
+
print(f" Hash: {content_hash[:8]}...")
|
| 274 |
+
print(f" Prediction: {prediction}")
|
| 275 |
+
return True
|
| 276 |
+
else:
|
| 277 |
+
return False
|
| 278 |
+
else:
|
| 279 |
+
# Add to local SQLite database
|
| 280 |
+
conn = sqlite3.connect(KNOWLEDGE_BASE_DB)
|
| 281 |
+
cursor = conn.cursor()
|
| 282 |
+
|
| 283 |
+
# Check if entry already exists
|
| 284 |
+
cursor.execute('SELECT id FROM knowledge_entries WHERE content_hash = ?', (content_hash,))
|
| 285 |
+
if cursor.fetchone():
|
| 286 |
+
print(f"Entry already exists in knowledge base (hash: {content_hash[:8]}...)")
|
| 287 |
+
conn.close()
|
| 288 |
+
return False
|
| 289 |
+
|
| 290 |
+
# Insert new entry
|
| 291 |
+
cursor.execute('''
|
| 292 |
+
INSERT INTO knowledge_entries
|
| 293 |
+
(content_hash, news_text, prediction, confidence, search_results, gemini_analysis)
|
| 294 |
+
VALUES (?, ?, ?, ?, ?, ?)
|
| 295 |
+
''', (
|
| 296 |
+
content_hash,
|
| 297 |
+
news_text,
|
| 298 |
+
prediction,
|
| 299 |
+
confidence,
|
| 300 |
+
json.dumps(search_results, ensure_ascii=False),
|
| 301 |
+
gemini_analysis
|
| 302 |
+
))
|
| 303 |
+
|
| 304 |
+
conn.commit()
|
| 305 |
conn.close()
|
| 306 |
+
|
| 307 |
+
print(f"✅ Added high-confidence result to knowledge base (confidence: {confidence:.1%})")
|
| 308 |
+
print(f" Hash: {content_hash[:8]}...")
|
| 309 |
+
print(f" Prediction: {prediction}")
|
| 310 |
+
return True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 311 |
|
| 312 |
except Exception as e:
|
| 313 |
print(f"Error adding to knowledge base: {e}")
|
|
|
|
| 316 |
def search_knowledge_base(query_text, limit=5):
|
| 317 |
"""Search the knowledge base for similar entries"""
|
| 318 |
try:
|
| 319 |
+
if USE_CLOUD_STORAGE:
|
| 320 |
+
# Search in cloud storage
|
| 321 |
+
data = load_knowledge_base_cloud()
|
| 322 |
+
if not data:
|
| 323 |
+
return []
|
| 324 |
+
|
| 325 |
+
# Simple text similarity search in JSON data
|
| 326 |
+
results = []
|
| 327 |
+
query_lower = query_text[:50].lower()
|
| 328 |
+
|
| 329 |
+
for entry in data:
|
| 330 |
+
if (query_lower in entry.get('news_text', '').lower() or
|
| 331 |
+
query_lower in entry.get('gemini_analysis', '').lower()):
|
| 332 |
+
results.append((
|
| 333 |
+
entry['news_text'],
|
| 334 |
+
entry['prediction'],
|
| 335 |
+
entry['confidence'],
|
| 336 |
+
entry.get('search_results', []),
|
| 337 |
+
entry.get('gemini_analysis', ''),
|
| 338 |
+
entry.get('created_at', ''),
|
| 339 |
+
entry.get('access_count', 1)
|
| 340 |
+
))
|
| 341 |
+
|
| 342 |
+
# Sort by confidence and access count
|
| 343 |
+
results.sort(key=lambda x: (x[2], x[6]), reverse=True)
|
| 344 |
+
results = results[:limit]
|
| 345 |
+
|
| 346 |
+
if results:
|
| 347 |
+
print(f"📚 Found {len(results)} similar entries in cloud knowledge base")
|
| 348 |
+
return results
|
| 349 |
+
else:
|
| 350 |
+
return []
|
| 351 |
else:
|
| 352 |
+
# Search in local SQLite database
|
| 353 |
+
conn = sqlite3.connect(KNOWLEDGE_BASE_DB)
|
| 354 |
+
cursor = conn.cursor()
|
| 355 |
+
|
| 356 |
+
# Simple text similarity search (you can enhance this with embeddings later)
|
| 357 |
+
cursor.execute('''
|
| 358 |
+
SELECT news_text, prediction, confidence, search_results, gemini_analysis,
|
| 359 |
+
created_at, access_count
|
| 360 |
+
FROM knowledge_entries
|
| 361 |
+
WHERE news_text LIKE ? OR gemini_analysis LIKE ?
|
| 362 |
+
ORDER BY confidence DESC, access_count DESC
|
| 363 |
+
LIMIT ?
|
| 364 |
+
''', (f'%{query_text[:50]}%', f'%{query_text[:50]}%', limit))
|
| 365 |
+
|
| 366 |
+
results = cursor.fetchall()
|
| 367 |
+
|
| 368 |
+
# Update access count and last_accessed
|
| 369 |
+
for result in results:
|
| 370 |
+
cursor.execute('''
|
| 371 |
+
UPDATE knowledge_entries
|
| 372 |
+
SET access_count = access_count + 1, last_accessed = CURRENT_TIMESTAMP
|
| 373 |
+
WHERE news_text = ?
|
| 374 |
+
''', (result[0],))
|
| 375 |
+
|
| 376 |
+
conn.commit()
|
| 377 |
+
conn.close()
|
| 378 |
+
|
| 379 |
+
if results:
|
| 380 |
+
print(f"📚 Found {len(results)} similar entries in knowledge base")
|
| 381 |
+
return results
|
| 382 |
+
else:
|
| 383 |
+
return []
|
| 384 |
|
| 385 |
except Exception as e:
|
| 386 |
print(f"Error searching knowledge base: {e}")
|
|
|
|
| 440 |
# Initialize knowledge base on startup
|
| 441 |
init_knowledge_base()
|
| 442 |
|
| 443 |
+
def populate_knowledge_base_from_training_data():
    """Populate the SQLite knowledge base with existing training data.

    Reads 'train_final.csv' (columns: content, label; label 0 = REAL,
    1 = FAKE), builds a synthetic Gemini-style analysis for each row, and
    inserts it with a fixed 0.95 confidence. Duplicate rows (by MD5 content
    hash) are skipped. Runs at module import, so it executes on every boot.

    Returns:
        True on success, False if loading or inserting failed.
    """
    try:
        import pandas as pd

        # Load training data
        df = pd.read_csv('train_final.csv')
        print(f"📚 Loading {len(df)} training samples into knowledge base...")

        conn = sqlite3.connect(KNOWLEDGE_BASE_DB)
        cursor = conn.cursor()

        # perf: fetch all existing hashes once instead of issuing one
        # SELECT per CSV row (this runs on every startup).
        cursor.execute('SELECT content_hash FROM knowledge_entries')
        existing_hashes = {row[0] for row in cursor.fetchall()}

        added_count = 0
        skipped_count = 0

        for index, row in df.iterrows():
            news_text = str(row['content'])
            label = int(row['label'])
            prediction = "REAL" if label == 0 else "FAKE"

            # Create content hash for deduplication
            content_hash = hashlib.md5(news_text.encode('utf-8')).hexdigest()

            # Check if entry already exists
            if content_hash in existing_hashes:
                skipped_count += 1
                continue
            existing_hashes.add(content_hash)

            # Create synthetic analysis for training data
            synthetic_analysis = f"""1. KẾT LUẬN: {prediction}

2. ĐỘ TIN CẬY: THẬT: {95 if prediction == 'REAL' else 5}% / GIẢ: {5 if prediction == 'REAL' else 95}%

3. PHÂN TÍCH CHI TIẾT:
- Nội dung: {'Tin tức được xác minh từ nguồn đào tạo' if prediction == 'REAL' else 'Tin tức giả được xác định từ nguồn đào tạo'}
- Nguồn tin: Dữ liệu huấn luyện đã được xác minh
- Ngữ cảnh: Mẫu từ bộ dữ liệu huấn luyện DistilBERT
- Ngôn ngữ: {'Ngôn ngữ khách quan, tin cậy' if prediction == 'REAL' else 'Ngôn ngữ có dấu hiệu tin giả'}
- Thời gian: Dữ liệu huấn luyện đã được kiểm chứng

4. CÁC DẤU HIỆU CẢNH BÁO: {'Không có dấu hiệu cảnh báo' if prediction == 'REAL' else 'Tin tức được xác định là giả từ nguồn đào tạo'}

5. KHUYẾN NGHỊ CHO NGƯỜI ĐỌC:
- Nguồn: Dữ liệu huấn luyện đã được xác minh
- Độ tin cậy: Cao (từ bộ dữ liệu đào tạo)
- Lưu ý: Mẫu từ tập huấn luyện DistilBERT"""

            # Insert training sample
            cursor.execute('''
                INSERT INTO knowledge_entries
                (content_hash, news_text, prediction, confidence, search_results, gemini_analysis)
                VALUES (?, ?, ?, ?, ?, ?)
            ''', (
                content_hash,
                news_text,
                prediction,
                0.95,  # High confidence for training data
                json.dumps([], ensure_ascii=False),  # Empty search results for training data
                synthetic_analysis
            ))

            added_count += 1

            # Show progress every 1000 entries
            if added_count % 1000 == 0:
                print(f" Added {added_count} entries...")

        conn.commit()
        conn.close()

        print(f"✅ Knowledge base populated successfully!")
        print(f" 📊 Added: {added_count} entries")
        print(f" ⏭️ Skipped: {skipped_count} duplicates")
        print(f" 🎯 Total entries: {added_count}")

        return True

    except Exception as e:
        print(f"❌ Error populating knowledge base: {e}")
        return False

# Populate knowledge base with training data on startup
print("🚀 Populating knowledge base with training data...")
populate_knowledge_base_from_training_data()
|
| 528 |
+
|
| 529 |
CREDIBLE_SOURCES = {
|
| 530 |
'vnexpress.net': 0.95,
|
| 531 |
'tuoitre.vn': 0.95,
|
|
|
|
| 833 |
def analyze_with_gemini(news_text, search_results, distilbert_prediction, distilbert_confidence):
|
| 834 |
"""Use Gemini AI to analyze the news and compare with our model results"""
|
| 835 |
try:
|
| 836 |
+
# Knowledge base search with training data
|
| 837 |
if ENABLE_KNOWLEDGE_BASE_SEARCH:
|
| 838 |
print("🔍 Searching knowledge base for similar entries...")
|
| 839 |
+
knowledge_results = search_knowledge_base(news_text, limit=2) # Reduced to 2 for speed
|
| 840 |
knowledge_context = format_knowledge_for_rag(knowledge_results)
|
| 841 |
else:
|
| 842 |
knowledge_context = ""
|
|
|
|
| 1306 |
real_confidence = combined_confidence
|
| 1307 |
fake_confidence = 1 - combined_confidence
|
| 1308 |
|
| 1309 |
+
# Step 7: Check if result should be added to knowledge base (using only Gemini confidence for RAG)
|
| 1310 |
+
gemini_real_confidence, gemini_fake_confidence = extract_gemini_percentage(gemini_analysis)
|
| 1311 |
+
gemini_max_confidence = max(gemini_real_confidence, gemini_fake_confidence)
|
| 1312 |
+
|
| 1313 |
+
if gemini_max_confidence > CONFIDENCE_THRESHOLD:
|
| 1314 |
+
print(f"🚀 High Gemini confidence detected ({gemini_max_confidence:.1%}) - adding to knowledge base for RAG...")
|
| 1315 |
+
final_prediction = "REAL" if gemini_real_confidence > gemini_fake_confidence else "FAKE"
|
| 1316 |
|
| 1317 |
# Add to knowledge base
|
| 1318 |
success = add_to_knowledge_base(
|
| 1319 |
news_text=news_text,
|
| 1320 |
prediction=final_prediction,
|
| 1321 |
+
confidence=gemini_max_confidence, # Use Gemini confidence for RAG storage
|
| 1322 |
search_results=search_results,
|
| 1323 |
gemini_analysis=gemini_analysis
|
| 1324 |
)
|