NLong committed on
Commit
5c79a89
·
verified ·
1 Parent(s): 5eace42

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +385 -70
app.py CHANGED
@@ -11,6 +11,13 @@ import json
11
  import sqlite3
12
  from datetime import datetime
13
  import hashlib
 
 
 
 
 
 
 
14
 
15
  GOOGLE_API_KEY = "AIzaSyASwqVh3ELFVKH-W3WuHtmjg3XgtwjJQKg"
16
  SEARCH_ENGINE_ID = "f34f8a4816771488b"
@@ -21,8 +28,23 @@ genai.configure(api_key=GEMINI_API_KEY)
21
 
22
  # Knowledge Base Configuration
23
  KNOWLEDGE_BASE_DB = "knowledge_base.db"
24
- CONFIDENCE_THRESHOLD = 0.95 # 95% threshold for auto-updating knowledge base
25
- ENABLE_KNOWLEDGE_BASE_SEARCH = False # Set to True to enable knowledge base search (slower)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
 
27
  print("Loading the DistilBERT model we trained...")
28
  try:
@@ -53,6 +75,141 @@ except Exception as e:
53
  tokenizer = None
54
  model = None
55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  # --- KNOWLEDGE BASE MANAGEMENT ---
57
  def init_knowledge_base():
58
  """Initialize the SQLite knowledge base"""
@@ -84,37 +241,73 @@ def add_to_knowledge_base(news_text, prediction, confidence, search_results, gem
84
  # Create content hash for deduplication
85
  content_hash = hashlib.md5(news_text.encode('utf-8')).hexdigest()
86
 
87
- conn = sqlite3.connect(KNOWLEDGE_BASE_DB)
88
- cursor = conn.cursor()
89
-
90
- # Check if entry already exists
91
- cursor.execute('SELECT id FROM knowledge_entries WHERE content_hash = ?', (content_hash,))
92
- if cursor.fetchone():
93
- print(f"Entry already exists in knowledge base (hash: {content_hash[:8]}...)")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
  conn.close()
95
- return False
96
-
97
- # Insert new entry
98
- cursor.execute('''
99
- INSERT INTO knowledge_entries
100
- (content_hash, news_text, prediction, confidence, search_results, gemini_analysis)
101
- VALUES (?, ?, ?, ?, ?, ?)
102
- ''', (
103
- content_hash,
104
- news_text,
105
- prediction,
106
- confidence,
107
- json.dumps(search_results, ensure_ascii=False),
108
- gemini_analysis
109
- ))
110
-
111
- conn.commit()
112
- conn.close()
113
-
114
- print(f"✅ Added high-confidence result to knowledge base (confidence: {confidence:.1%})")
115
- print(f" Hash: {content_hash[:8]}...")
116
- print(f" Prediction: {prediction}")
117
- return True
118
 
119
  except Exception as e:
120
  print(f"Error adding to knowledge base: {e}")
@@ -123,37 +316,71 @@ def add_to_knowledge_base(news_text, prediction, confidence, search_results, gem
123
  def search_knowledge_base(query_text, limit=5):
124
  """Search the knowledge base for similar entries"""
125
  try:
126
- conn = sqlite3.connect(KNOWLEDGE_BASE_DB)
127
- cursor = conn.cursor()
128
-
129
- # Simple text similarity search (you can enhance this with embeddings later)
130
- cursor.execute('''
131
- SELECT news_text, prediction, confidence, search_results, gemini_analysis,
132
- created_at, access_count
133
- FROM knowledge_entries
134
- WHERE news_text LIKE ? OR gemini_analysis LIKE ?
135
- ORDER BY confidence DESC, access_count DESC
136
- LIMIT ?
137
- ''', (f'%{query_text[:50]}%', f'%{query_text[:50]}%', limit))
138
-
139
- results = cursor.fetchall()
140
-
141
- # Update access count and last_accessed
142
- for result in results:
143
- cursor.execute('''
144
- UPDATE knowledge_entries
145
- SET access_count = access_count + 1, last_accessed = CURRENT_TIMESTAMP
146
- WHERE news_text = ?
147
- ''', (result[0],))
148
-
149
- conn.commit()
150
- conn.close()
151
-
152
- if results:
153
- print(f"📚 Found {len(results)} similar entries in knowledge base")
154
- return results
 
 
 
155
  else:
156
- return []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
 
158
  except Exception as e:
159
  print(f"Error searching knowledge base: {e}")
@@ -213,6 +440,92 @@ def get_knowledge_base_stats():
213
  # Initialize knowledge base on startup
214
  init_knowledge_base()
215
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
216
  CREDIBLE_SOURCES = {
217
  'vnexpress.net': 0.95,
218
  'tuoitre.vn': 0.95,
@@ -520,10 +833,10 @@ def analyze_source_support(news_text, search_results):
520
  def analyze_with_gemini(news_text, search_results, distilbert_prediction, distilbert_confidence):
521
  """Use Gemini AI to analyze the news and compare with our model results"""
522
  try:
523
- # Knowledge base search (optional for faster performance)
524
  if ENABLE_KNOWLEDGE_BASE_SEARCH:
525
  print("🔍 Searching knowledge base for similar entries...")
526
- knowledge_results = search_knowledge_base(news_text, limit=3)
527
  knowledge_context = format_knowledge_for_rag(knowledge_results)
528
  else:
529
  knowledge_context = ""
@@ -993,17 +1306,19 @@ def analyze_news(news_text):
993
  real_confidence = combined_confidence
994
  fake_confidence = 1 - combined_confidence
995
 
996
- # Step 7: Check if result should be added to knowledge base
997
- max_confidence = max(real_confidence, fake_confidence)
998
- if max_confidence > CONFIDENCE_THRESHOLD:
999
- print(f"🚀 High confidence result detected ({max_confidence:.1%}) - adding to knowledge base...")
1000
- final_prediction = "REAL" if real_confidence > fake_confidence else "FAKE"
 
 
1001
 
1002
  # Add to knowledge base
1003
  success = add_to_knowledge_base(
1004
  news_text=news_text,
1005
  prediction=final_prediction,
1006
- confidence=max_confidence,
1007
  search_results=search_results,
1008
  gemini_analysis=gemini_analysis
1009
  )
 
11
  import sqlite3
12
  from datetime import datetime
13
  import hashlib
14
+ import io
15
+ import os
16
+ from google.oauth2.credentials import Credentials
17
+ from google_auth_oauthlib.flow import InstalledAppFlow
18
+ from google.auth.transport.requests import Request
19
+ from googleapiclient.discovery import build
20
+ from googleapiclient.http import MediaIoBaseDownload, MediaIoBaseUpload
21
 
22
  GOOGLE_API_KEY = "AIzaSyASwqVh3ELFVKH-W3WuHtmjg3XgtwjJQKg"
23
  SEARCH_ENGINE_ID = "f34f8a4816771488b"
 
28
 
29
  # Knowledge Base Configuration
30
  KNOWLEDGE_BASE_DB = "knowledge_base.db"
31
+ CONFIDENCE_THRESHOLD = 0.95 # 95% Gemini confidence threshold for RAG knowledge base
32
+ ENABLE_KNOWLEDGE_BASE_SEARCH = True # Enable knowledge base search with training data
33
+
34
+ # Cloud Storage Configuration
35
+ USE_CLOUD_STORAGE = True # Set to True to use cloud storage instead of local DB
36
+ CLOUD_STORAGE_TYPE = "google_drive" # Options: "google_drive", "google_cloud", "local"
37
+ GOOGLE_DRIVE_FILE_ID = None # Will be set when file is created
38
+
39
+ # Load Google Drive file ID if it exists
40
+ try:
41
+ if os.path.exists('google_drive_file_id.txt'):
42
+ with open('google_drive_file_id.txt', 'r') as f:
43
+ GOOGLE_DRIVE_FILE_ID = f.read().strip()
44
+ print(f"📁 Loaded Google Drive file ID: {GOOGLE_DRIVE_FILE_ID}")
45
+ except Exception as e:
46
+ print(f"Could not load Google Drive file ID: {e}")
47
+ GOOGLE_CLOUD_BUCKET = "your-bucket-name" # For Google Cloud Storage
48
 
49
  print("Loading the DistilBERT model we trained...")
50
  try:
 
75
  tokenizer = None
76
  model = None
77
 
78
# --- CLOUD STORAGE FUNCTIONS ---
def get_google_drive_service():
    """Build an authenticated Google Drive v3 service client.

    On Hugging Face Spaces (detected via the SPACE_ID env var) credentials
    come from the GOOGLE_CLIENT_ID / GOOGLE_CLIENT_SECRET /
    GOOGLE_REFRESH_TOKEN secrets.  For local development the standard
    token.json / credentials.json OAuth installed-app flow is used.

    Returns:
        A googleapiclient Drive v3 service object, or None when credentials
        are unavailable or setup fails.
    """
    try:
        SCOPES = ['https://www.googleapis.com/auth/drive.file']
        creds = None

        # SPACE_ID is set by the Hugging Face Spaces runtime.
        # (os is imported at module level; the original's redundant local
        # `import os` has been removed.)
        is_hf_space = os.getenv('SPACE_ID') is not None

        if is_hf_space:
            # For Hugging Face Spaces, read OAuth material from secrets.
            client_id = os.getenv('GOOGLE_CLIENT_ID')
            client_secret = os.getenv('GOOGLE_CLIENT_SECRET')
            refresh_token = os.getenv('GOOGLE_REFRESH_TOKEN')

            if client_id and client_secret and refresh_token:
                creds = Credentials.from_authorized_user_info({
                    'client_id': client_id,
                    'client_secret': client_secret,
                    'refresh_token': refresh_token,
                    'token_uri': 'https://oauth2.googleapis.com/token'
                }, SCOPES)
            else:
                print("⚠️ Google Drive credentials not found in Hugging Face secrets")
                return None
        else:
            # For local development, use credential files on disk.
            if os.path.exists('token.json'):
                creds = Credentials.from_authorized_user_file('token.json', SCOPES)

            # If no valid credentials, refresh or run the authorization flow.
            if not creds or not creds.valid:
                if creds and creds.expired and creds.refresh_token:
                    creds.refresh(Request())
                else:
                    if os.path.exists('credentials.json'):
                        flow = InstalledAppFlow.from_client_secrets_file(
                            'credentials.json', SCOPES)
                        creds = flow.run_local_server(port=0)
                    else:
                        print("⚠️ credentials.json not found for local development")
                        return None

            # Save credentials for the next run.
            # NOTE(review): placed in the local-dev branch — confirm the
            # original did not also write token.json on Spaces (writes there
            # are ephemeral anyway).
            with open('token.json', 'w') as token:
                token.write(creds.to_json())

        return build('drive', 'v3', credentials=creds)
    except Exception as e:
        print(f"Error setting up Google Drive: {e}")
        return None
131
+
132
def upload_to_google_drive(data, filename="knowledge_base.json"):
    """Serialize *data* to JSON and upload it as a new file on Google Drive.

    Note: files().create() always makes a NEW Drive file — it never updates
    an existing one — so callers must store the returned file ID.

    Args:
        data: Any JSON-serializable object (the knowledge base entries).
        filename: Name given to the file created in the Drive root folder.

    Returns:
        The new Drive file ID string, or None on failure.
    """
    try:
        service = get_google_drive_service()
        if not service:
            return None

        # Convert data to JSON (ensure_ascii=False keeps Vietnamese text readable).
        json_data = json.dumps(data, ensure_ascii=False, indent=2)
        file_metadata = {
            'name': filename,
            'parents': []  # Root folder
        }

        media = MediaIoBaseUpload(
            io.BytesIO(json_data.encode('utf-8')),
            mimetype='application/json'
        )

        file = service.files().create(
            body=file_metadata,
            media_body=media,
            fields='id'
        ).execute()

        # Bug fix: report the actual uploaded filename instead of the
        # literal "(unknown)" that appeared in the original message.
        print(f"✅ Uploaded {filename} to Google Drive (ID: {file.get('id')})")
        return file.get('id')

    except Exception as e:
        print(f"Error uploading to Google Drive: {e}")
        return None
163
+
164
def download_from_google_drive(file_id):
    """Fetch the knowledge base JSON file from Google Drive and parse it.

    Args:
        file_id: Drive file ID of the stored knowledge base JSON file.

    Returns:
        The parsed JSON payload, or [] when no Drive service is available
        or the download/parse fails.
    """
    try:
        service = get_google_drive_service()
        if not service:
            return []

        buffer = io.BytesIO()
        downloader = MediaIoBaseDownload(buffer, service.files().get_media(fileId=file_id))

        # Pull the file down chunk by chunk until Drive reports completion.
        finished = False
        while not finished:
            _status, finished = downloader.next_chunk()

        buffer.seek(0)
        payload = json.loads(buffer.read().decode('utf-8'))

        print(f"✅ Downloaded knowledge base from Google Drive")
        return payload

    except Exception as e:
        print(f"Error downloading from Google Drive: {e}")
        return []
188
+
189
def save_knowledge_base_cloud(data):
    """Persist the knowledge base entries to the configured cloud backend.

    Args:
        data: The full list of knowledge base entry dicts.

    Returns:
        True on success, False otherwise (including unimplemented backends).
    """
    global GOOGLE_DRIVE_FILE_ID  # declare once, at the top of the scope

    if CLOUD_STORAGE_TYPE == "google_drive":
        file_id = upload_to_google_drive(data)
        if file_id:
            GOOGLE_DRIVE_FILE_ID = file_id
            # Bug fix: persist the new ID so the startup loader that reads
            # google_drive_file_id.txt picks up the latest upload after a
            # restart.  Each upload creates a brand-new Drive file, so
            # without this the process would keep reloading a stale file.
            try:
                with open('google_drive_file_id.txt', 'w') as f:
                    f.write(file_id)
            except Exception as e:
                print(f"Could not persist Google Drive file ID: {e}")
        return file_id is not None
    elif CLOUD_STORAGE_TYPE == "google_cloud":
        # TODO: Implement Google Cloud Storage
        print("Google Cloud Storage not implemented yet")
        return False
    return False
202
+
203
def load_knowledge_base_cloud():
    """Load all knowledge base entries from the configured cloud backend.

    Returns:
        The list of entry dicts, or [] when no usable backend/file ID exists.
    """
    # Guard-clause form: handle the unimplemented backend first.
    if CLOUD_STORAGE_TYPE == "google_cloud":
        # TODO: Implement Google Cloud Storage
        print("Google Cloud Storage not implemented yet")
        return []

    if CLOUD_STORAGE_TYPE == "google_drive" and GOOGLE_DRIVE_FILE_ID:
        return download_from_google_drive(GOOGLE_DRIVE_FILE_ID)

    return []
212
+
213
  # --- KNOWLEDGE BASE MANAGEMENT ---
214
  def init_knowledge_base():
215
  """Initialize the SQLite knowledge base"""
 
241
  # Create content hash for deduplication
242
  content_hash = hashlib.md5(news_text.encode('utf-8')).hexdigest()
243
 
244
+ if USE_CLOUD_STORAGE:
245
+ # Add to cloud storage
246
+ data = load_knowledge_base_cloud()
247
+
248
+ # Check if entry already exists
249
+ for entry in data:
250
+ if entry.get('content_hash') == content_hash:
251
+ print(f"Entry already exists in cloud knowledge base (hash: {content_hash[:8]}...)")
252
+ return False
253
+
254
+ # Create new entry
255
+ new_entry = {
256
+ 'content_hash': content_hash,
257
+ 'news_text': news_text,
258
+ 'prediction': prediction,
259
+ 'confidence': confidence,
260
+ 'search_results': search_results,
261
+ 'gemini_analysis': gemini_analysis,
262
+ 'created_at': datetime.now().isoformat(),
263
+ 'last_accessed': datetime.now().isoformat(),
264
+ 'access_count': 1
265
+ }
266
+
267
+ # Add to data and save to cloud
268
+ data.append(new_entry)
269
+ success = save_knowledge_base_cloud(data)
270
+
271
+ if success:
272
+ print(f"✅ Added high-confidence result to cloud knowledge base (confidence: {confidence:.1%})")
273
+ print(f" Hash: {content_hash[:8]}...")
274
+ print(f" Prediction: {prediction}")
275
+ return True
276
+ else:
277
+ return False
278
+ else:
279
+ # Add to local SQLite database
280
+ conn = sqlite3.connect(KNOWLEDGE_BASE_DB)
281
+ cursor = conn.cursor()
282
+
283
+ # Check if entry already exists
284
+ cursor.execute('SELECT id FROM knowledge_entries WHERE content_hash = ?', (content_hash,))
285
+ if cursor.fetchone():
286
+ print(f"Entry already exists in knowledge base (hash: {content_hash[:8]}...)")
287
+ conn.close()
288
+ return False
289
+
290
+ # Insert new entry
291
+ cursor.execute('''
292
+ INSERT INTO knowledge_entries
293
+ (content_hash, news_text, prediction, confidence, search_results, gemini_analysis)
294
+ VALUES (?, ?, ?, ?, ?, ?)
295
+ ''', (
296
+ content_hash,
297
+ news_text,
298
+ prediction,
299
+ confidence,
300
+ json.dumps(search_results, ensure_ascii=False),
301
+ gemini_analysis
302
+ ))
303
+
304
+ conn.commit()
305
  conn.close()
306
+
307
+ print(f"✅ Added high-confidence result to knowledge base (confidence: {confidence:.1%})")
308
+ print(f" Hash: {content_hash[:8]}...")
309
+ print(f" Prediction: {prediction}")
310
+ return True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
311
 
312
  except Exception as e:
313
  print(f"Error adding to knowledge base: {e}")
 
316
  def search_knowledge_base(query_text, limit=5):
317
  """Search the knowledge base for similar entries"""
318
  try:
319
+ if USE_CLOUD_STORAGE:
320
+ # Search in cloud storage
321
+ data = load_knowledge_base_cloud()
322
+ if not data:
323
+ return []
324
+
325
+ # Simple text similarity search in JSON data
326
+ results = []
327
+ query_lower = query_text[:50].lower()
328
+
329
+ for entry in data:
330
+ if (query_lower in entry.get('news_text', '').lower() or
331
+ query_lower in entry.get('gemini_analysis', '').lower()):
332
+ results.append((
333
+ entry['news_text'],
334
+ entry['prediction'],
335
+ entry['confidence'],
336
+ entry.get('search_results', []),
337
+ entry.get('gemini_analysis', ''),
338
+ entry.get('created_at', ''),
339
+ entry.get('access_count', 1)
340
+ ))
341
+
342
+ # Sort by confidence and access count
343
+ results.sort(key=lambda x: (x[2], x[6]), reverse=True)
344
+ results = results[:limit]
345
+
346
+ if results:
347
+ print(f"📚 Found {len(results)} similar entries in cloud knowledge base")
348
+ return results
349
+ else:
350
+ return []
351
  else:
352
+ # Search in local SQLite database
353
+ conn = sqlite3.connect(KNOWLEDGE_BASE_DB)
354
+ cursor = conn.cursor()
355
+
356
+ # Simple text similarity search (you can enhance this with embeddings later)
357
+ cursor.execute('''
358
+ SELECT news_text, prediction, confidence, search_results, gemini_analysis,
359
+ created_at, access_count
360
+ FROM knowledge_entries
361
+ WHERE news_text LIKE ? OR gemini_analysis LIKE ?
362
+ ORDER BY confidence DESC, access_count DESC
363
+ LIMIT ?
364
+ ''', (f'%{query_text[:50]}%', f'%{query_text[:50]}%', limit))
365
+
366
+ results = cursor.fetchall()
367
+
368
+ # Update access count and last_accessed
369
+ for result in results:
370
+ cursor.execute('''
371
+ UPDATE knowledge_entries
372
+ SET access_count = access_count + 1, last_accessed = CURRENT_TIMESTAMP
373
+ WHERE news_text = ?
374
+ ''', (result[0],))
375
+
376
+ conn.commit()
377
+ conn.close()
378
+
379
+ if results:
380
+ print(f"📚 Found {len(results)} similar entries in knowledge base")
381
+ return results
382
+ else:
383
+ return []
384
 
385
  except Exception as e:
386
  print(f"Error searching knowledge base: {e}")
 
440
  # Initialize knowledge base on startup
441
  init_knowledge_base()
442
 
443
def populate_knowledge_base_from_training_data():
    """Seed the local SQLite knowledge base with labelled training samples.

    Reads train_final.csv (columns: content, label; label 0 = REAL,
    1 = FAKE), deduplicates rows by MD5 content hash, and inserts each new
    sample with a fixed 0.95 confidence plus a synthetic Gemini-style
    analysis so the RAG search has material to match against.

    NOTE(review): this always writes to the local SQLite DB, even when
    USE_CLOUD_STORAGE is enabled — confirm that is intentional.

    Returns:
        True when the population pass completed, False on any error
        (e.g. the CSV file is missing).
    """
    conn = None
    try:
        import pandas as pd

        # Load training data
        df = pd.read_csv('train_final.csv')
        print(f"📚 Loading {len(df)} training samples into knowledge base...")

        conn = sqlite3.connect(KNOWLEDGE_BASE_DB)
        cursor = conn.cursor()

        added_count = 0
        skipped_count = 0

        # The iterrows() index is unused — discard it.
        for _, row in df.iterrows():
            news_text = str(row['content'])
            label = int(row['label'])
            prediction = "REAL" if label == 0 else "FAKE"

            # Content hash for deduplication
            content_hash = hashlib.md5(news_text.encode('utf-8')).hexdigest()

            # Skip rows already present in the knowledge base.
            cursor.execute('SELECT id FROM knowledge_entries WHERE content_hash = ?', (content_hash,))
            if cursor.fetchone():
                skipped_count += 1
                continue

            # Synthetic analysis for training data (Vietnamese, mirrors the
            # structure of real Gemini analyses stored in the same column).
            synthetic_analysis = f"""1. KẾT LUẬN: {prediction}

2. ĐỘ TIN CẬY: THẬT: {95 if prediction == 'REAL' else 5}% / GIẢ: {5 if prediction == 'REAL' else 95}%

3. PHÂN TÍCH CHI TIẾT:
- Nội dung: {'Tin tức được xác minh từ nguồn đào tạo' if prediction == 'REAL' else 'Tin tức giả được xác định từ nguồn đào tạo'}
- Nguồn tin: Dữ liệu huấn luyện đã được xác minh
- Ngữ cảnh: Mẫu từ bộ dữ liệu huấn luyện DistilBERT
- Ngôn ngữ: {'Ngôn ngữ khách quan, tin cậy' if prediction == 'REAL' else 'Ngôn ngữ có dấu hiệu tin giả'}
- Thời gian: Dữ liệu huấn luyện đã được kiểm chứng

4. CÁC DẤU HIỆU CẢNH BÁO: {'Không có dấu hiệu cảnh báo' if prediction == 'REAL' else 'Tin tức được xác định là giả từ nguồn đào tạo'}

5. KHUYẾN NGHỊ CHO NGƯỜI ĐỌC:
- Nguồn: Dữ liệu huấn luyện đã được xác minh
- Độ tin cậy: Cao (từ bộ dữ liệu đào tạo)
- Lưu ý: Mẫu từ tập huấn luyện DistilBERT"""

            # Insert training sample
            cursor.execute('''
                INSERT INTO knowledge_entries
                (content_hash, news_text, prediction, confidence, search_results, gemini_analysis)
                VALUES (?, ?, ?, ?, ?, ?)
            ''', (
                content_hash,
                news_text,
                prediction,
                0.95,  # High confidence for training data
                json.dumps([], ensure_ascii=False),  # Empty search results for training data
                synthetic_analysis
            ))

            added_count += 1

            # Show progress every 1000 entries
            if added_count % 1000 == 0:
                print(f" Added {added_count} entries...")

        conn.commit()

        print(f"✅ Knowledge base populated successfully!")
        print(f" 📊 Added: {added_count} entries")
        print(f" ⏭️ Skipped: {skipped_count} duplicates")
        print(f" 🎯 Total entries: {added_count}")

        return True

    except Exception as e:
        print(f"❌ Error populating knowledge base: {e}")
        return False
    finally:
        # Bug fix: close the connection even when an exception interrupts
        # the loop — the original leaked the open connection on error.
        if conn is not None:
            conn.close()
+ return False
524
+
525
+ # Populate knowledge base with training data on startup
526
+ print("🚀 Populating knowledge base with training data...")
527
+ populate_knowledge_base_from_training_data()
528
+
529
  CREDIBLE_SOURCES = {
530
  'vnexpress.net': 0.95,
531
  'tuoitre.vn': 0.95,
 
833
  def analyze_with_gemini(news_text, search_results, distilbert_prediction, distilbert_confidence):
834
  """Use Gemini AI to analyze the news and compare with our model results"""
835
  try:
836
+ # Knowledge base search with training data
837
  if ENABLE_KNOWLEDGE_BASE_SEARCH:
838
  print("🔍 Searching knowledge base for similar entries...")
839
+ knowledge_results = search_knowledge_base(news_text, limit=2) # Reduced to 2 for speed
840
  knowledge_context = format_knowledge_for_rag(knowledge_results)
841
  else:
842
  knowledge_context = ""
 
1306
  real_confidence = combined_confidence
1307
  fake_confidence = 1 - combined_confidence
1308
 
1309
+ # Step 7: Check if result should be added to knowledge base (using only Gemini confidence for RAG)
1310
+ gemini_real_confidence, gemini_fake_confidence = extract_gemini_percentage(gemini_analysis)
1311
+ gemini_max_confidence = max(gemini_real_confidence, gemini_fake_confidence)
1312
+
1313
+ if gemini_max_confidence > CONFIDENCE_THRESHOLD:
1314
+ print(f"🚀 High Gemini confidence detected ({gemini_max_confidence:.1%}) - adding to knowledge base for RAG...")
1315
+ final_prediction = "REAL" if gemini_real_confidence > gemini_fake_confidence else "FAKE"
1316
 
1317
  # Add to knowledge base
1318
  success = add_to_knowledge_base(
1319
  news_text=news_text,
1320
  prediction=final_prediction,
1321
+ confidence=gemini_max_confidence, # Use Gemini confidence for RAG storage
1322
  search_results=search_results,
1323
  gemini_analysis=gemini_analysis
1324
  )