import gradio as gr
import google.generativeai as genai
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import re
import os
import io
import json
import sqlite3
import hashlib
import numpy as np
from datetime import datetime
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload, MediaIoBaseUpload

# NOTE: API keys are read from environment variables (e.g. Hugging Face Spaces
# secrets). Never hardcode API keys in source code.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
SEARCH_ENGINE_ID = "f34f8a4816771488b"
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
MODEL_PATH = "./vietnamese_fake_news_model"

genai.configure(api_key=GEMINI_API_KEY)
# Knowledge Base Configuration
KNOWLEDGE_BASE_DB = "knowledge_base.db"
CONFIDENCE_THRESHOLD = 0.95  # 95% Gemini confidence threshold for the RAG knowledge base
ENABLE_KNOWLEDGE_BASE_SEARCH = True  # Enable knowledge base search with training data

# Enhanced RAG System Configuration
ENABLE_ENHANCED_RAG = True  # Enable enhanced RAG system for Google Drive
RAG_CONFIDENCE_THRESHOLD = 0.95  # 95% threshold for saving to RAG

# Cloud Storage Configuration
USE_CLOUD_STORAGE = False  # Disabled - using the Enhanced RAG system instead (no duplicates)
CLOUD_STORAGE_TYPE = "google_drive"  # Options: "google_drive", "google_cloud", "local"
GOOGLE_DRIVE_FILE_ID = None  # Will be set when the file is created

# Load the Google Drive file ID if it exists
try:
    if os.path.exists('google_drive_file_id.txt'):
        with open('google_drive_file_id.txt', 'r') as f:
            GOOGLE_DRIVE_FILE_ID = f.read().strip()
        print(f"📁 Loaded Google Drive file ID: {GOOGLE_DRIVE_FILE_ID}")
except Exception as e:
    print(f"Could not load Google Drive file ID: {e}")

GOOGLE_CLOUD_BUCKET = "your-bucket-name"  # For Google Cloud Storage
| print("Loading the DistilBERT model we trained...") | |
| try: | |
| if os.path.exists(MODEL_PATH): | |
| tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH) | |
| model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH) | |
| print("DistilBERT model loaded successfully!") | |
| else: | |
| print(f"Model directory '{MODEL_PATH}' not found!") | |
| print("Our custom model isn't available, trying a backup model...") | |
| try: | |
| tokenizer = AutoTokenizer.from_pretrained("distilbert-base-multilingual-cased") | |
| model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-multilingual-cased", num_labels=2) | |
| print("Fallback DistilBERT model loaded successfully!") | |
| except Exception as fallback_error: | |
| print(f"Fallback model also failed: {fallback_error}") | |
| tokenizer = None | |
| model = None | |
| except Exception as e: | |
| print(f"Error loading DistilBERT model: {e}") | |
| print("Something went wrong, trying the backup model...") | |
| try: | |
| tokenizer = AutoTokenizer.from_pretrained("distilbert-base-multilingual-cased") | |
| model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-multilingual-cased", num_labels=2) | |
| print("Fallback DistilBERT model loaded successfully!") | |
| except Exception as fallback_error: | |
| print(f"Fallback model also failed: {fallback_error}") | |
| tokenizer = None | |
| model = None | |
# --- CLOUD STORAGE FUNCTIONS ---
def get_google_drive_service():
    """Get an authenticated Google Drive service (works on Hugging Face Spaces and locally)."""
    try:
        SCOPES = ['https://www.googleapis.com/auth/drive.file']
        creds = None
        # Check if running on Hugging Face Spaces
        is_hf_space = os.getenv('SPACE_ID') is not None
        if is_hf_space:
            # On Hugging Face Spaces, read the OAuth credentials from environment variables
            client_id = os.getenv('GOOGLE_CLIENT_ID')
            client_secret = os.getenv('GOOGLE_CLIENT_SECRET')
            refresh_token = os.getenv('GOOGLE_REFRESH_TOKEN')
            if client_id and client_secret and refresh_token:
                creds = Credentials.from_authorized_user_info({
                    'client_id': client_id,
                    'client_secret': client_secret,
                    'refresh_token': refresh_token,
                    'token_uri': 'https://oauth2.googleapis.com/token'
                }, SCOPES)
            else:
                print("⚠️ Google Drive credentials not found in Hugging Face secrets")
                return None
        else:
            # For local development, use credential files
            if os.path.exists('token.json'):
                creds = Credentials.from_authorized_user_file('token.json', SCOPES)
            # If there are no valid credentials, request authorization
            if not creds or not creds.valid:
                if creds and creds.expired and creds.refresh_token:
                    creds.refresh(Request())
                else:
                    if os.path.exists('credentials.json'):
                        flow = InstalledAppFlow.from_client_secrets_file(
                            'credentials.json', SCOPES)
                        creds = flow.run_local_server(port=0)
                    else:
                        print("⚠️ credentials.json not found for local development")
                        return None
                # Save the credentials for the next run
                with open('token.json', 'w') as token:
                    token.write(creds.to_json())
        return build('drive', 'v3', credentials=creds)
    except Exception as e:
        print(f"Error setting up Google Drive: {e}")
        return None
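
# The GOOGLE_REFRESH_TOKEN secret used above can be generated once on a local
# machine and then copied into the Space's settings. A minimal sketch, assuming
# a valid credentials.json for the same OAuth client (this helper is
# illustrative and not part of the original pipeline; run it locally, not on
# Spaces):
def print_refresh_token_for_spaces():
    """Hypothetical helper: run locally to obtain a refresh token for HF secrets."""
    scopes = ['https://www.googleapis.com/auth/drive.file']
    flow = InstalledAppFlow.from_client_secrets_file('credentials.json', scopes)
    creds = flow.run_local_server(port=0)
    # Copy these three values into GOOGLE_CLIENT_ID / GOOGLE_CLIENT_SECRET /
    # GOOGLE_REFRESH_TOKEN in the Space settings.
    print("client_id:", creds.client_id)
    print("client_secret:", creds.client_secret)
    print("refresh_token:", creds.refresh_token)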
def upload_to_google_drive(data, filename="knowledge_base.json"):
    """Upload knowledge base data to Google Drive."""
    try:
        service = get_google_drive_service()
        if not service:
            return None
        # Convert the data to JSON
        json_data = json.dumps(data, ensure_ascii=False, indent=2)
        media = MediaIoBaseUpload(
            io.BytesIO(json_data.encode('utf-8')),
            mimetype='application/json'
        )
        # Update the existing file in place when we already have its ID, so
        # repeated saves don't pile up duplicate files in Drive.
        if GOOGLE_DRIVE_FILE_ID:
            file = service.files().update(
                fileId=GOOGLE_DRIVE_FILE_ID,
                media_body=media,
                fields='id'
            ).execute()
            print(f"✅ Updated {filename} on Google Drive (ID: {file.get('id')})")
        else:
            file_metadata = {
                'name': filename,
                'parents': []  # Root folder
            }
            file = service.files().create(
                body=file_metadata,
                media_body=media,
                fields='id'
            ).execute()
            print(f"✅ Uploaded {filename} to Google Drive (ID: {file.get('id')})")
        return file.get('id')
    except Exception as e:
        print(f"Error uploading to Google Drive: {e}")
        return None
def download_from_google_drive(file_id):
    """Download knowledge base data from Google Drive."""
    try:
        service = get_google_drive_service()
        if not service:
            return []
        request = service.files().get_media(fileId=file_id)
        file_content = io.BytesIO()
        downloader = MediaIoBaseDownload(file_content, request)
        done = False
        while not done:
            status, done = downloader.next_chunk()
        file_content.seek(0)
        data = json.loads(file_content.read().decode('utf-8'))
        print("✅ Downloaded knowledge base from Google Drive")
        return data
    except Exception as e:
        print(f"Error downloading from Google Drive: {e}")
        return []
def save_knowledge_base_cloud(data):
    """Save the knowledge base to cloud storage."""
    global GOOGLE_DRIVE_FILE_ID
    if CLOUD_STORAGE_TYPE == "google_drive":
        file_id = upload_to_google_drive(data)
        if file_id:
            GOOGLE_DRIVE_FILE_ID = file_id
        return file_id is not None
    elif CLOUD_STORAGE_TYPE == "google_cloud":
        # TODO: Implement Google Cloud Storage
        print("Google Cloud Storage not implemented yet")
        return False
    return False

def load_knowledge_base_cloud():
    """Load the knowledge base from cloud storage."""
    if CLOUD_STORAGE_TYPE == "google_drive" and GOOGLE_DRIVE_FILE_ID:
        return download_from_google_drive(GOOGLE_DRIVE_FILE_ID)
    elif CLOUD_STORAGE_TYPE == "google_cloud":
        # TODO: Implement Google Cloud Storage
        print("Google Cloud Storage not implemented yet")
        return []
    return []
# --- KNOWLEDGE BASE MANAGEMENT ---
def init_knowledge_base():
    """Initialize the SQLite knowledge base."""
    conn = sqlite3.connect(KNOWLEDGE_BASE_DB)
    cursor = conn.cursor()
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS knowledge_entries (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            content_hash TEXT UNIQUE,
            news_text TEXT,
            prediction TEXT,
            confidence REAL,
            search_results TEXT,
            gemini_analysis TEXT,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            last_accessed TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            access_count INTEGER DEFAULT 1
        )
    ''')
    conn.commit()
    conn.close()
    print("Knowledge base initialized successfully!")
def add_to_knowledge_base(news_text, prediction, confidence, search_results, gemini_analysis):
    """Add a high-confidence result to the knowledge base."""
    try:
        # Create a content hash for deduplication
        content_hash = hashlib.md5(news_text.encode('utf-8')).hexdigest()
        if USE_CLOUD_STORAGE:
            # Add to cloud storage
            data = load_knowledge_base_cloud()
            # Check if the entry already exists
            for entry in data:
                if entry.get('content_hash') == content_hash:
                    print(f"Entry already exists in cloud knowledge base (hash: {content_hash[:8]}...)")
                    return False
            # Create a new entry
            new_entry = {
                'content_hash': content_hash,
                'news_text': news_text,
                'prediction': prediction,
                'confidence': confidence,
                'search_results': search_results,
                'gemini_analysis': gemini_analysis,
                'created_at': datetime.now().isoformat(),
                'last_accessed': datetime.now().isoformat(),
                'access_count': 1
            }
            # Append the entry and save it to the cloud
            data.append(new_entry)
            success = save_knowledge_base_cloud(data)
            if success:
                print(f"✅ Added high-confidence result to cloud knowledge base (confidence: {confidence:.1%})")
                print(f"   Hash: {content_hash[:8]}...")
                print(f"   Prediction: {prediction}")
                return True
            else:
                return False
        else:
            # Add to the local SQLite database
            conn = sqlite3.connect(KNOWLEDGE_BASE_DB)
            cursor = conn.cursor()
            # Check if the entry already exists
            cursor.execute('SELECT id FROM knowledge_entries WHERE content_hash = ?', (content_hash,))
            if cursor.fetchone():
                print(f"Entry already exists in knowledge base (hash: {content_hash[:8]}...)")
                conn.close()
                return False
            # Insert the new entry
            cursor.execute('''
                INSERT INTO knowledge_entries
                (content_hash, news_text, prediction, confidence, search_results, gemini_analysis)
                VALUES (?, ?, ?, ?, ?, ?)
            ''', (
                content_hash,
                news_text,
                prediction,
                confidence,
                json.dumps(search_results, ensure_ascii=False),
                gemini_analysis
            ))
            conn.commit()
            conn.close()
            print(f"✅ Added high-confidence result to knowledge base (confidence: {confidence:.1%})")
            print(f"   Hash: {content_hash[:8]}...")
            print(f"   Prediction: {prediction}")
            return True
    except Exception as e:
        print(f"Error adding to knowledge base: {e}")
        return False
def search_knowledge_base(query_text, limit=5):
    """Search the knowledge base for similar entries."""
    try:
        if USE_CLOUD_STORAGE:
            # Search in cloud storage
            data = load_knowledge_base_cloud()
            if not data:
                return []
            # Simple substring similarity search over the JSON data
            results = []
            query_lower = query_text[:50].lower()
            for entry in data:
                if (query_lower in entry.get('news_text', '').lower() or
                        query_lower in entry.get('gemini_analysis', '').lower()):
                    results.append((
                        entry['news_text'],
                        entry['prediction'],
                        entry['confidence'],
                        entry.get('search_results', []),
                        entry.get('gemini_analysis', ''),
                        entry.get('created_at', ''),
                        entry.get('access_count', 1)
                    ))
            # Sort by confidence and access count
            results.sort(key=lambda x: (x[2], x[6]), reverse=True)
            results = results[:limit]
            if results:
                print(f"📚 Found {len(results)} similar entries in cloud knowledge base")
            return results
        else:
            # Search in the local SQLite database
            conn = sqlite3.connect(KNOWLEDGE_BASE_DB)
            cursor = conn.cursor()
            # Simple text similarity search (this could be enhanced with embeddings later)
            cursor.execute('''
                SELECT news_text, prediction, confidence, search_results, gemini_analysis,
                       created_at, access_count
                FROM knowledge_entries
                WHERE news_text LIKE ? OR gemini_analysis LIKE ?
                ORDER BY confidence DESC, access_count DESC
                LIMIT ?
            ''', (f'%{query_text[:50]}%', f'%{query_text[:50]}%', limit))
            results = cursor.fetchall()
            # Update access_count and last_accessed for each hit
            for result in results:
                cursor.execute('''
                    UPDATE knowledge_entries
                    SET access_count = access_count + 1, last_accessed = CURRENT_TIMESTAMP
                    WHERE news_text = ?
                ''', (result[0],))
            conn.commit()
            conn.close()
            if results:
                print(f"📚 Found {len(results)} similar entries in knowledge base")
            return results
    except Exception as e:
        print(f"Error searching knowledge base: {e}")
        return []
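
# The LIKE-based search above only matches literal substrings. A minimal sketch
# of the "embeddings later" upgrade hinted at in the comment, using a plain
# bag-of-words cosine similarity so it needs no extra dependencies (a real
# system would swap in sentence embeddings; this helper is an illustration and
# is not wired into the pipeline):
def _cosine_similarity_sketch(text_a, text_b):
    """Hypothetical helper: cosine similarity over word-count vectors."""
    from collections import Counter
    counts_a, counts_b = Counter(text_a.lower().split()), Counter(text_b.lower().split())
    vocab = sorted(set(counts_a) | set(counts_b))
    vec_a = np.array([counts_a[w] for w in vocab], dtype=float)
    vec_b = np.array([counts_b[w] for w in vocab], dtype=float)
    norm = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    return float(vec_a @ vec_b / norm) if norm else 0.0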
def format_knowledge_for_rag(knowledge_results):
    """Format knowledge base results for RAG augmentation."""
    if not knowledge_results:
        return ""
    knowledge_summary = "\n=== KIẾN THỨC TƯƠNG TỰ TỪ CƠ SỞ DỮ LIỆU ===\n"
    for i, (news_text, prediction, confidence, search_results, gemini_analysis, created_at, access_count) in enumerate(knowledge_results, 1):
        knowledge_summary += f"\n{i}. Tin tức tương tự (Độ tin cậy: {confidence:.1%}, Lần truy cập: {access_count}):\n"
        knowledge_summary += f"   Nội dung: {news_text[:200]}...\n"
        knowledge_summary += f"   Kết luận: {prediction}\n"
        knowledge_summary += f"   Thời gian: {created_at}\n"
    knowledge_summary += "\n==========================================\n"
    return knowledge_summary
def get_knowledge_base_stats():
    """Get statistics about the knowledge base."""
    try:
        conn = sqlite3.connect(KNOWLEDGE_BASE_DB)
        cursor = conn.cursor()
        # Total number of entries
        cursor.execute('SELECT COUNT(*) FROM knowledge_entries')
        total_entries = cursor.fetchone()[0]
        # Entries grouped by prediction
        cursor.execute('SELECT prediction, COUNT(*) FROM knowledge_entries GROUP BY prediction')
        prediction_counts = dict(cursor.fetchall())
        # Average confidence
        cursor.execute('SELECT AVG(confidence) FROM knowledge_entries')
        avg_confidence = cursor.fetchone()[0] or 0
        # Most-accessed entries
        cursor.execute('SELECT news_text, access_count FROM knowledge_entries ORDER BY access_count DESC LIMIT 3')
        top_accessed = cursor.fetchall()
        conn.close()
        return {
            'total_entries': total_entries,
            'prediction_counts': prediction_counts,
            'avg_confidence': avg_confidence,
            'top_accessed': top_accessed
        }
    except Exception as e:
        print(f"Error getting knowledge base stats: {e}")
        return None
# Initialize the knowledge base on startup
init_knowledge_base()

# Initialize the Enhanced RAG System
if ENABLE_ENHANCED_RAG:
    try:
        from rag_news_manager import initialize_rag_system
        print("🚀 Initializing Enhanced RAG System...")
        if initialize_rag_system():
            print("✅ Enhanced RAG System initialized successfully!")
        else:
            print("⚠️ Enhanced RAG System initialization failed")
            print("   📋 This usually means Google Drive authentication needs to be refreshed")
            print("   🔧 To fix this, run: python setup_google_drive_rag.py")
            print("   📝 Or check whether credentials.json and token.json are valid")
            print("   ⚠️ The system will continue, but high-confidence news won't be saved to Google Drive")
            # Don't disable - allow retry during runtime
            # ENABLE_ENHANCED_RAG = False
    except ImportError as e:
        print(f"⚠️ Enhanced RAG System not available: {e}")
        print("   📋 Make sure rag_news_manager.py is in the same directory")
        # Don't disable - the module may become available later
        # ENABLE_ENHANCED_RAG = False
    except Exception as e:
        print(f"⚠️ Enhanced RAG System initialization error: {e}")
        error_msg = str(e).lower()
        if "invalid_grant" in error_msg or "bad request" in error_msg:
            print("   🔑 Google Drive token expired or invalid")
            print("   🔧 To fix: run 'python setup_google_drive_rag.py' to re-authenticate")
        elif "credentials" in error_msg:
            print("   🔑 Google Drive credentials issue")
            print("   🔧 To fix: make sure credentials.json exists and is valid")
        print("   ⚠️ The system will continue, but RAG saving may not work until this is fixed")
        # Don't disable - allow retry during runtime
        # ENABLE_ENHANCED_RAG = False
def populate_knowledge_base_from_training_data():
    """Populate the knowledge base with existing training data."""
    try:
        import pandas as pd
        # Load the training data
        df = pd.read_csv('train_final.csv')
        print(f"📚 Loading {len(df)} training samples into knowledge base...")
        conn = sqlite3.connect(KNOWLEDGE_BASE_DB)
        cursor = conn.cursor()
        added_count = 0
        skipped_count = 0
        for index, row in df.iterrows():
            news_text = str(row['content'])
            label = int(row['label'])
            prediction = "REAL" if label == 0 else "FAKE"
            # Create a content hash for deduplication
            content_hash = hashlib.md5(news_text.encode('utf-8')).hexdigest()
            # Check if the entry already exists
            cursor.execute('SELECT id FROM knowledge_entries WHERE content_hash = ?', (content_hash,))
            if cursor.fetchone():
                skipped_count += 1
                continue
            # Create a synthetic analysis for the training sample
            synthetic_analysis = f"""1. KẾT LUẬN: {prediction}
2. ĐỘ TIN CẬY: THẬT: {95 if prediction == 'REAL' else 5}% / GIẢ: {5 if prediction == 'REAL' else 95}%
3. PHÂN TÍCH CHI TIẾT:
- Nội dung: {'Tin tức được xác minh từ nguồn đào tạo' if prediction == 'REAL' else 'Tin tức giả được xác định từ nguồn đào tạo'}
- Nguồn tin: Dữ liệu huấn luyện đã được xác minh
- Ngữ cảnh: Mẫu từ bộ dữ liệu huấn luyện DistilBERT
- Ngôn ngữ: {'Ngôn ngữ khách quan, tin cậy' if prediction == 'REAL' else 'Ngôn ngữ có dấu hiệu tin giả'}
- Thời gian: Dữ liệu huấn luyện đã được kiểm chứng
4. CÁC DẤU HIỆU CẢNH BÁO: {'Không có dấu hiệu cảnh báo' if prediction == 'REAL' else 'Tin tức được xác định là giả từ nguồn đào tạo'}
5. KHUYẾN NGHỊ CHO NGƯỜI ĐỌC:
- Nguồn: Dữ liệu huấn luyện đã được xác minh
- Độ tin cậy: Cao (từ bộ dữ liệu đào tạo)
- Lưu ý: Mẫu từ tập huấn luyện DistilBERT"""
            # Insert the training sample
            cursor.execute('''
                INSERT INTO knowledge_entries
                (content_hash, news_text, prediction, confidence, search_results, gemini_analysis)
                VALUES (?, ?, ?, ?, ?, ?)
            ''', (
                content_hash,
                news_text,
                prediction,
                0.95,  # High confidence for training data
                json.dumps([], ensure_ascii=False),  # Empty search results for training data
                synthetic_analysis
            ))
            added_count += 1
            # Show progress every 1000 entries
            if added_count % 1000 == 0:
                print(f"   Added {added_count} entries...")
        conn.commit()
        # Report the real total, not just this run's additions
        cursor.execute('SELECT COUNT(*) FROM knowledge_entries')
        total_entries = cursor.fetchone()[0]
        conn.close()
        print("✅ Knowledge base populated successfully!")
        print(f"   📊 Added: {added_count} entries")
        print(f"   ⏭️ Skipped: {skipped_count} duplicates")
        print(f"   🎯 Total entries: {total_entries}")
        return True
    except FileNotFoundError as e:
        print(f"⚠️ Training data file not found: {e}")
        print("   The knowledge base will start empty (this is OK)")
        return False
    except Exception as e:
        print(f"⚠️ Error populating knowledge base: {e}")
        print("   The knowledge base will start empty (this is OK)")
        return False

# Populate the knowledge base with training data on startup (optional)
print("🚀 Populating knowledge base with training data...")
populate_knowledge_base_from_training_data()
CREDIBLE_SOURCES = {
    # General news outlets
    'vnexpress.net': 0.95,
    'tuoitre.vn': 0.95,
    'thanhnien.vn': 0.90,
    'dantri.com.vn': 0.90,
    'vietnamnet.vn': 0.80,
    'zing.vn': 0.85,
    'zingnews.vn': 0.85,
    'nld.com.vn': 0.90,
    'laodong.vn': 0.90,
    'kenh14.vn': 0.80,
    'soha.vn': 0.80,
    'baotintuc.vn': 0.85,
    # Political, social, and state press
    'nhandan.vn': 0.90,
    'qdnd.vn': 0.90,
    'cand.com.vn': 0.95,
    'congan.com.vn': 0.95,
    'baochinhphu.vn': 0.95,
    'vnanet.vn': 0.90,
    'quochoi.vn': 0.95,
    'chinhphu.vn': 0.95,
    # Economy, finance, investment, technology
    'cafef.vn': 0.85,
    'vietnamfinance.vn': 0.85,
    'baodautu.vn': 0.85,
    'ictnews.vietnamnet.vn': 0.85,
    # Education, culture, science
    'giaoducthoidai.vn': 0.90,
    'vov.vn': 0.90,
    'toquoc.vn': 0.90,
    # Government ministry portals
    'moh.gov.vn': 0.95,   # Ministry of Health
    'mofa.gov.vn': 0.95,  # Ministry of Foreign Affairs
    'mard.gov.vn': 0.95,  # Ministry of Agriculture and Rural Development
    'moc.gov.vn': 0.95,   # Ministry of Construction
    'mof.gov.vn': 0.95,   # Ministry of Finance
    'most.gov.vn': 0.95,  # Ministry of Science and Technology (the correct domain is most.gov.vn, not mst)
    # International & reference
    'wikipedia.org': 0.95,
    'fifa.com': 0.95,
}
def clean_text(text):
    """Clean up the text before feeding it to the model."""
    if not isinstance(text, str):
        text = str(text)
    text = re.sub(r'\s+', ' ', text.strip())
    if len(text) < 10:
        text = "Tin tức ngắn: " + text
    return text
def predict_with_distilbert(text):
    """Run the text through our trained DistilBERT model to get a prediction."""
    if model is None or tokenizer is None:
        return None, None, None, None
    try:
        clean_text_input = clean_text(text)
        inputs = tokenizer(
            clean_text_input,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=512
        )
        with torch.no_grad():
            outputs = model(**inputs)
            predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
        real_score = predictions[0][0].item()
        fake_score = predictions[0][1].item()
        if real_score > fake_score:
            prediction = "REAL"
            confidence = real_score
        else:
            prediction = "FAKE"
            confidence = fake_score
        return prediction, confidence, real_score, fake_score
    except Exception as e:
        print(f"DistilBERT prediction error: {e}")
        return None, None, None, None
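
# A minimal usage sketch (assumes the fine-tuned model loaded above, with
# label 0 = REAL and label 1 = FAKE as in the training-data loader earlier in
# this file; the sample headline and scores are illustrative only):
def _demo_predict_with_distilbert():
    """Hypothetical demo: shows the return shape of predict_with_distilbert."""
    prediction, confidence, real_score, fake_score = predict_with_distilbert(
        "Bộ Y tế xác nhận ca mắc mới tại Hà Nội.")
    # e.g. prediction="REAL", confidence=0.97, real_score=0.97, fake_score=0.03
    print(prediction, confidence, real_score, fake_score)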
def process_search_results(items):
    """Reduce raw Custom Search items to the title/snippet/link fields we use."""
    search_results = []
    for item in items:
        search_results.append({
            'title': item.get('title', ''),
            'snippet': item.get('snippet', ''),
            'link': item.get('link', '')
        })
    return search_results
def google_search_fallback(news_text):
    """Content-only analysis used when Google Search is unavailable."""
    print("Google Search is unavailable - using enhanced content analysis")
    # Enhanced content analysis without external search
    fake_indicators = ['giả', 'sai', 'không đúng', 'bịa đặt', 'lừa đảo', 'fake news', 'tin đồn', 'nghi vấn']
    real_indicators = ['chính thức', 'xác nhận', 'chính xác', 'đúng', 'verified', 'chính phủ', 'bộ y tế', 'cơ quan']
    text_lower = news_text.lower()
    fake_count = sum(1 for word in fake_indicators if word in text_lower)
    real_count = sum(1 for word in real_indicators if word in text_lower)
    # Build a more detailed analysis
    analysis_details = []
    if fake_count > real_count:
        analysis_details.append("Nhiều từ khóa nghi ngờ được sử dụng")
    elif real_count > fake_count:
        analysis_details.append("Có từ khóa xác thực từ nguồn chính thức")
    # Check for other patterns
    if len(news_text) < 100:
        analysis_details.append("Tin tức quá ngắn, thiếu thông tin chi tiết")
    if news_text.count('!') >= 2:  # the original "'!' or '!!!'" check was redundant; flag repeated exclamation marks
        analysis_details.append("Sử dụng dấu chấm than thái quá")
    snippet = f"Phân tích nội dung: {fake_count} từ nghi ngờ, {real_count} từ xác thực. "
    snippet += "; ".join(analysis_details) if analysis_details else "Không phát hiện dấu hiệu đặc biệt"
    return [{
        'title': 'Phân tích nội dung chi tiết (không có tìm kiếm Google)',
        'snippet': snippet,
        'link': 'content-analysis-only'
    }]
def google_search(news_text):
    """Search Google for information about the news - a general search that doesn't prioritize specific sites."""
    try:
        service = build("customsearch", "v1", developerKey=GOOGLE_API_KEY)
        # Extract keywords from the news text - improved for Vietnamese:
        # split into words (not individual characters)
        words = news_text.split()
        # Filter out very short words and keep the first 10 remaining ones
        important_words = [w for w in words if len(w) > 2][:10]
        # Create a keyword query from the text
        main_query = ' '.join(important_words)
        # Also try the first 100 characters as an exact phrase
        short_query = news_text[:100].strip()
        print(f"📝 News text preview: {news_text[:100]}...")
        # Prepare the search queries
        search_queries = []
        if short_query:
            search_queries.append(f'"{short_query}"')  # Exact-phrase search
        if main_query:
            search_queries.append(main_query)  # Keyword search
        print(f"🔍 Search queries prepared: {len(search_queries)}")
        for idx, q in enumerate(search_queries, 1):
            print(f"   Query {idx}: {q[:80]}...")
        all_results = []
        for i, search_query in enumerate(search_queries):
            if not search_query.strip():
                continue
            print(f"🔍 Search {i+1}: '{search_query[:80]}...'")
            try:
                # Request 10 results per query
                result = service.cse().list(
                    q=search_query,
                    cx=SEARCH_ENGINE_ID,
                    num=10
                ).execute()
                if 'items' in result and result['items']:
                    all_results.extend(result['items'])
                    print(f"✅ Found {len(result['items'])} results (total: {len(all_results)})")
                    # Stop searching once we have enough results
                    if len(all_results) >= 15:  # fetch a few extra for diversity filtering
                        break
            except Exception as e:
                print(f"Search error: {e}")
                continue
        if all_results:
            # Filter for diversity - at most 2 results per domain
            diverse_results = []
            domain_count = {}
            for item in all_results:
                # Extract the domain from the URL
                url = item.get('link', '')
                domain = url.split('/')[2] if len(url.split('/')) > 2 else url
                # Count results per domain
                if domain not in domain_count:
                    domain_count[domain] = 0
                # Only add if we have fewer than 2 from this domain
                if domain_count[domain] < 2:
                    diverse_results.append(item)
                    domain_count[domain] += 1
                # Stop once we have 10 diverse results
                if len(diverse_results) >= 10:
                    break
            print(f"✅ Returning {len(diverse_results)} diverse results from {len(domain_count)} domains")
            return process_search_results(diverse_results)
        print("No results found, using fallback...")
        return google_search_fallback(news_text)
    except Exception as e:
        print(f"Google Search error: {e}")
        print(f"Error type: {type(e).__name__}")
        error_str = str(e).lower()
        if any(keyword in error_str for keyword in ["403", "blocked", "quota", "limit", "exceeded"]):
            print("Google Search API blocked/quota exceeded, using fallback...")
            # Return error information along with the fallback results
            fallback_results = google_search_fallback(news_text)
            return {
                'results': fallback_results,
                'error': 'QUOTA_EXCEEDED',
                'error_message': 'Google Search API quota exceeded. Using content analysis only.',
                'error_details': str(e)
            }
        elif "invalid" in error_str or "unauthorized" in error_str:
            print("API key issue, using fallback...")
            fallback_results = google_search_fallback(news_text)
            return {
                'results': fallback_results,
                'error': 'API_KEY_INVALID',
                'error_message': 'Google Search API key invalid. Using content analysis only.',
                'error_details': str(e)
            }
        else:
            print("Unknown Google Search error, using fallback...")
            fallback_results = google_search_fallback(news_text)
            return {
                'results': fallback_results,
                'error': 'UNKNOWN_ERROR',
                'error_message': 'Google Search failed. Using content analysis only.',
                'error_details': str(e)
            }
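
# The diversity filter above derives the domain with url.split('/')[2], which
# works for absolute http(s) URLs but breaks on scheme-less links. A more
# robust variant via the standard library (a sketch for illustration; the
# pipeline does not use this helper):
def _extract_domain_sketch(url):
    """Hypothetical helper: domain extraction that tolerates odd URLs."""
    from urllib.parse import urlparse
    parsed = urlparse(url if '//' in url else f'https://{url}')
    return parsed.netloc or url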
def analyze_sources(search_results):
    """Check how trustworthy the news sources are, using AVERAGE credibility."""
    if not search_results:
        # Must match the five values returned below (the original early return
        # only produced four, which broke unpacking at the call site)
        return 0.50, 0.20, "No sources found", [], []
    credible_count = 0
    total_sources = len(search_results)
    found_sources = []
    credible_sources_found = []
    credibility_scores = []  # Track individual credibility scores
    for result in search_results:
        domain = result['link'].split('/')[2] if '//' in result['link'] else ''
        found_sources.append(domain)
        # Check whether this domain matches any credible source
        found_credible = False
        for source, credibility in CREDIBLE_SOURCES.items():
            if source in domain:
                credible_count += 1
                credible_sources_found.append(f"{source} ({credibility:.0%})")
                credibility_scores.append(credibility)  # Use the source's actual credibility score
                found_credible = True
                break
        # Unrecognized domains are assigned a neutral 50% (0.50)
        if not found_credible:
            credibility_scores.append(0.50)
    # AVERAGE credibility across all sources (credible sites keep their %, unrecognized ones count as 50%)
    source_credibility = sum(credibility_scores) / len(credibility_scores) if credibility_scores else 0.50
    print("📊 Source Credibility Calculation:")
    print(f"   Total sources: {total_sources}")
    print(f"   Credible sources: {credible_count}")
    print(f"   Non-credible sources: {total_sources - credible_count} (each counts as 50%)")
    print(f"   Individual scores: {[f'{s:.0%}' for s in credibility_scores]}")
    print(f"   Average credibility: {source_credibility:.1%}")
    popularity_score = min(1.0, total_sources / 5.0)  # Normalize to 0-1
    # Summarize what we found
    if source_credibility > 0.7:
        credibility_text = f"High credibility: {credible_count}/{total_sources} sources from reputable outlets (avg: {source_credibility:.0%})"
    elif source_credibility > 0.4:
        credibility_text = f"Medium credibility: {credible_count}/{total_sources} sources from reputable outlets (avg: {source_credibility:.0%})"
    else:
        credibility_text = f"Low credibility: {credible_count}/{total_sources} sources from reputable outlets (avg: {source_credibility:.0%})"
    return source_credibility, popularity_score, credibility_text, found_sources, credible_sources_found
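
# A worked example of the averaging rule above (illustrative numbers): three
# results from vnexpress.net (0.95), an unknown blog (0.50), and tuoitre.vn
# (0.95) give an average credibility of (0.95 + 0.50 + 0.95) / 3 ≈ 0.80, and a
# popularity score of min(1.0, 3 / 5.0) = 0.60.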
def analyze_source_support(news_text, search_results):
    """Check whether the search results agree or disagree with the news."""
    if not search_results:
        return 0.5, "No sources to analyze"
    support_count = 0
    contradict_count = 0
    total_sources = len(search_results)
    # Look for years mentioned in the news (re is imported at module level)
    news_years = re.findall(r'\b(20\d{2})\b', news_text)
    news_year = news_years[0] if news_years else None
    for result in search_results:
        title_snippet = (result.get('title', '') + ' ' + result.get('snippet', '')).lower()
        # See whether the years match up
        if news_year:
            source_years = re.findall(r'\b(20\d{2})\b', title_snippet)
            if source_years and news_year not in source_years:
                contradict_count += 1
                continue
        # Look for words that suggest agreement or disagreement
        support_keywords = ['confirm', 'verify', 'true', 'accurate', 'correct', 'xác nhận', 'chính xác', 'đúng']
        contradict_keywords = ['false', 'fake', 'incorrect', 'wrong', 'sai', 'giả', 'không đúng']
        support_score = sum(1 for keyword in support_keywords if keyword in title_snippet)
        contradict_score = sum(1 for keyword in contradict_keywords if keyword in title_snippet)
        if contradict_score > support_score:
            contradict_count += 1
        elif support_score > contradict_score:
            support_count += 1
        else:
            # If unclear, assume slight support
            support_count += 0.5
    support_ratio = support_count / total_sources if total_sources > 0 else 0.5
    if support_ratio > 0.7:
        support_text = f"Sources strongly support the news: {support_count:.1f}/{total_sources} sources confirm"
    elif support_ratio > 0.4:
        support_text = f"Sources mixed: {support_count:.1f}/{total_sources} sources support, {contradict_count} contradict"
    else:
        support_text = f"Sources contradict the news: {contradict_count}/{total_sources} sources contradict"
    return support_ratio, support_text
def analyze_with_gemini(news_text, search_results, distilbert_prediction, distilbert_confidence):
    """Use Gemini AI to analyze the news and compare with our model results."""
    try:
        # Knowledge base search with training data
        if ENABLE_KNOWLEDGE_BASE_SEARCH:
            print("🔍 Searching knowledge base for similar entries...")
            knowledge_results = search_knowledge_base(news_text, limit=2)  # Limited to 2 for speed
            knowledge_context = format_knowledge_for_rag(knowledge_results)
        else:
            knowledge_context = ""
        # Use the newest Gemini model available, falling back through older ones.
        # (Named gemini_model so it doesn't shadow the global DistilBERT `model`.)
        gemini_model = None
        for model_name in ('gemini-2.0-flash-exp', 'gemini-2.5-flash', 'gemini-1.5-pro'):
            try:
                gemini_model = genai.GenerativeModel(model_name)
                break
            except Exception:
                continue
        if gemini_model is None:
            gemini_model = genai.GenerativeModel('gemini-1.5-flash')
        # Format the search results for Gemini (limited to the top 3 for speed)
        if search_results:
            search_summary = "Kết quả tìm kiếm Google:\n"
            for i, result in enumerate(search_results[:3], 1):
                search_summary += f"{i}. {result['title']}\n   {result['snippet']}\n   Nguồn: {result['link']}\n\n"
        else:
            search_summary = "Không tìm thấy kết quả tìm kiếm Google cho tin tức này. Điều này có thể do API bị giới hạn hoặc tin tức quá mới/chưa được đăng tải."
        # Note: DistilBERT results are intentionally excluded to keep the Gemini analysis independent
        prompt = f"""
Bạn là một chuyên gia phân tích tin tức chuyên nghiệp. Hãy phân tích chi tiết tin tức sau và đánh giá độ tin cậy của nó:
"{news_text}"
{search_summary}
{knowledge_context}
Hãy thực hiện phân tích toàn diện theo các tiêu chí sau:
1. Phân tích nội dung: Kiểm tra tính logic, mâu thuẫn, ngôn ngữ cảm xúc thái quá
2. Phân tích nguồn tin: Đánh giá uy tín và độ tin cậy của nguồn
3. Phân tích ngữ cảnh: So sánh với thông tin có sẵn và kiến thức thực tế
4. Phân tích ngôn ngữ: Tìm dấu hiệu của tin giả như từ ngữ gây sốc, cảm xúc
5. Phân tích thời gian: Kiểm tra tính hợp lý về mặt thời gian
Trả lời theo định dạng sau (chỉ bằng tiếng Việt, viết chi tiết và chuyên nghiệp):
1. KẾT LUẬN: [THẬT/GIẢ/KHÔNG XÁC ĐỊNH]
2. ĐỘ TIN CẬY: [THẬT: X% / GIẢ: Y%] (Trong đó X% là độ tin cậy tin THẬT, Y% là độ tin cậy tin GIẢ, X+Y=100%)
3. PHÂN TÍCH CHI TIẾT:
- Nội dung: [Phân tích chi tiết về nội dung tin tức]
- Nguồn tin: [Đánh giá về nguồn và độ tin cậy]
- Ngữ cảnh: [So sánh với thông tin có sẵn]
- Ngôn ngữ: [Phân tích cách sử dụng từ ngữ]
- Thời gian: [Kiểm tra tính hợp lý về mặt thời gian]
4. CÁC DẤU HIỆU CẢNH BÁO: [Liệt kê các dấu hiệu đáng ngờ nếu có]
5. KHUYẾN NGHỊ CHO NGƯỜI ĐỌC:
- [Hướng dẫn cụ thể để kiểm chứng thông tin]
- [Các nguồn tin đáng tin cậy để tham khảo]
- [Cách phân biệt tin thật và tin giả]
QUAN TRỌNG: Trong phần "ĐỘ TIN CẬY", hãy cung cấp tỷ lệ phần trăm chính xác dựa trên phân tích của bạn. Ví dụ: "THẬT: 95% / GIẢ: 5%" nghĩa là 95% tin tức này là THẬT, 5% là GIẢ.
Viết chi tiết, chuyên nghiệp và hữu ích cho người đọc.
"""
        print("Calling Gemini API...")
        print(f"DEBUG - News text being analyzed: {news_text}")
        print(f"DEBUG - Search results count: {len(search_results)}")
        if search_results:
            print(f"DEBUG - First search result title: {search_results[0].get('title', 'No title')}")
        # Settings tuned for faster processing
        generation_config = genai.types.GenerationConfig(
            temperature=0.3,        # Lower for more consistent results
            top_p=0.8,              # Reduced for faster processing
            top_k=20,               # Reduced for faster processing
            max_output_tokens=1000  # Reduced for faster responses
        )
        response = gemini_model.generate_content(prompt, generation_config=generation_config)
        print("Gemini API response received successfully")
        return response.text
    except Exception as e:
        print(f"Gemini analysis error: {e}")
        print(f"Error type: {type(e).__name__}")
        # If we hit the API rate limit, provide a basic analysis instead
        if "429" in str(e) or "quota" in str(e).lower():
            print("Gemini API quota exceeded, providing enhanced fallback analysis...")
            # Enhanced analysis based on content patterns
            fake_patterns = ['giả', 'sai', 'không đúng', 'bịa đặt', 'lừa đảo', 'fake news', 'tin đồn']
            real_patterns = ['chính thức', 'xác nhận', 'chính xác', 'đúng', 'verified', 'chính phủ', 'bộ y tế']
            news_lower = news_text.lower()
            fake_score = sum(1 for pattern in fake_patterns if pattern in news_lower)
            real_score = sum(1 for pattern in real_patterns if pattern in news_lower)
            # Adjust the confidence note based on the patterns found
            if fake_score > real_score and distilbert_prediction == 'FAKE':
                confidence_boost = "Cao (có từ khóa nghi ngờ)"
            elif real_score > fake_score and distilbert_prediction == 'REAL':
                confidence_boost = "Cao (có từ khóa xác thực)"
            else:
                confidence_boost = "Trung bình"
            # Build a detailed fallback analysis
            conclusion = 'THẬT' if distilbert_prediction == 'REAL' else 'GIẢ' if distilbert_prediction == 'FAKE' else 'KHÔNG XÁC ĐỊNH'
            suspicious_patterns = []
            if fake_score > 0:
                suspicious_patterns.append(f"Tìm thấy {fake_score} từ khóa nghi ngờ")
            if real_score > 0:
                suspicious_patterns.append(f"Tìm thấy {real_score} từ khóa xác thực")
            warning_signs = []
            if 'cảnh báo' in news_lower or 'nguy hiểm' in news_lower:
                warning_signs.append("Sử dụng từ ngữ gây sợ hãi")
            if 'ngay lập tức' in news_lower or 'khẩn cấp' in news_lower:
                warning_signs.append("Tạo cảm giác cấp bách không cần thiết")
            if len(news_text) < 100:
                warning_signs.append("Tin tức quá ngắn, thiếu thông tin chi tiết")
            fallback_analysis = f"""1. KẾT LUẬN: {conclusion}
2. ĐỘ TIN CẬY: {'THẬT: 5% / GIẢ: 95%' if conclusion == 'GIẢ' else 'THẬT: 95% / GIẢ: 5%' if conclusion == 'THẬT' else 'THẬT: 50% / GIẢ: 50%'}
3. PHÂN TÍCH CHI TIẾT:
- Nội dung: {'Tin tức có vẻ hợp lý' if distilbert_prediction == 'REAL' else 'Tin tức có nhiều dấu hiệu đáng ngờ' if distilbert_prediction == 'FAKE' else 'Nội dung không rõ ràng'}
- Nguồn tin: Google Search không khả dụng (hết quota) - không thể kiểm tra nguồn
- Ngữ cảnh: Phân tích từ khóa: {confidence_boost}
- Ngôn ngữ: {'Ngôn ngữ trung tính' if fake_score == real_score else 'Có dấu hiệu cảm xúc thái quá' if fake_score > real_score else 'Ngôn ngữ khách quan'}
- Thời gian: Không thể xác minh do thiếu thông tin bổ sung
4. CÁC DẤU HIỆU CẢNH BÁO:
{chr(10).join([f"- {sign}" for sign in warning_signs]) if warning_signs else "- Không phát hiện dấu hiệu cảnh báo rõ ràng"}
5. KHUYẾN NGHỊ CHO NGƯỜI ĐỌC:
- Kiểm tra nguồn: Tìm kiếm thông tin tương tự trên các trang báo uy tín như VnExpress, Tuổi Trẻ, Thanh Niên
- Xác minh thời gian: Kiểm tra xem tin tức có được đăng tải đồng thời trên nhiều nguồn không
- Đánh giá ngôn ngữ: Tránh chia sẻ tin tức có ngôn ngữ cảm xúc thái quá hoặc tạo cảm giác cấp bách
- Lưu ý: Do hệ thống API tạm thời không khả dụng, kết quả phân tích có thể không hoàn toàn chính xác"""
            return fallback_analysis
        # For other errors, list the available models to help debugging
        try:
            models = genai.list_models()
            print("Available models:")
            for m in models:
                if 'gemini' in m.name.lower():
                    print(f"  - {m.name}")
        except Exception as list_error:
            print(f"Could not list models: {list_error}")
        return f"Lỗi phân tích Gemini: {e}"
def extract_gemini_percentage(gemini_analysis):
    """Extract the percentage confidence from a Gemini analysis, trying several patterns."""
    try:
        gemini_lower = gemini_analysis.lower()
        # Pattern set 1: "ĐỘ TIN CẬY: THẬT: X% / GIẢ: Y%" formats (flexible spacing)
        patterns = [
            # Standard format with "độ tin cậy"
            r'độ\s*tin\s*cậy[:\s]*thật[:\s]*(\d+)\s*%[:\s/]*giả[:\s]*(\d+)\s*%',
            # Reversed order
            r'độ\s*tin\s*cậy[:\s]*giả[:\s]*(\d+)\s*%[:\s/]*thật[:\s]*(\d+)\s*%',
            # Without the "độ tin cậy" prefix
            r'thật[:\s]*(\d+)\s*%[:\s/]*giả[:\s]*(\d+)\s*%',
            r'giả[:\s]*(\d+)\s*%[:\s/]*thật[:\s]*(\d+)\s*%',
            # With ":" separators
            r'thật:\s*(\d+)\s*%\s*/\s*giả:\s*(\d+)\s*%',
            r'giả:\s*(\d+)\s*%\s*/\s*thật:\s*(\d+)\s*%',
        ]
        for pattern in patterns:
            match = re.search(pattern, gemini_lower)
            if match:
                percent1 = int(match.group(1))
                percent2 = int(match.group(2))
                # Decide which number is "real" and which is "fake" by checking
                # which label appears first in the matched text
                match_text = match.group(0).lower()
                if 'thật' in match_text[:len(match_text)//2]:
                    # "thật" appears first
                    real_percent = percent1
                    fake_percent = percent2
                elif 'giả' in match_text[:len(match_text)//2]:
                    # "giả" appears first, so swap
                    fake_percent = percent1
                    real_percent = percent2
                else:
                    # Can't determine from context; assume the first is real
                    real_percent = percent1
                    fake_percent = percent2
                # Clamp to 0-100, then convert to the 0-1 scale
                real_percent = min(100, max(0, real_percent)) / 100
                fake_percent = min(100, max(0, fake_percent)) / 100
                # If the two don't sum to 1, normalize
                total = real_percent + fake_percent
                if total > 0:
                    real_percent = real_percent / total
                    fake_percent = fake_percent / total
                else:
                    real_percent = 0.5
                    fake_percent = 0.5
                print(f"✅ Extracted Gemini percentages: {real_percent:.1%} real, {fake_percent:.1%} fake")
                return real_percent, fake_percent
        # Fallback: look for "THẬT: X%" and "GIẢ: Y%" on separate lines
        real_match = re.search(r'thật[:\s]*(\d+)\s*%', gemini_lower)
        fake_match = re.search(r'giả[:\s]*(\d+)\s*%', gemini_lower)
        if real_match and fake_match:
            real_percent = min(100, max(0, int(real_match.group(1)))) / 100
            fake_percent = min(100, max(0, int(fake_match.group(1)))) / 100
            # Normalize
            total = real_percent + fake_percent
            if total > 0:
                real_percent = real_percent / total
                fake_percent = fake_percent / total
            else:
                real_percent = 0.5
                fake_percent = 0.5
            print(f"✅ Extracted Gemini percentages (fallback): {real_percent:.1%} real, {fake_percent:.1%} fake")
            return real_percent, fake_percent
        # Last resort: look for any percentage numbers near "thật" or "giả"
        all_percentages = re.findall(r'(\d+)\s*%', gemini_lower)
        if len(all_percentages) >= 2:
            real_percent = None
            fake_percent = None
            # Inspect the context around each percentage (the original relied on
            # a fragile `in locals()` check; explicit None sentinels are safer)
            for match_obj in re.finditer(r'(\d+)\s*%', gemini_lower):
                start_pos = max(0, match_obj.start() - 20)
                end_pos = min(len(gemini_lower), match_obj.end() + 20)
                context = gemini_lower[start_pos:end_pos]
                percent_val = min(100, max(0, int(match_obj.group(1)))) / 100
                if 'thật' in context and real_percent is None:
                    real_percent = percent_val
                elif 'giả' in context and fake_percent is None:
                    fake_percent = percent_val
            if real_percent is not None and fake_percent is not None:
                # Normalize
                total = real_percent + fake_percent
                if total > 0:
                    real_percent = real_percent / total
                    fake_percent = fake_percent / total
                else:
                    real_percent = 0.5
                    fake_percent = 0.5
                print(f"✅ Extracted Gemini percentages (last resort): {real_percent:.1%} real, {fake_percent:.1%} fake")
                return real_percent, fake_percent
        print("⚠️ Could not extract Gemini percentages from analysis")
        print(f"   Analysis preview: {gemini_analysis[:200]}...")
        return None, None
    except Exception as e:
        print(f"❌ Error extracting Gemini percentages: {e}")
        import traceback
        print(f"   Traceback: {traceback.format_exc()}")
        return None, None
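
# A quick illustration of the expected extraction (the input line follows the
# standard answer format requested in the prompt above; the numbers are
# illustrative):
def _demo_extract_gemini_percentage():
    """Hypothetical demo: parse a confidence line in the standard format."""
    real, fake = extract_gemini_percentage("2. ĐỘ TIN CẬY: THẬT: 90% / GIẢ: 10%")
    print(real, fake)  # -> 0.9 0.1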
def calculate_combined_confidence(distilbert_prediction, distilbert_confidence, source_credibility, popularity_score, gemini_analysis, source_support=0.5, skip_google_search=False, num_search_results=0):
    """Calculate combined confidence using a multi-tier system.

    SPECIAL CASES (priority order):
    1. Gemini ≥95% (real OR fake) → Final Score = Gemini max confidence
    2. Google Search ≥95% (real)  → Final Score = Google Search score
    3. Google Search ≤5% (fake)   → Final Score = Google Search score
    4. Both ≥85%                  → Final Score = (Gemini + Google Search) / 2
    5. Default: weighted average (DistilBERT 35%, Gemini 35%, Google Search 30%)

    WEIGHTS (when Google Search is available):
    - DistilBERT: 35% (or 52.5% if Google Search is skipped)
    - Gemini AI: 35% (or 47.5% if Google Search is skipped)
    - Google Search: 30% (or 0% if skipped)

    Note: the Gemini score uses the MAX confidence (≥95% in either direction triggers the override).
    """
    # Adjust the weights if Google Search is skipped
    if skip_google_search:
        distilbert_weight = 0.525  # 52.5%
        gemini_weight = 0.475      # 47.5%
        google_weight = 0.0        # 0%
        print("⚠️ Google Search skipped - Using adjusted weights: DistilBERT 52.5%, Gemini 47.5%")
    else:
        distilbert_weight = 0.35  # 35%
        gemini_weight = 0.35      # 35%
        google_weight = 0.30      # 30%
    # 1. DISTILBERT SCORE
    if distilbert_prediction == "REAL":
        distilbert_score = distilbert_confidence
    else:
        distilbert_score = 1 - distilbert_confidence
    print(f"DistilBERT Score: {distilbert_score:.3f} ({distilbert_weight*100:.1f}% weight)")
    # 2. GEMINI AI SCORE
    gemini_lower = gemini_analysis.lower()
    gemini_direction = "UNKNOWN"  # Initialize the direction
    # Try to extract percentages from the Gemini analysis first
    gemini_real_percent, gemini_fake_percent = extract_gemini_percentage(gemini_analysis)
    if gemini_real_percent is not None and gemini_fake_percent is not None:
        # Determine which direction Gemini is more confident about
        if gemini_real_percent > gemini_fake_percent:
            # Gemini thinks it's REAL - use the real percentage
            gemini_score = gemini_real_percent
            gemini_direction = "REAL"
        else:
            # Gemini thinks it's FAKE - use the fake percentage
            gemini_score = gemini_fake_percent
            gemini_direction = "FAKE"
        print(f"Gemini Score (from percentage): {gemini_score:.3f} ({gemini_weight*100:.1f}% weight) - {gemini_real_percent:.1%} real, {gemini_fake_percent:.1%} fake")
        print(f"   → Gemini direction: {gemini_direction} with {gemini_score:.1%} confidence (triggers override if ≥95%)")
    else:
        # Fall back to analyzing the written conclusion
        conclusion_score = 0.5  # Default neutral
        if "kết luận: giả" in gemini_lower or "kết luận: fake" in gemini_lower:
            conclusion_score = 0.1  # Very low for FAKE
            print("Gemini Conclusion: FAKE")
        elif "kết luận: thật" in gemini_lower or "kết luận: real" in gemini_lower:
            conclusion_score = 0.9  # Very high for REAL
            print("Gemini Conclusion: REAL")
        elif "giả" in gemini_lower and "kết luận" in gemini_lower:
            # Check whether "giả" appears near "kết luận"
            conclusion_start = gemini_lower.find("kết luận")
            if conclusion_start != -1:
                conclusion_section = gemini_lower[conclusion_start:conclusion_start + 50]
                if "giả" in conclusion_section:
                    conclusion_score = 0.1
                    print("Gemini Conclusion: FAKE (detected in conclusion section)")
                elif "thật" in conclusion_section:
                    conclusion_score = 0.9
                    print("Gemini Conclusion: REAL (detected in conclusion section)")
        # Additional analysis indicators
        fake_indicators = ["giả", "fake", "vô lý", "phi thực tế", "absurd", "preposterous", "impossible",
                           "không thể xảy ra", "không có căn cứ", "tin giả"]
        real_indicators = ["thật", "real", "chính xác", "đúng", "xác nhận", "verified", "đáng tin cậy"]
        fake_count = sum(1 for indicator in fake_indicators if indicator in gemini_lower)
        real_count = sum(1 for indicator in real_indicators if indicator in gemini_lower)
        # Adjust based on the indicator counts (but the conclusion takes priority)
        if fake_count > real_count:
            analysis_adjustment = -0.2
            print(f"Gemini Analysis: {fake_count} fake indicators vs {real_count} real indicators")
        elif real_count > fake_count:
            analysis_adjustment = 0.2
            print(f"Gemini Analysis: {real_count} real indicators vs {fake_count} fake indicators")
        else:
            analysis_adjustment = 0.0
        gemini_score = max(0.1, min(0.9, conclusion_score + analysis_adjustment))
        print(f"Gemini Score (from conclusion): {gemini_score:.3f} ({gemini_weight*100:.1f}% weight)")
    # 3. GOOGLE SEARCH SCORE
    if skip_google_search:
        google_search_score = 0
        google_search_real = False  # Cannot determine whether Google Search indicates real news
        google_search_fake = False  # Cannot determine whether Google Search indicates fake news
        print("Google Search Score: 0.000 (0% weight - SKIPPED due to short input)")
    else:
        # Calculate the Google Search score from credibility and support
        # Source credibility component (max 0.75)
        credibility_component = source_credibility * 0.75
        # Source support component (max 0.75)
        support_component = source_support * 0.75
        # Base score (0.25) + components, capped at 1.0 (100%)
        google_search_score = min(credibility_component + support_component + 0.25, 1.0)
        # If Gemini strongly says FAKE, cap the Google Search score
        if gemini_score < 0.3:  # Gemini says FAKE (low score)
            google_search_score = min(google_search_score, 0.4)  # Cap at 0.4 when Gemini says fake
            print(f"Google Search Score: {google_search_score:.3f} ({google_weight*100:.1f}% weight - {num_search_results} results) - Credibility: {source_credibility:.2f}, Support: {source_support:.2f} - CAPPED due to Gemini FAKE")
        else:
            print(f"Google Search Score: {google_search_score:.3f} ({google_weight*100:.1f}% weight - {num_search_results} results) - Credibility: {source_credibility:.2f}, Support: {source_support:.2f}")
        # Determine whether Google Search indicates high-confidence real or fake
        google_search_real = google_search_score >= 0.95  # ≥95% indicates strong evidence of real news
        google_search_fake = google_search_score <= 0.05  # ≤5% indicates strong evidence of fake news
    # 4. CHECK THE SPECIAL CASES (in priority order)
    # Case 1: Gemini ≥95% (real OR fake) → Final Score = Gemini score
    if gemini_score >= 0.95:
        print(f"🚀 GEMINI OVERRIDE TRIGGERED: {gemini_score:.1%} ≥ 95%")
        # Use the actual Gemini score in whichever direction Gemini chose
        if gemini_real_percent is not None and gemini_fake_percent is not None:
            if gemini_real_percent > gemini_fake_percent:
                # Gemini thinks it's REAL with high confidence
                final_confidence = gemini_real_percent
                print(f"🎯 GEMINI ≥95% REAL: Final Score = Gemini Real Confidence ({gemini_real_percent:.1%})")
            else:
                # Gemini thinks it's FAKE with high confidence
                final_confidence = gemini_fake_percent
                print(f"🎯 GEMINI ≥95% FAKE: Final Score = Gemini Fake Confidence ({gemini_fake_percent:.1%})")
        else:
            # Fall back to the calculated gemini_score
            final_confidence = gemini_score
            print(f"🎯 GEMINI ≥95% (Fallback): Final Score = Gemini Score ({gemini_score:.1%})")
        final_confidence = min(1.0, max(0.0, final_confidence))  # Bound between 0-100%
        print(f"   - Gemini Direction: {gemini_direction}")
        print(f"   - Gemini Confidence: {gemini_score:.1%}")
        print(f"   - Final Confidence: {final_confidence:.1%}")
        print("   - DistilBERT: IGNORED (Gemini override)")
        print("   - Google Search: IGNORED (Gemini override)")
        print(f"✅ GEMINI OVERRIDE COMPLETE: Using {final_confidence:.1%} as final score")
    # Case 2: Google Search ≥95% → Final Score = Google Search score
    elif google_search_real and not skip_google_search:
        final_confidence = min(1.0, max(0.0, google_search_score))  # Bound between 0-100%
        print("🎯 GOOGLE SEARCH ≥95%: Final Score = Google Search Score")
        print(f"   - Google Search Score: {google_search_score:.1%}")
        print(f"   - Final Confidence: {final_confidence:.1%}")
        print("   - DistilBERT: IGNORED (Google Search override)")
        print("   - Gemini: IGNORED (Google Search override)")
    # Case 3: Google Search ≤5% (fake) → Final Score = Google Search score
    elif google_search_fake and not skip_google_search:
        final_confidence = min(1.0, max(0.0, google_search_score))  # Bound between 0-100%
        print("🎯 GOOGLE SEARCH ≤5% (FAKE): Final Score = Google Search Score")
        print(f"   - Google Search Score: {google_search_score:.1%}")
        print(f"   - Final Confidence: {final_confidence:.1%}")
        print("   - DistilBERT: IGNORED (Google Search override)")
        print("   - Gemini: IGNORED (Google Search override)")
    # Case 4: Both ≥85% → Final Score = (Gemini + Google Search) / 2
    elif not skip_google_search and google_search_score >= 0.85 and gemini_score >= 0.85:
        final_confidence = min(1.0, max(0.0, (google_search_score + gemini_score) / 2))  # Bound between 0-100%
        print("🎯 BOTH ≥85%: Final Score = (Gemini + Google Search) / 2")
        print(f"   - Gemini Max Confidence: {gemini_score:.1%}")
        print(f"   - Google Search Score: {google_search_score:.1%}")
        print(f"   - Final Confidence: {final_confidence:.1%} (Average)")
        print("   - DistilBERT: IGNORED (Both high confidence override)")
    # Case 5: Default weighted average (normal case)
    else:
        final_confidence = (
            distilbert_score * distilbert_weight +
            gemini_score * gemini_weight +
            google_search_score * google_weight
        )
        # Apply reasonable bounds (allow the full 0-100% range)
        final_confidence = min(1.0, max(0.0, final_confidence))
        print("📊 DEFAULT WEIGHTED AVERAGE: Using standard weights")
        print(f"   - DistilBERT ({distilbert_weight*100:.1f}%): {distilbert_score:.1%}")
        print(f"   - Gemini ({gemini_weight*100:.1f}%): {gemini_score:.1%}")
        print(f"   - Google Search ({google_weight*100:.1f}%): {google_search_score:.1%}")
        print(f"   - Final Confidence: {final_confidence:.1%}")
    return final_confidence
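
# A worked example of the default weighted average (Case 5, illustrative
# numbers): DistilBERT says REAL at 0.80, Gemini extracts 70% real / 30% fake,
# and the Google Search score comes out at 0.65. Then
#   final = 0.80 * 0.35 + 0.70 * 0.35 + 0.65 * 0.30 = 0.28 + 0.245 + 0.195 = 0.72
# so the combined confidence is 72%, and no override case fires because
# nothing reaches the 85%/95% thresholds.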
def analyze_news(news_text):
    """Main analysis function combining all three tools."""
    try:
        # Check whether the input is empty
        if not news_text.strip():
            empty_message = """
<div style="font-family: 'Segoe UI', Arial, sans-serif; line-height: 1.6; color: #333;">

## 📝 **HƯỚNG DẪN SỬ DỤNG**

<div style="background: linear-gradient(135deg, #74b9ff 0%, #0984e3 100%); color: white; padding: 20px; border-radius: 10px; margin: 20px 0; text-align: center;">
<h2 style="margin: 0; font-size: 24px;">💡 Vui lòng nhập tin tức</h2>
<p style="margin: 10px 0 0 0; font-size: 16px; opacity: 0.9;">Để bắt đầu phân tích</p>
</div>
<div style="background: #f8f9fa; padding: 15px; border-radius: 8px; border-left: 4px solid #17a2b8; margin: 10px 0;">
<p><strong>Hướng dẫn:</strong></p>
<ul>
<li>Nhập tin tức tiếng Việt cần kiểm tra vào ô trên</li>
<li>Nhấn nút "Phân tích với AI nâng cao"</li>
<li>Chờ hệ thống phân tích (có thể mất 10-30 giây)</li>
<li>Xem kết quả phân tích chi tiết</li>
</ul>
</div>
</div>
"""
            return gr.update(value=empty_message, visible=True), "**Độ chắc chắn là tin thật:** 0%", "**Độ chắc chắn là tin giả:** 0%", gr.update(visible=False), gr.update(visible=False)
        print(f"Analyzing: {news_text[:50]}...")
        # Skip Google Search if the input is too short (< 20 characters)
        skip_google_search = len(news_text.strip()) < 20
        if skip_google_search:
            print("⚠️ Input too short (< 20 chars) - Skipping Google Search, will use only DistilBERT + Gemini")
        # Step 1: Search Google for related information (skipped if the input is too short)
        search_results = []
        search_error = None
        if skip_google_search:
            print("1. Skipping Google Search (input < 20 chars)...")
        else:
            print("1. Running Google Search...")
            try:
                search_response = google_search(news_text)
                # Handle both the old format (list) and the new format (dict with error info)
                if isinstance(search_response, dict) and 'error' in search_response:
                    search_results = search_response['results']
                    search_error = {
                        'type': search_response['error'],
                        'message': search_response['error_message'],
                        'details': search_response['error_details']
                    }
                    print(f"Google Search error: {search_error['message']}")
                else:
                    search_results = search_response
            except Exception as e:
                print(f"Google Search error: {e}")
                search_results = []
                search_error = {
                    'type': 'EXCEPTION',
                    'message': 'Google Search failed with exception',
                    'details': str(e)
                }
        # Step 2: Run our trained model
        print("2. Running DistilBERT analysis...")
        try:
            distilbert_prediction, distilbert_confidence, real_score, fake_score = predict_with_distilbert(news_text)
        except Exception as e:
            print(f"DistilBERT analysis error: {e}")
            distilbert_prediction, distilbert_confidence, real_score, fake_score = None, None, None, None
        # Step 3: Check the sources we found
        print("3. Analyzing sources and popularity...")
        try:
            source_credibility, popularity_score, credibility_text, found_sources, credible_sources_found = analyze_sources(search_results)
            source_support, support_text = analyze_source_support(news_text, search_results)
        except Exception as e:
            print(f"Source analysis error: {e}")
            source_credibility, popularity_score, credibility_text = 0.5, 0.2, "Lỗi phân tích nguồn"
            found_sources, credible_sources_found = [], []
            source_support, support_text = 0.5, "Lỗi phân tích hỗ trợ nguồn"
        # Step 4: Get the Gemini AI analysis
        print("4. Running Gemini analysis...")
        try:
            gemini_analysis = analyze_with_gemini(news_text, search_results, distilbert_prediction, distilbert_confidence)
        except Exception as e:
            print(f"Gemini analysis error: {e}")
            gemini_analysis = f"Lỗi phân tích Gemini: {str(e)}"
        # Step 5: Combine everything into the final result
| print("5. Calculating combined confidence...") | |
| print(f" DistilBERT: {distilbert_prediction} ({distilbert_confidence:.3f})") | |
| print(f" Source credibility: {source_credibility:.3f}") | |
| print(f" Source support: {source_support:.3f}") | |
| print(f" Popularity: {popularity_score:.3f}") | |
| try: | |
| combined_confidence = calculate_combined_confidence( | |
| distilbert_prediction, distilbert_confidence, | |
| source_credibility, popularity_score, gemini_analysis, source_support, skip_google_search, len(search_results) | |
| ) | |
| print(f" Final combined confidence: {combined_confidence:.3f}") | |
| except Exception as e: | |
| print(f"Confidence calculation error: {e}") | |
| combined_confidence = 0.5 # Default to neutral | |
| # Step 6: Format the final results | |
| # Always prioritize Gemini's percentages when available | |
| gemini_real_percent, gemini_fake_percent = extract_gemini_percentage(gemini_analysis) | |
| if gemini_real_percent is not None and gemini_fake_percent is not None: | |
| # Use Gemini's actual percentages directly | |
| real_confidence = gemini_real_percent | |
| fake_confidence = gemini_fake_percent | |
| print(f"🎯 Using Gemini results directly: {real_confidence:.1%} REAL, {fake_confidence:.1%} FAKE") | |
| else: | |
| # Fallback to weighted average if Gemini percentages not available | |
| real_confidence = combined_confidence | |
| fake_confidence = 1 - combined_confidence | |
| print(f"📊 Using weighted average (Gemini fallback): {real_confidence:.1%} REAL, {fake_confidence:.1%} FAKE") | |
| # Step 7: Check if result should be added to RAG (Gemini ≥95% in either direction) | |
| # Use the same gemini_real_percent and gemini_fake_percent from above | |
| if gemini_real_percent is not None and gemini_fake_percent is not None: | |
| gemini_max_confidence = max(gemini_real_percent, gemini_fake_percent) | |
| print(f"📊 Gemini confidence extracted: REAL={gemini_real_percent:.1%}, FAKE={gemini_fake_percent:.1%}, MAX={gemini_max_confidence:.1%}") | |
| else: | |
| gemini_max_confidence = 0 | |
| print("⚠️ Could not extract Gemini confidence percentages for RAG") | |
| if gemini_analysis: | |
| print(f" Analysis preview: {gemini_analysis[:300]}...") | |
| # Step 8: Enhanced RAG System - Save to Google Drive if confidence is high enough | |
| # (Old knowledge base system disabled to avoid duplicates - Enhanced RAG is better) | |
| print(f"\n🔍 RAG SAVE DEBUG:") | |
| print(f" ENABLE_ENHANCED_RAG: {ENABLE_ENHANCED_RAG}") | |
| print(f" gemini_max_confidence: {gemini_max_confidence} ({gemini_max_confidence:.1%} if > 0)") | |
| print(f" RAG_CONFIDENCE_THRESHOLD: {RAG_CONFIDENCE_THRESHOLD}") | |
| print(f" Condition check: gemini_max_confidence >= RAG_CONFIDENCE_THRESHOLD: {gemini_max_confidence >= RAG_CONFIDENCE_THRESHOLD if gemini_max_confidence else False}") | |
| print(f" Should save? {ENABLE_ENHANCED_RAG and gemini_max_confidence and gemini_max_confidence >= RAG_CONFIDENCE_THRESHOLD}\n") | |
| if ENABLE_ENHANCED_RAG and gemini_max_confidence and gemini_max_confidence >= RAG_CONFIDENCE_THRESHOLD: | |
| try: | |
| from rag_news_manager import add_news_to_rag, initialize_rag_system, rag_manager | |
| # Try to initialize if not already initialized; skip the save entirely if initialization fails | |
| rag_ready = bool(rag_manager.service and rag_manager.rag_file_id) | |
| if not rag_ready: | |
| print("🔄 RAG system not initialized - attempting to initialize now...") | |
| rag_ready = initialize_rag_system() | |
| if rag_ready: | |
| print("✅ RAG system initialized successfully!") | |
| else: | |
| print("❌ Failed to initialize RAG system - cannot save") | |
| print(" 🔧 Please run: python setup_google_drive_rag.py") | |
| print(" 📝 Or check Google Drive authentication") | |
| if rag_ready: | |
| print(f"🚀 High confidence detected ({gemini_max_confidence:.1%}) - saving to Enhanced RAG system...") | |
| final_prediction = "REAL" if gemini_real_percent > gemini_fake_percent else "FAKE" | |
| rag_success = add_news_to_rag( | |
| news_text=news_text, | |
| gemini_analysis=gemini_analysis, | |
| gemini_confidence=gemini_max_confidence, | |
| prediction=final_prediction, | |
| search_results=search_results, | |
| distilbert_confidence=distilbert_confidence | |
| ) | |
| if rag_success: | |
| print("✅ Successfully saved to Enhanced RAG system (Google Drive)!") | |
| else: | |
| print("⚠️ Failed to save to Enhanced RAG system") | |
| print(" Possible reasons:") | |
| print(" - Entry already exists (duplicate)") | |
| print(" - Google Drive authentication expired (run: python setup_google_drive_rag.py)") | |
| print(" - File permission issue") | |
| except ImportError as e: | |
| print(f"❌ Cannot import RAG system: {e}") | |
| print(" Make sure rag_news_manager.py is in the same directory") | |
| except Exception as e: | |
| print(f"❌ Enhanced RAG system error: {e}") | |
| error_msg = str(e).lower() | |
| if "invalid_grant" in error_msg or "bad request" in error_msg: | |
| print(" 🔑 Google Drive token expired!") | |
| print(" 🔧 Run: python setup_google_drive_rag.py to re-authenticate") | |
| import traceback | |
| print(f" Traceback: {traceback.format_exc()}") | |
| else: | |
| if not ENABLE_ENHANCED_RAG: | |
| print("⚠️ Enhanced RAG system is disabled (ENABLE_ENHANCED_RAG = False)") | |
| elif not gemini_max_confidence: | |
| print(f"⚠️ Cannot save: gemini_max_confidence is {gemini_max_confidence} (needs to be extracted)") | |
| elif gemini_max_confidence < RAG_CONFIDENCE_THRESHOLD: | |
| print(f"⚠️ Confidence {gemini_max_confidence:.1%} below threshold {RAG_CONFIDENCE_THRESHOLD:.1%}") | |
| # Build the detailed report with better formatting | |
| # Use the actual confidence scores to determine the final classification | |
| final_prediction = "REAL" if real_confidence > fake_confidence else "FAKE" if fake_confidence > real_confidence else "UNCERTAIN" | |
| prediction_emoji = "✅" if final_prediction == "REAL" else "❌" if final_prediction == "FAKE" else "❓" | |
| # Use the higher confidence for display | |
| max_confidence = max(real_confidence, fake_confidence) | |
| confidence_level = "Cao" if max_confidence > 0.7 else "Trung bình" if max_confidence > 0.4 else "Thấp" | |
| confidence_emoji = "🟢" if max_confidence > 0.7 else "🟡" if max_confidence > 0.4 else "🔴" | |
| # Convert technical metrics to user-friendly Vietnamese | |
| source_quality = "Tốt" if source_credibility > 0.7 else "Trung bình" if source_credibility > 0.4 else "Kém" | |
| source_count_text = f"{len(search_results)} nguồn tin" if len(search_results) > 0 else "Không tìm thấy nguồn" | |
| # Create source list display - SHOW ALL SOURCES | |
| sources_display = "" | |
| if found_sources: | |
| sources_display = "<br>".join([f"• {source}" for source in found_sources]) # Show ALL sources | |
| elif len(search_results) == 0: | |
| sources_display = "⚠️ Google Search không khả dụng do hết quota" | |
| # Show credible sources found | |
| credible_display = "" | |
| if credible_sources_found: | |
| credible_display = f"<br><strong>Nguồn uy tín:</strong><br>" + "<br>".join([f"✅ {source}" for source in credible_sources_found]) | |
| # Simplify credibility text | |
| if search_error: | |
| if search_error['type'] == 'QUOTA_EXCEEDED': | |
| credibility_summary = f"⚠️ Google Search hết quota - chỉ dùng phân tích nội dung" | |
| source_count_text = "Không có (API hết quota)" | |
| elif search_error['type'] == 'API_KEY_INVALID': | |
| credibility_summary = f"❌ Google Search API key không hợp lệ" | |
| source_count_text = "Không có (API key lỗi)" | |
| else: | |
| credibility_summary = f"⚠️ Google Search lỗi - chỉ dùng phân tích nội dung" | |
| source_count_text = "Không có (lỗi API)" | |
| elif "High credibility" in credibility_text: | |
| credibility_summary = f"✅ Nguồn tin đáng tin cậy" | |
| elif "Medium credibility" in credibility_text: | |
| credibility_summary = f"⚠️ Nguồn tin trung bình" | |
| else: | |
| credibility_summary = f"❌ Nguồn tin kém tin cậy" | |
| # Simplify support text | |
| if "strongly support" in support_text.lower(): | |
| support_summary = "✅ Các nguồn ủng hộ tin tức này" | |
| elif "contradict" in support_text.lower(): | |
| support_summary = "❌ Các nguồn phản bác tin tức này" | |
| else: | |
| support_summary = "⚠️ Các nguồn có ý kiến trái chiều" | |
| detailed_analysis = f""" | |
| <div style="font-family: 'Segoe UI', Arial, sans-serif; line-height: 1.6; color: #333;"> | |
| ## 🔍 **KẾT QUẢ PHÂN TÍCH TIN TỨC** | |
| <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 20px; border-radius: 10px; margin: 20px 0; text-align: center;"> | |
| <h2 style="margin: 0; font-size: 24px;">{prediction_emoji} {'TIN THẬT' if final_prediction == 'REAL' else 'TIN GIẢ' if final_prediction == 'FAKE' else 'KHÔNG XÁC ĐỊNH'}</h2> | |
| <p style="margin: 10px 0 0 0; font-size: 18px; opacity: 0.9;">{confidence_emoji} Độ tin cậy: {confidence_level} ({max_confidence:.0%})</p> | |
| </div> | |
| ### 🤖 **Phân tích bằng AI** | |
| <div style="background: #f8f9fa; padding: 15px; border-radius: 8px; border-left: 4px solid #007bff; margin: 10px 0;"> | |
| <p><strong>Kết quả:</strong> {prediction_emoji} {'Tin tức này có vẻ THẬT' if final_prediction == 'REAL' else 'Tin tức này có vẻ GIẢ' if final_prediction == 'FAKE' else 'Không thể xác định'}</p> | |
| <p><strong>Độ chắc chắn:</strong> {f"{distilbert_confidence:.0%}" if distilbert_confidence else 'Không có'} - {'Rất cao' if distilbert_confidence and distilbert_confidence > 0.8 else 'Cao' if distilbert_confidence and distilbert_confidence > 0.6 else 'Trung bình' if distilbert_confidence and distilbert_confidence > 0.4 else 'Thấp'}</p> | |
| </div> | |
| ### 🌐 **Kiểm tra nguồn tin** | |
| <div style="background: #f8f9fa; padding: 15px; border-radius: 8px; border-left: 4px solid #28a745; margin: 10px 0;"> | |
| <p><strong>Tìm thấy:</strong> {source_count_text}</p> | |
| <p><strong>Chất lượng nguồn:</strong> {source_quality} ({source_credibility:.0%})</p> | |
| <p><strong>Đánh giá:</strong> {credibility_summary}</p> | |
| <p><strong>Hỗ trợ:</strong> {support_summary}</p> | |
| {f'<p><strong>Nguồn tìm thấy:</strong><br>{sources_display}</p>' if sources_display else ''} | |
| {credible_display} | |
| </div> | |
| {f''' | |
| ### ⚠️ **Cảnh báo Google Search** | |
| <div style="background: #fff3cd; padding: 15px; border-radius: 8px; border-left: 4px solid #ffc107; margin: 10px 0;"> | |
| <p><strong>Lỗi:</strong> {search_error["message"]}</p> | |
| <p><strong>Chi tiết:</strong> {search_error["details"]}</p> | |
| <p><strong>Ảnh hưởng:</strong> Hệ thống đang sử dụng phân tích nội dung thay vì tìm kiếm Google. Kết quả có thể kém chính xác hơn.</p> | |
| </div> | |
| ''' if search_error else ''} | |
| ### 🧠 **Phân tích thông minh** | |
| <div style="background: #f8f9fa; padding: 20px; border-radius: 10px; border-left: 4px solid #ffc107; margin: 15px 0; font-family: 'Segoe UI', Arial, sans-serif; line-height: 1.6;"> | |
| <div style="white-space: pre-line; color: #333;"> | |
| {gemini_analysis} | |
| </div> | |
| </div> | |
| ### 📊 **KẾT LUẬN CUỐI CÙNG** | |
| <div style="background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%); color: white; padding: 15px; border-radius: 8px; margin: 20px 0;"> | |
| <p style="margin: 0; font-size: 16px;"><strong>Tin tức này có khả năng {'THẬT' if final_prediction == 'REAL' else 'GIẢ' if final_prediction == 'FAKE' else 'KHÔNG XÁC ĐỊNH'} với độ tin cậy {max(real_confidence, fake_confidence):.0%}</strong></p> | |
| <p style="margin: 5px 0 0 0; font-size: 14px; opacity: 0.9;">Dựa trên phân tích AI, kiểm tra nguồn tin và đánh giá thông minh</p> | |
| </div> | |
| </div> | |
| """ | |
| return gr.update(value=detailed_analysis, visible=True), f"**Độ chắc chắn là tin thật:** {real_confidence:.1%}", f"**Độ chắc chắn là tin giả:** {fake_confidence:.1%}", gr.update(visible=True), gr.update(visible=False) | |
| except Exception as e: | |
| error_message = f""" | |
| <div style="font-family: 'Segoe UI', Arial, sans-serif; line-height: 1.6; color: #333;"> | |
| ## ❌ **LỖI PHÂN TÍCH** | |
| <div style="background: linear-gradient(135deg, #ff6b6b 0%, #ee5a24 100%); color: white; padding: 20px; border-radius: 10px; margin: 20px 0; text-align: center;"> | |
| <h2 style="margin: 0; font-size: 24px;">⚠️ Có lỗi xảy ra</h2> | |
| <p style="margin: 10px 0 0 0; font-size: 16px; opacity: 0.9;">Vui lòng thử lại sau</p> | |
| </div> | |
| <div style="background: #f8f9fa; padding: 15px; border-radius: 8px; border-left: 4px solid #dc3545; margin: 10px 0;"> | |
| <p><strong>Chi tiết lỗi:</strong> {str(e)}</p> | |
| <p><strong>Gợi ý:</strong> Kiểm tra kết nối internet và thử lại</p> | |
| </div> | |
| </div> | |
| """ | |
| print(f"Analysis error: {e}") | |
| return gr.update(value=error_message, visible=True), "**Độ chắc chắn là tin thật:** 0%", "**Độ chắc chắn là tin giả:** 0%", gr.update(visible=True), gr.update(visible=False) | |
| # --- GRADIO INTERFACE --- | |
| def create_interface(): | |
| with gr.Blocks(title="Vietnamese Fake News Detection System", theme=gr.themes.Soft()) as interface: | |
| gr.Markdown(""" | |
| <div style="text-align: center; padding: 20px;"> | |
| <h1 style="color: #2c3e50; margin-bottom: 10px;">🔍 Vietnamese Fake News Detection System</h1> | |
| <p style="color: #7f8c8d; font-size: 16px; margin-bottom: 30px;">Powered by Google Search + Gemini AI + DistilBERT</p> | |
| <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 20px; border-radius: 15px; margin: 20px 0;"> | |
| <h3 style="margin: 0 0 15px 0;">🛡️ Hệ thống phát hiện tin giả tiếng Việt</h3> | |
| <div style="display: flex; justify-content: space-around; flex-wrap: wrap; gap: 15px;"> | |
| <div style="text-align: center;"> | |
| <div style="font-size: 24px; margin-bottom: 5px;">🌐</div> | |
| <strong>Google Search</strong><br> | |
| <small>Tìm kiếm thông tin thực tế</small> | |
| </div> | |
| <div style="text-align: center;"> | |
| <div style="font-size: 24px; margin-bottom: 5px;">🧠</div> | |
| <strong>Gemini AI</strong><br> | |
| <small>Phân tích thông minh</small> | |
| </div> | |
| <div style="text-align: center;"> | |
| <div style="font-size: 24px; margin-bottom: 5px;">🤖</div> | |
| <strong>DistilBERT</strong><br> | |
| <small>AI chuyên tiếng Việt</small> | |
| </div> | |
| </div> | |
| </div> | |
| <div style="background: #f8f9fa; padding: 15px; border-radius: 10px; border-left: 4px solid #17a2b8; margin: 20px 0;"> | |
| <p style="margin: 0; color: #495057;"><strong>💡 Lưu ý:</strong> Kết quả có thể thay đổi nhẹ giữa các lần phân tích do tính chất AI của Gemini, nhưng độ chính xác tổng thể vẫn được đảm bảo.</p> | |
| </div> | |
| <div style="background: linear-gradient(135deg, #a8edea 0%, #fed6e3 100%); padding: 15px; border-radius: 10px; margin: 20px 0;"> | |
| <h4 style="margin: 0 0 10px 0; color: #333;">🧠 Hệ thống RAG với Cơ sở Tri thức Tự động</h4> | |
| <p style="margin: 0; color: #555; font-size: 14px;">Khi độ tin cậy > 95%, hệ thống sẽ tự động lưu kết quả vào cơ sở tri thức để sử dụng cho các phân tích tương lai.</p> | |
| </div> | |
| </div> | |
| """) | |
| with gr.Row(): | |
| with gr.Column(scale=2): | |
| gr.Markdown("### 📝 Nhập tin tức cần kiểm tra") | |
| news_input = gr.Textbox( | |
| placeholder="Nhập tin tức tiếng Việt cần kiểm tra...", | |
| lines=4, | |
| show_label=False | |
| ) | |
| analyze_btn = gr.Button("🔍 Phân tích với AI nâng cao", variant="primary", size="lg") | |
| with gr.Column(scale=1, visible=False) as results_column: | |
| gr.Markdown("### 📊 Kết quả phân tích") | |
| real_confidence = gr.Markdown("**Độ chắc chắn là tin thật:** 0%") | |
| fake_confidence = gr.Markdown("**Độ chắc chắn là tin giả:** 0%") | |
| detailed_analysis = gr.Markdown("### 📋 Phân tích chi tiết sẽ hiển thị ở đây...", visible=False) | |
| # Loading indicator | |
| loading_status = gr.Markdown("", visible=False) | |
| # Event handlers | |
| analyze_btn.click( | |
| fn=lambda: gr.update(visible=True, value="🔄 **Đang phân tích...** Vui lòng chờ trong giây lát..."), | |
| inputs=[], | |
| outputs=[loading_status] | |
| ).then( | |
| fn=analyze_news, | |
| inputs=[news_input], | |
| outputs=[detailed_analysis, real_confidence, fake_confidence, results_column, loading_status] | |
| ) | |
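| # Two-stage click chain: the first callback only reveals the loading banner, | |
| # then .then() runs analyze_news and its outputs replace the placeholders | |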
| return interface | |
| def test_google_search(): | |
| """Test Google Search API functionality""" | |
| print("Testing Google Search API...") | |
| print("=" * 50) | |
| # Test queries | |
| test_queries = [ | |
| "Argentina World Cup 2022", | |
| "Vietnam COVID-19 news", | |
| "Tin tức Việt Nam" | |
| ] | |
| results_found = 0 | |
| for i, query in enumerate(test_queries, 1): | |
| print(f"\nTest {i}: '{query}'") | |
| print("-" * 30) | |
| try: | |
| response = google_search(query) | |
| # google_search may return a dict carrying error info (see analyze_news) - unwrap it | |
| results = response.get('results', []) if isinstance(response, dict) else response | |
| print(f"Results: {len(results)} found") | |
| if results: | |
| results_found += 1 | |
| print(f"First result: {results[0]['title'][:50]}...") | |
| print(f" Link: {results[0]['link']}") | |
| else: | |
| print("No results found") | |
| except Exception as e: | |
| print(f"Error: {e}") | |
| print(f"\nTest Summary: {results_found}/{len(test_queries)} tests passed") | |
| if results_found == 0: | |
| print("\nGoogle Search is not working!") | |
| print("Possible solutions:") | |
| print(" 1. Check API quota in Google Cloud Console") | |
| print(" 2. Verify API keys are correct") | |
| print(" 3. Ensure Custom Search API is enabled") | |
| print(" 4. Check Search Engine ID is valid") | |
| elif results_found < len(test_queries): | |
| print("\nGoogle Search partially working") | |
| print("Some queries work, others don't - check query formatting") | |
| else: | |
| print("\nGoogle Search is working perfectly!") | |
| return results_found > 0 | |
| def test_gemini_override(): | |
| """Test the Gemini override functionality""" | |
| print("Testing Gemini Override Logic") | |
| print("=" * 50) | |
| # Test cases with different Gemini confidence levels | |
| test_cases = [ | |
| { | |
| "name": "High Real Confidence", | |
| "gemini_analysis": "1. KẾT LUẬN: THẬT\n2. ĐỘ TIN CẬY: THẬT: 98% / GIẢ: 2%", | |
| "expected_override": True, | |
| "expected_direction": "REAL" | |
| }, | |
| { | |
| "name": "High Fake Confidence", | |
| "gemini_analysis": "1. KẾT LUẬN: GIẢ\n2. ĐỘ TIN CẬY: THẬT: 3% / GIẢ: 97%", | |
| "expected_override": True, | |
| "expected_direction": "FAKE" | |
| }, | |
| { | |
| "name": "Low Confidence", | |
| "gemini_analysis": "1. KẾT LUẬN: KHÔNG XÁC ĐỊNH\n2. ĐỘ TIN CẬY: THẬT: 60% / GIẢ: 40%", | |
| "expected_override": False, | |
| "expected_direction": "UNKNOWN" | |
| } | |
| ] | |
| for i, test_case in enumerate(test_cases, 1): | |
| print(f"\nTest Case {i}: {test_case['name']}") | |
| print("-" * 40) | |
| print(f"Gemini Analysis: {test_case['gemini_analysis']}") | |
| try: | |
| # Test the confidence calculation directly | |
| gemini_real_percent, gemini_fake_percent = extract_gemini_percentage(test_case['gemini_analysis']) | |
| if gemini_real_percent is not None and gemini_fake_percent is not None: | |
| if gemini_real_percent > gemini_fake_percent: | |
| gemini_score = gemini_real_percent | |
| gemini_direction = "REAL" | |
| else: | |
| gemini_score = gemini_fake_percent | |
| gemini_direction = "FAKE" | |
| print(f"Extracted: Real={gemini_real_percent:.1%}, Fake={gemini_fake_percent:.1%}") | |
| print(f"Direction: {gemini_direction}") | |
| print(f"Score: {gemini_score:.1%}") | |
| print(f"Override triggered: {gemini_score >= 0.95}") | |
| if gemini_score >= 0.95: | |
| print(f"✅ OVERRIDE: Using {gemini_score:.1%} as final score") | |
| else: | |
| print(f"❌ NO OVERRIDE: Score {gemini_score:.1%} < 95%") | |
| else: | |
| print("❌ Could not extract percentages") | |
| except Exception as e: | |
| print(f"❌ Test failed: {e}") | |
| def test_complete_system(): | |
| """Test the complete fake news detection system""" | |
| print("Testing Complete Vietnamese Fake News Detection System") | |
| print("=" * 60) | |
| # Test cases | |
| test_cases = [ | |
| "Argentina vô địch World Cup 2022", | |
| "Hôm nay trời mưa ở Hà Nội", | |
| "COVID-19 đã được chữa khỏi hoàn toàn" | |
| ] | |
| for i, test_text in enumerate(test_cases, 1): | |
| print(f"\nTest Case {i}: '{test_text}'") | |
| print("-" * 40) | |
| try: | |
| result = analyze_news(test_text) | |
| print("Analysis completed successfully") | |
| print(f"Result type: {type(result)}") | |
| except Exception as e: | |
| print(f"Analysis failed: {e}") | |
| # --- LAUNCH APP --- | |
| if __name__ == "__main__": | |
| print("Starting Vietnamese Fake News Detection System...") | |
| print("Tools integrated: Google Search + Gemini AI + DistilBERT") | |
| # Uncomment the lines below to run tests first | |
| # test_google_search() | |
| # test_gemini_override() # Uncomment to test Gemini override logic | |
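| # test_complete_system() # Uncomment to run end-to-end analysis tests | |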
| interface = create_interface() | |
| interface.launch( | |
| server_name="0.0.0.0", | |
| server_port=7860, # Standard port for Hugging Face Spaces | |
| share=True, # Ignored on Hugging Face Spaces; creates a public link when run locally | |
| show_error=True | |
| ) |