Spaces:
Running
Running
Upload app.py
Browse files
app.py
CHANGED
|
@@ -11,6 +11,13 @@ import json
|
|
| 11 |
import sqlite3
|
| 12 |
from datetime import datetime
|
| 13 |
import hashlib
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
# SECURITY: the previous revision committed a literal Google API key here.
# A leaked key must be rotated; never hard-code credentials in source.
# Read both values from the environment (Hugging Face Spaces secrets).
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY", "")
SEARCH_ENGINE_ID = os.getenv("SEARCH_ENGINE_ID", "")
|
|
@@ -21,8 +28,23 @@ genai.configure(api_key=GEMINI_API_KEY)
|
|
| 21 |
|
| 22 |
# Knowledge Base Configuration
|
| 23 |
KNOWLEDGE_BASE_DB = "knowledge_base.db"
|
| 24 |
-
CONFIDENCE_THRESHOLD = 0.95 # 95% threshold for
|
| 25 |
-
ENABLE_KNOWLEDGE_BASE_SEARCH =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
print("Loading the DistilBERT model we trained...")
|
| 28 |
try:
|
|
@@ -53,6 +75,141 @@ except Exception as e:
|
|
| 53 |
tokenizer = None
|
| 54 |
model = None
|
| 55 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
# --- KNOWLEDGE BASE MANAGEMENT ---
|
| 57 |
def init_knowledge_base():
|
| 58 |
"""Initialize the SQLite knowledge base"""
|
|
@@ -84,37 +241,73 @@ def add_to_knowledge_base(news_text, prediction, confidence, search_results, gem
|
|
| 84 |
# Create content hash for deduplication
|
| 85 |
content_hash = hashlib.md5(news_text.encode('utf-8')).hexdigest()
|
| 86 |
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
conn.close()
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
(content_hash, news_text, prediction, confidence, search_results, gemini_analysis)
|
| 101 |
-
VALUES (?, ?, ?, ?, ?, ?)
|
| 102 |
-
''', (
|
| 103 |
-
content_hash,
|
| 104 |
-
news_text,
|
| 105 |
-
prediction,
|
| 106 |
-
confidence,
|
| 107 |
-
json.dumps(search_results, ensure_ascii=False),
|
| 108 |
-
gemini_analysis
|
| 109 |
-
))
|
| 110 |
-
|
| 111 |
-
conn.commit()
|
| 112 |
-
conn.close()
|
| 113 |
-
|
| 114 |
-
print(f"✅ Added high-confidence result to knowledge base (confidence: {confidence:.1%})")
|
| 115 |
-
print(f" Hash: {content_hash[:8]}...")
|
| 116 |
-
print(f" Prediction: {prediction}")
|
| 117 |
-
return True
|
| 118 |
|
| 119 |
except Exception as e:
|
| 120 |
print(f"Error adding to knowledge base: {e}")
|
|
@@ -123,37 +316,71 @@ def add_to_knowledge_base(news_text, prediction, confidence, search_results, gem
|
|
| 123 |
def search_knowledge_base(query_text, limit=5):
|
| 124 |
"""Search the knowledge base for similar entries"""
|
| 125 |
try:
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
|
|
|
|
|
|
|
|
|
| 155 |
else:
|
| 156 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
|
| 158 |
except Exception as e:
|
| 159 |
print(f"Error searching knowledge base: {e}")
|
|
@@ -213,6 +440,92 @@ def get_knowledge_base_stats():
|
|
| 213 |
# Initialize knowledge base on startup
|
| 214 |
init_knowledge_base()
|
| 215 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 216 |
CREDIBLE_SOURCES = {
|
| 217 |
'vnexpress.net': 0.95,
|
| 218 |
'tuoitre.vn': 0.95,
|
|
@@ -520,10 +833,10 @@ def analyze_source_support(news_text, search_results):
|
|
| 520 |
def analyze_with_gemini(news_text, search_results, distilbert_prediction, distilbert_confidence):
|
| 521 |
"""Use Gemini AI to analyze the news and compare with our model results"""
|
| 522 |
try:
|
| 523 |
-
# Knowledge base search
|
| 524 |
if ENABLE_KNOWLEDGE_BASE_SEARCH:
|
| 525 |
print("🔍 Searching knowledge base for similar entries...")
|
| 526 |
-
knowledge_results = search_knowledge_base(news_text, limit=
|
| 527 |
knowledge_context = format_knowledge_for_rag(knowledge_results)
|
| 528 |
else:
|
| 529 |
knowledge_context = ""
|
|
@@ -993,17 +1306,19 @@ def analyze_news(news_text):
|
|
| 993 |
real_confidence = combined_confidence
|
| 994 |
fake_confidence = 1 - combined_confidence
|
| 995 |
|
| 996 |
-
# Step 7: Check if result should be added to knowledge base
|
| 997 |
-
|
| 998 |
-
|
| 999 |
-
|
| 1000 |
-
|
|
|
|
|
|
|
| 1001 |
|
| 1002 |
# Add to knowledge base
|
| 1003 |
success = add_to_knowledge_base(
|
| 1004 |
news_text=news_text,
|
| 1005 |
prediction=final_prediction,
|
| 1006 |
-
confidence=
|
| 1007 |
search_results=search_results,
|
| 1008 |
gemini_analysis=gemini_analysis
|
| 1009 |
)
|
|
|
|
| 11 |
import sqlite3
|
| 12 |
from datetime import datetime
|
| 13 |
import hashlib
|
| 14 |
+
import io
|
| 15 |
+
import os
|
| 16 |
+
from google.oauth2.credentials import Credentials
|
| 17 |
+
from google_auth_oauthlib.flow import InstalledAppFlow
|
| 18 |
+
from google.auth.transport.requests import Request
|
| 19 |
+
from googleapiclient.discovery import build
|
| 20 |
+
from googleapiclient.http import MediaIoBaseDownload, MediaIoBaseUpload
|
| 21 |
|
| 22 |
# SECURITY: the previous revision committed a literal Google API key here.
# A leaked key must be rotated; never hard-code credentials in source.
# Read both values from the environment (Hugging Face Spaces secrets).
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY", "")
SEARCH_ENGINE_ID = os.getenv("SEARCH_ENGINE_ID", "")
|
|
|
|
| 28 |
|
| 29 |
# Knowledge Base Configuration
|
| 30 |
KNOWLEDGE_BASE_DB = "knowledge_base.db"
|
| 31 |
+
# --- Knowledge base / RAG configuration ---
CONFIDENCE_THRESHOLD = 0.95  # 95% Gemini confidence threshold for RAG knowledge base
ENABLE_KNOWLEDGE_BASE_SEARCH = True  # Enable knowledge base search with training data

# --- Cloud storage configuration ---
USE_CLOUD_STORAGE = True  # Set to True to use cloud storage instead of local DB
CLOUD_STORAGE_TYPE = "google_drive"  # Options: "google_drive", "google_cloud", "local"
GOOGLE_DRIVE_FILE_ID = None  # Will be set when file is created

# Restore the Google Drive file ID persisted by a previous run, if any.
try:
    if os.path.exists('google_drive_file_id.txt'):
        with open('google_drive_file_id.txt', 'r') as f:
            GOOGLE_DRIVE_FILE_ID = f.read().strip()
        print(f"📁 Loaded Google Drive file ID: {GOOGLE_DRIVE_FILE_ID}")
except Exception as e:
    print(f"Could not load Google Drive file ID: {e}")

GOOGLE_CLOUD_BUCKET = "your-bucket-name"  # For Google Cloud Storage
|
| 48 |
|
| 49 |
print("Loading the DistilBERT model we trained...")
|
| 50 |
try:
|
|
|
|
| 75 |
tokenizer = None
|
| 76 |
model = None
|
| 77 |
|
| 78 |
+
# --- CLOUD STORAGE FUNCTIONS ---
def get_google_drive_service():
    """Build an authenticated Google Drive v3 service, or None on failure.

    Two credential paths:
      * Hugging Face Spaces (detected via the SPACE_ID env var): credentials
        are assembled from GOOGLE_CLIENT_ID / GOOGLE_CLIENT_SECRET /
        GOOGLE_REFRESH_TOKEN secrets.
      * Local development: token.json / credentials.json files, with an
        interactive OAuth flow and token refresh when needed.

    Returns:
        A googleapiclient Drive service object, or None if credentials are
        missing or setup fails (errors are printed, never raised).
    """
    try:
        SCOPES = ['https://www.googleapis.com/auth/drive.file']
        creds = None

        # Check if running on Hugging Face Spaces.
        # (fix: dropped the redundant function-local `import os`; os is
        # already imported at module level)
        is_hf_space = os.getenv('SPACE_ID') is not None

        if is_hf_space:
            # For Hugging Face Spaces, use environment variables (secrets).
            client_id = os.getenv('GOOGLE_CLIENT_ID')
            client_secret = os.getenv('GOOGLE_CLIENT_SECRET')
            refresh_token = os.getenv('GOOGLE_REFRESH_TOKEN')

            if client_id and client_secret and refresh_token:
                creds = Credentials.from_authorized_user_info({
                    'client_id': client_id,
                    'client_secret': client_secret,
                    'refresh_token': refresh_token,
                    'token_uri': 'https://oauth2.googleapis.com/token'
                }, SCOPES)
            else:
                print("⚠️ Google Drive credentials not found in Hugging Face secrets")
                return None
        else:
            # For local development, use files.
            if os.path.exists('token.json'):
                creds = Credentials.from_authorized_user_file('token.json', SCOPES)

            # If no valid credentials, request authorization.
            # NOTE(review): indentation was lost in the pasted diff — this
            # refresh/persist flow is reconstructed as local-dev-only; on
            # Spaces the refresh token is used lazily via token_uri. Confirm
            # against the original file.
            if not creds or not creds.valid:
                if creds and creds.expired and creds.refresh_token:
                    creds.refresh(Request())
                else:
                    if os.path.exists('credentials.json'):
                        flow = InstalledAppFlow.from_client_secrets_file(
                            'credentials.json', SCOPES)
                        creds = flow.run_local_server(port=0)
                    else:
                        print("⚠️ credentials.json not found for local development")
                        return None

            # Save credentials for next run
            with open('token.json', 'w') as token:
                token.write(creds.to_json())

        return build('drive', 'v3', credentials=creds)
    except Exception as e:
        print(f"Error setting up Google Drive: {e}")
        return None
|
| 131 |
+
|
| 132 |
+
def upload_to_google_drive(data, filename="knowledge_base.json"):
    """Serialize *data* to JSON and upload it to Google Drive.

    Args:
        data: Any json.dumps-serializable object (the knowledge base list).
        filename: Name given to the file on Drive.

    Returns:
        The new Drive file ID (str), or None if the service is unavailable
        or the upload fails (errors are printed, never raised).

    NOTE(review): files().create always makes a brand-new file, so repeated
    saves accumulate copies on Drive; consider files().update with the
    stored file ID — confirm intended behavior before changing.
    """
    try:
        service = get_google_drive_service()
        if not service:
            return None

        # Convert data to JSON
        json_data = json.dumps(data, ensure_ascii=False, indent=2)
        file_metadata = {
            'name': filename,
            'parents': []  # Root folder
        }

        media = MediaIoBaseUpload(
            io.BytesIO(json_data.encode('utf-8')),
            mimetype='application/json'
        )

        file = service.files().create(
            body=file_metadata,
            media_body=media,
            fields='id'
        ).execute()

        # fix: the message printed the literal "(unknown)" instead of the
        # actual filename being uploaded.
        print(f"✅ Uploaded {filename} to Google Drive (ID: {file.get('id')})")
        return file.get('id')

    except Exception as e:
        print(f"Error uploading to Google Drive: {e}")
        return None
|
| 163 |
+
|
| 164 |
+
def download_from_google_drive(file_id):
    """Fetch the knowledge-base JSON file from Google Drive.

    Args:
        file_id: Drive file ID of the JSON knowledge base.

    Returns:
        The parsed JSON payload, or an empty list when the service is
        unavailable or the download/parse fails (errors are printed).
    """
    try:
        service = get_google_drive_service()
        if not service:
            return []

        # Stream the file contents into an in-memory buffer chunk by chunk.
        media_request = service.files().get_media(fileId=file_id)
        buffer = io.BytesIO()
        downloader = MediaIoBaseDownload(buffer, media_request)

        done = False
        while not done:
            _progress, done = downloader.next_chunk()

        buffer.seek(0)
        data = json.loads(buffer.read().decode('utf-8'))

        print(f"✅ Downloaded knowledge base from Google Drive")
        return data

    except Exception as e:
        print(f"Error downloading from Google Drive: {e}")
        return []
|
| 188 |
+
|
| 189 |
+
def save_knowledge_base_cloud(data):
    """Save the knowledge base to the configured cloud backend.

    Args:
        data: json-serializable knowledge base (list of entry dicts).

    Returns:
        True if the save succeeded, False otherwise.
    """
    if CLOUD_STORAGE_TYPE == "google_drive":
        file_id = upload_to_google_drive(data)
        if file_id:
            global GOOGLE_DRIVE_FILE_ID
            GOOGLE_DRIVE_FILE_ID = file_id
            # fix: persist the file ID to disk — startup reads
            # 'google_drive_file_id.txt' to locate the knowledge base, but
            # the ID was previously kept only in this process global, so
            # every restart lost track of the Drive file.
            try:
                with open('google_drive_file_id.txt', 'w') as f:
                    f.write(file_id)
            except Exception as e:
                print(f"Could not persist Google Drive file ID: {e}")
        return file_id is not None
    elif CLOUD_STORAGE_TYPE == "google_cloud":
        # TODO: Implement Google Cloud Storage
        print("Google Cloud Storage not implemented yet")
        return False
    return False
|
| 202 |
+
|
| 203 |
+
def load_knowledge_base_cloud():
    """Load the knowledge base from the configured cloud backend.

    Returns:
        The list of knowledge-base entries, or [] when no backend/file ID
        is available (or the backend is not implemented).
    """
    if CLOUD_STORAGE_TYPE == "google_drive" and GOOGLE_DRIVE_FILE_ID:
        return download_from_google_drive(GOOGLE_DRIVE_FILE_ID)

    if CLOUD_STORAGE_TYPE == "google_cloud":
        # TODO: Implement Google Cloud Storage
        print("Google Cloud Storage not implemented yet")
        return []

    return []
|
| 212 |
+
|
| 213 |
# --- KNOWLEDGE BASE MANAGEMENT ---
|
| 214 |
def init_knowledge_base():
|
| 215 |
"""Initialize the SQLite knowledge base"""
|
|
|
|
| 241 |
# Create content hash for deduplication
|
| 242 |
content_hash = hashlib.md5(news_text.encode('utf-8')).hexdigest()
|
| 243 |
|
| 244 |
+
if USE_CLOUD_STORAGE:
|
| 245 |
+
# Add to cloud storage
|
| 246 |
+
data = load_knowledge_base_cloud()
|
| 247 |
+
|
| 248 |
+
# Check if entry already exists
|
| 249 |
+
for entry in data:
|
| 250 |
+
if entry.get('content_hash') == content_hash:
|
| 251 |
+
print(f"Entry already exists in cloud knowledge base (hash: {content_hash[:8]}...)")
|
| 252 |
+
return False
|
| 253 |
+
|
| 254 |
+
# Create new entry
|
| 255 |
+
new_entry = {
|
| 256 |
+
'content_hash': content_hash,
|
| 257 |
+
'news_text': news_text,
|
| 258 |
+
'prediction': prediction,
|
| 259 |
+
'confidence': confidence,
|
| 260 |
+
'search_results': search_results,
|
| 261 |
+
'gemini_analysis': gemini_analysis,
|
| 262 |
+
'created_at': datetime.now().isoformat(),
|
| 263 |
+
'last_accessed': datetime.now().isoformat(),
|
| 264 |
+
'access_count': 1
|
| 265 |
+
}
|
| 266 |
+
|
| 267 |
+
# Add to data and save to cloud
|
| 268 |
+
data.append(new_entry)
|
| 269 |
+
success = save_knowledge_base_cloud(data)
|
| 270 |
+
|
| 271 |
+
if success:
|
| 272 |
+
print(f"✅ Added high-confidence result to cloud knowledge base (confidence: {confidence:.1%})")
|
| 273 |
+
print(f" Hash: {content_hash[:8]}...")
|
| 274 |
+
print(f" Prediction: {prediction}")
|
| 275 |
+
return True
|
| 276 |
+
else:
|
| 277 |
+
return False
|
| 278 |
+
else:
|
| 279 |
+
# Add to local SQLite database
|
| 280 |
+
conn = sqlite3.connect(KNOWLEDGE_BASE_DB)
|
| 281 |
+
cursor = conn.cursor()
|
| 282 |
+
|
| 283 |
+
# Check if entry already exists
|
| 284 |
+
cursor.execute('SELECT id FROM knowledge_entries WHERE content_hash = ?', (content_hash,))
|
| 285 |
+
if cursor.fetchone():
|
| 286 |
+
print(f"Entry already exists in knowledge base (hash: {content_hash[:8]}...)")
|
| 287 |
+
conn.close()
|
| 288 |
+
return False
|
| 289 |
+
|
| 290 |
+
# Insert new entry
|
| 291 |
+
cursor.execute('''
|
| 292 |
+
INSERT INTO knowledge_entries
|
| 293 |
+
(content_hash, news_text, prediction, confidence, search_results, gemini_analysis)
|
| 294 |
+
VALUES (?, ?, ?, ?, ?, ?)
|
| 295 |
+
''', (
|
| 296 |
+
content_hash,
|
| 297 |
+
news_text,
|
| 298 |
+
prediction,
|
| 299 |
+
confidence,
|
| 300 |
+
json.dumps(search_results, ensure_ascii=False),
|
| 301 |
+
gemini_analysis
|
| 302 |
+
))
|
| 303 |
+
|
| 304 |
+
conn.commit()
|
| 305 |
conn.close()
|
| 306 |
+
|
| 307 |
+
print(f"✅ Added high-confidence result to knowledge base (confidence: {confidence:.1%})")
|
| 308 |
+
print(f" Hash: {content_hash[:8]}...")
|
| 309 |
+
print(f" Prediction: {prediction}")
|
| 310 |
+
return True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 311 |
|
| 312 |
except Exception as e:
|
| 313 |
print(f"Error adding to knowledge base: {e}")
|
|
|
|
| 316 |
def search_knowledge_base(query_text, limit=5):
|
| 317 |
"""Search the knowledge base for similar entries"""
|
| 318 |
try:
|
| 319 |
+
if USE_CLOUD_STORAGE:
|
| 320 |
+
# Search in cloud storage
|
| 321 |
+
data = load_knowledge_base_cloud()
|
| 322 |
+
if not data:
|
| 323 |
+
return []
|
| 324 |
+
|
| 325 |
+
# Simple text similarity search in JSON data
|
| 326 |
+
results = []
|
| 327 |
+
query_lower = query_text[:50].lower()
|
| 328 |
+
|
| 329 |
+
for entry in data:
|
| 330 |
+
if (query_lower in entry.get('news_text', '').lower() or
|
| 331 |
+
query_lower in entry.get('gemini_analysis', '').lower()):
|
| 332 |
+
results.append((
|
| 333 |
+
entry['news_text'],
|
| 334 |
+
entry['prediction'],
|
| 335 |
+
entry['confidence'],
|
| 336 |
+
entry.get('search_results', []),
|
| 337 |
+
entry.get('gemini_analysis', ''),
|
| 338 |
+
entry.get('created_at', ''),
|
| 339 |
+
entry.get('access_count', 1)
|
| 340 |
+
))
|
| 341 |
+
|
| 342 |
+
# Sort by confidence and access count
|
| 343 |
+
results.sort(key=lambda x: (x[2], x[6]), reverse=True)
|
| 344 |
+
results = results[:limit]
|
| 345 |
+
|
| 346 |
+
if results:
|
| 347 |
+
print(f"📚 Found {len(results)} similar entries in cloud knowledge base")
|
| 348 |
+
return results
|
| 349 |
+
else:
|
| 350 |
+
return []
|
| 351 |
else:
|
| 352 |
+
# Search in local SQLite database
|
| 353 |
+
conn = sqlite3.connect(KNOWLEDGE_BASE_DB)
|
| 354 |
+
cursor = conn.cursor()
|
| 355 |
+
|
| 356 |
+
# Simple text similarity search (you can enhance this with embeddings later)
|
| 357 |
+
cursor.execute('''
|
| 358 |
+
SELECT news_text, prediction, confidence, search_results, gemini_analysis,
|
| 359 |
+
created_at, access_count
|
| 360 |
+
FROM knowledge_entries
|
| 361 |
+
WHERE news_text LIKE ? OR gemini_analysis LIKE ?
|
| 362 |
+
ORDER BY confidence DESC, access_count DESC
|
| 363 |
+
LIMIT ?
|
| 364 |
+
''', (f'%{query_text[:50]}%', f'%{query_text[:50]}%', limit))
|
| 365 |
+
|
| 366 |
+
results = cursor.fetchall()
|
| 367 |
+
|
| 368 |
+
# Update access count and last_accessed
|
| 369 |
+
for result in results:
|
| 370 |
+
cursor.execute('''
|
| 371 |
+
UPDATE knowledge_entries
|
| 372 |
+
SET access_count = access_count + 1, last_accessed = CURRENT_TIMESTAMP
|
| 373 |
+
WHERE news_text = ?
|
| 374 |
+
''', (result[0],))
|
| 375 |
+
|
| 376 |
+
conn.commit()
|
| 377 |
+
conn.close()
|
| 378 |
+
|
| 379 |
+
if results:
|
| 380 |
+
print(f"📚 Found {len(results)} similar entries in knowledge base")
|
| 381 |
+
return results
|
| 382 |
+
else:
|
| 383 |
+
return []
|
| 384 |
|
| 385 |
except Exception as e:
|
| 386 |
print(f"Error searching knowledge base: {e}")
|
|
|
|
| 440 |
# Initialize knowledge base on startup
|
| 441 |
init_knowledge_base()
|
| 442 |
|
| 443 |
+
def populate_knowledge_base_from_training_data():
    """Populate the SQLite knowledge base with existing training data.

    Reads 'train_final.csv' (columns: content, label; label 0 = REAL,
    1 = FAKE), builds a synthetic Gemini-style analysis for each row, and
    inserts it with a fixed 0.95 confidence. Duplicate rows (by MD5 content
    hash) are skipped. Runs at module import, so it executes on every boot.

    Returns:
        True on success, False if loading or inserting failed.
    """
    try:
        import pandas as pd

        # Load training data
        df = pd.read_csv('train_final.csv')
        print(f"📚 Loading {len(df)} training samples into knowledge base...")

        conn = sqlite3.connect(KNOWLEDGE_BASE_DB)
        cursor = conn.cursor()

        # perf: fetch all existing hashes once instead of issuing one
        # SELECT per CSV row (this runs on every startup).
        cursor.execute('SELECT content_hash FROM knowledge_entries')
        existing_hashes = {row[0] for row in cursor.fetchall()}

        added_count = 0
        skipped_count = 0

        for index, row in df.iterrows():
            news_text = str(row['content'])
            label = int(row['label'])
            prediction = "REAL" if label == 0 else "FAKE"

            # Create content hash for deduplication
            content_hash = hashlib.md5(news_text.encode('utf-8')).hexdigest()

            # Check if entry already exists
            if content_hash in existing_hashes:
                skipped_count += 1
                continue
            existing_hashes.add(content_hash)

            # Create synthetic analysis for training data
            synthetic_analysis = f"""1. KẾT LUẬN: {prediction}

2. ĐỘ TIN CẬY: THẬT: {95 if prediction == 'REAL' else 5}% / GIẢ: {5 if prediction == 'REAL' else 95}%

3. PHÂN TÍCH CHI TIẾT:
- Nội dung: {'Tin tức được xác minh từ nguồn đào tạo' if prediction == 'REAL' else 'Tin tức giả được xác định từ nguồn đào tạo'}
- Nguồn tin: Dữ liệu huấn luyện đã được xác minh
- Ngữ cảnh: Mẫu từ bộ dữ liệu huấn luyện DistilBERT
- Ngôn ngữ: {'Ngôn ngữ khách quan, tin cậy' if prediction == 'REAL' else 'Ngôn ngữ có dấu hiệu tin giả'}
- Thời gian: Dữ liệu huấn luyện đã được kiểm chứng

4. CÁC DẤU HIỆU CẢNH BÁO: {'Không có dấu hiệu cảnh báo' if prediction == 'REAL' else 'Tin tức được xác định là giả từ nguồn đào tạo'}

5. KHUYẾN NGHỊ CHO NGƯỜI ĐỌC:
- Nguồn: Dữ liệu huấn luyện đã được xác minh
- Độ tin cậy: Cao (từ bộ dữ liệu đào tạo)
- Lưu ý: Mẫu từ tập huấn luyện DistilBERT"""

            # Insert training sample
            cursor.execute('''
                INSERT INTO knowledge_entries
                (content_hash, news_text, prediction, confidence, search_results, gemini_analysis)
                VALUES (?, ?, ?, ?, ?, ?)
            ''', (
                content_hash,
                news_text,
                prediction,
                0.95,  # High confidence for training data
                json.dumps([], ensure_ascii=False),  # Empty search results for training data
                synthetic_analysis
            ))

            added_count += 1

            # Show progress every 1000 entries
            if added_count % 1000 == 0:
                print(f" Added {added_count} entries...")

        conn.commit()
        conn.close()

        print(f"✅ Knowledge base populated successfully!")
        print(f" 📊 Added: {added_count} entries")
        print(f" ⏭️ Skipped: {skipped_count} duplicates")
        print(f" 🎯 Total entries: {added_count}")

        return True

    except Exception as e:
        print(f"❌ Error populating knowledge base: {e}")
        return False

# Populate knowledge base with training data on startup
print("🚀 Populating knowledge base with training data...")
populate_knowledge_base_from_training_data()
|
| 528 |
+
|
| 529 |
CREDIBLE_SOURCES = {
|
| 530 |
'vnexpress.net': 0.95,
|
| 531 |
'tuoitre.vn': 0.95,
|
|
|
|
| 833 |
def analyze_with_gemini(news_text, search_results, distilbert_prediction, distilbert_confidence):
|
| 834 |
"""Use Gemini AI to analyze the news and compare with our model results"""
|
| 835 |
try:
|
| 836 |
+
# Knowledge base search with training data
|
| 837 |
if ENABLE_KNOWLEDGE_BASE_SEARCH:
|
| 838 |
print("🔍 Searching knowledge base for similar entries...")
|
| 839 |
+
knowledge_results = search_knowledge_base(news_text, limit=2) # Reduced to 2 for speed
|
| 840 |
knowledge_context = format_knowledge_for_rag(knowledge_results)
|
| 841 |
else:
|
| 842 |
knowledge_context = ""
|
|
|
|
| 1306 |
real_confidence = combined_confidence
|
| 1307 |
fake_confidence = 1 - combined_confidence
|
| 1308 |
|
| 1309 |
+
# Step 7: Check if result should be added to knowledge base (using only Gemini confidence for RAG)
|
| 1310 |
+
gemini_real_confidence, gemini_fake_confidence = extract_gemini_percentage(gemini_analysis)
|
| 1311 |
+
gemini_max_confidence = max(gemini_real_confidence, gemini_fake_confidence)
|
| 1312 |
+
|
| 1313 |
+
if gemini_max_confidence > CONFIDENCE_THRESHOLD:
|
| 1314 |
+
print(f"🚀 High Gemini confidence detected ({gemini_max_confidence:.1%}) - adding to knowledge base for RAG...")
|
| 1315 |
+
final_prediction = "REAL" if gemini_real_confidence > gemini_fake_confidence else "FAKE"
|
| 1316 |
|
| 1317 |
# Add to knowledge base
|
| 1318 |
success = add_to_knowledge_base(
|
| 1319 |
news_text=news_text,
|
| 1320 |
prediction=final_prediction,
|
| 1321 |
+
confidence=gemini_max_confidence, # Use Gemini confidence for RAG storage
|
| 1322 |
search_results=search_results,
|
| 1323 |
gemini_analysis=gemini_analysis
|
| 1324 |
)
|