import requests
from bs4 import BeautifulSoup
import gradio as gr
import re
import pandas as pd
import json
from datetime import datetime
from urllib.parse import urljoin, quote_plus
import hashlib
import time
import os
import base64
import io
import numpy as np
from collections import Counter
import nltk
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForTokenClassification, pipeline
import torch
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
from wordcloud import WordCloud
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Optional advanced-analysis module; the app keeps working without it.
try:
    from advanced_analysis import (
        AdvancedTextAnalysis,
        perform_advanced_analysis,
        save_advanced_analysis_results,
        perform_advanced_analysis_wrapper,
    )
except ImportError as e:
    print(f"Warning: Advanced analysis module not available: {e}")

    # Fallback so the UI can still be built without the module
    def perform_advanced_analysis_wrapper():
        return ("❌ Modul analisis lanjutan tidak tersedia",
                None, None, None,
                pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame())

# Download NLTK data if it is not present yet
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

# ==================== KONFIGURASI GITHUB ====================
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "")  # Token from environment variables
GITHUB_REPO = "abdfajar/republika_sentiner"
GITHUB_BRANCH = "main"
SCRAPPER_RESULT_PATH = "scrapper_result"
ANALYSIS_PATH = "analisis"

# ==================== KONFIGURASI MODEL ====================
SENTIMENT_MODEL_NAME = "indolem/indobert-base-uncased"
NER_MODEL_NAME = "cahya/bert-base-indonesian-NER"

# Global caches for models and stopwords
sentiment_pipeline = None
ner_pipeline = None
indonesian_stopwords = None

# ==================== FUNGSI LOAD STOPWORDS ====================
def load_indonesian_stopwords():
    """Load Indonesian stopwords from the GitHub repo."""
    global indonesian_stopwords
    if indonesian_stopwords is not None:
        return indonesian_stopwords

    try:
        # Raw URL of stopwords-id.txt on GitHub
        stopwords_url = f"https://raw.githubusercontent.com/{GITHUB_REPO}/{GITHUB_BRANCH}/stopwords-id.txt"
        print(f"Loading stopwords from: {stopwords_url}")

        response = requests.get(stopwords_url)
        response.raise_for_status()

        # Parse, normalize and filter the word list
        stopwords_list = response.text.splitlines()
        indonesian_stopwords = set()
        for word in stopwords_list:
            word = word.strip().lower()
            if word and not word.startswith('#'):  # skip comment lines
                indonesian_stopwords.add(word)

        # Fall back to the built-in basic list if the file was empty
        if not indonesian_stopwords:
            indonesian_stopwords = get_basic_indonesian_stopwords()

        print(f"✅ Loaded {len(indonesian_stopwords)} Indonesian stopwords")
        return indonesian_stopwords
    except Exception as e:
        print(f"❌ Error loading stopwords from GitHub: {e}")
        print("🔄 Using basic Indonesian 
stopwords as fallback...") indonesian_stopwords = get_basic_indonesian_stopwords() return indonesian_stopwords def get_basic_indonesian_stopwords(): """Fallback basic Indonesian stopwords jika file tidak tersedia""" basic_stopwords = { 'yang', 'dan', 'di', 'dengan', 'ini', 'itu', 'dari', 'dalam', 'untuk', 'pada', 'ke', 'dari', 'tidak', 'akan', 'ada', 'adalah', 'atau', 'juga', 'bahwa', 'sebagai', 'dapat', 'oleh', 'karena', 'saat', 'saya', 'kamu', 'kami', 'kita', 'mereka', 'dia', 'aku', 'engkau', 'kalian', 'kami', 'kita', 'mereka', 'ini', 'itu', 'sini', 'situ', 'sana', 'mana', 'apa', 'siapa', 'bagaimana', 'mengapa', 'kapan', 'dimana', 'berapa', 'sangat', 'sekali', 'terlalu', 'amat', 'paling', 'cukup', 'agak', 'hanya', 'saja', 'lagi', 'telah', 'sudah', 'sedang', 'akan', 'belum', 'pernah', 'selalu', 'sering', 'kadang', 'jarang', 'tidak', 'bukan', 'jangan', 'jika', 'kalau', 'meski', 'walaupun', 'karena', 'sebab', 'maka', 'oleh', 'untuk', 'guna', 'demi', 'sampai', 'hingga', 'dari', 'sejak', 'selama', 'ketika', 'sambil', 'seraya', 'setelah', 'sebelum', 'sesudah', 'sementara', 'tatkala', 'dengan', 'tanpa', 'secara', 'secara', 'secara', 'sangat', 'amat', 'terlalu', 'paling', 'cukup', 'agak', 'hampir', 'nyaris', 'hanya', 'saja', 'lagi', 'pun', 'kah', 'lah', 'tah', 'pun', 'per', 'para', 'si', 'sang', 'itu', 'ini', 'sini', 'situ', 'sana', 'kamu', 'aku', 'dia', 'kami', 'kita', 'mereka', 'engkau', 'kalian', 'beliau', 'nya', 'ku', 'mu', 'kau', 'beta', 'kami', 'kita' } print(f"✅ Using {len(basic_stopwords)} basic Indonesian stopwords") return basic_stopwords # ==================== FUNGSI LOAD MODEL ==================== def load_sentiment_model(): """Load model sentiment analysis""" global sentiment_pipeline if sentiment_pipeline is None: try: print("Loading sentiment model...") tokenizer = AutoTokenizer.from_pretrained(SENTIMENT_MODEL_NAME) model = AutoModelForSequenceClassification.from_pretrained(SENTIMENT_MODEL_NAME) sentiment_pipeline = pipeline( "sentiment-analysis", model=model, tokenizer=tokenizer ) print("Sentiment model loaded successfully") except Exception as e: print(f"Error loading sentiment model: {e}") return None return sentiment_pipeline def load_ner_model(): """Load model NER""" global ner_pipeline if ner_pipeline is None: try: print("Loading NER model...") tokenizer = AutoTokenizer.from_pretrained(NER_MODEL_NAME) model = AutoModelForTokenClassification.from_pretrained(NER_MODEL_NAME) ner_pipeline = pipeline( "ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple" ) print("NER model loaded successfully") except Exception as e: print(f"Error loading NER model: {e}") return None return ner_pipeline # ==================== FUNGSI ANALISIS TEKS ==================== def preprocess_text(text): """Preprocessing teks untuk analisis""" if not text or pd.isna(text): return "" # Clean text text = re.sub(r'[^\w\s]', ' ', text) text = re.sub(r'\s+', ' ', text) text = text.strip().lower() return text # ==================== FUNGSI PREDIKSI SENTIMEN ==================== def predict_sentiment_batch(articles_data): """Prediksi sentimen untuk batch articles""" sentiment_pipeline = load_sentiment_model() if not sentiment_pipeline: return [] results = [] texts = [] article_ids = [] # Prepare data for batch processing for article_id, text in articles_data: if not text or len(text.strip()) < 10: results.append({ 'article_id': article_id, 'sentimen': 'NETRAL', 'confidence': 0.5, 'explanation': 'Teks terlalu pendek untuk dianalisis', 'model_used': SENTIMENT_MODEL_NAME }) continue truncated_text = 
text[:512]  # truncate to the model's maximum input length
            texts.append(truncated_text)
            article_ids.append(article_id)

    if not texts:
        return results

    try:
        # Batch prediction
        batch_results = sentiment_pipeline(texts)
        for i, result in enumerate(batch_results):
            # Map model labels to Indonesian sentiment names
            label_map = {
                "LABEL_0": "NEGATIF",
                "LABEL_1": "POSITIF",
                "LABEL_2": "NETRAL"
            }
            predicted_label = label_map.get(result["label"], result["label"])
            confidence = round(result["score"], 4)

            # Generate a human-readable explanation
            if confidence > 0.8:
                explanation = f"Prediksi {predicted_label} dengan keyakinan tinggi"
            elif confidence > 0.6:
                explanation = f"Prediksi {predicted_label} dengan keyakinan sedang"
            else:
                explanation = f"Prediksi {predicted_label} dengan keyakinan rendah"

            results.append({
                'article_id': article_ids[i],
                'sentimen': predicted_label,
                'confidence': confidence,
                'explanation': explanation,
                'model_used': SENTIMENT_MODEL_NAME
            })
    except Exception as e:
        print(f"Error in batch sentiment prediction: {e}")
        # Fallback on error
        for article_id in article_ids:
            results.append({
                'article_id': article_id,
                'sentimen': 'NETRAL',
                'confidence': 0.5,
                'explanation': f'Error dalam prediksi: {str(e)}',
                'model_used': SENTIMENT_MODEL_NAME
            })

    return results

# ==================== FUNGSI NAMED ENTITY RECOGNITION ====================
def extract_entities_batch(articles_data):
    """Extract named entities from a batch of articles."""
    ner_pipeline = load_ner_model()
    if not ner_pipeline:
        return []

    all_entities = []
    for article_id, text in articles_data:
        if not text or len(text.strip()) < 10:
            all_entities.append({'article_id': article_id, 'entities': []})
            continue

        try:
            # Truncate overly long text
            truncated_text = text[:512]
            entities = ner_pipeline(truncated_text)

            # Keep only high-confidence entities
            processed_entities = []
            for entity in entities:
                if entity["score"] > 0.8:
                    processed_entities.append({
                        'entity': entity["word"],
                        'entity_type': entity["entity_group"],
                        'score': entity["score"]
                    })

            all_entities.append({'article_id': article_id, 'entities': processed_entities})
        except Exception as e:
            print(f"Error extracting entities for article {article_id}: {e}")
            all_entities.append({'article_id': article_id, 'entities': []})

    return all_entities

# ==================== FUNGSI TRIGRAM ANALYSIS ====================
def extract_trigrams_batch(articles_data):
    """Extract trigrams from a batch of articles."""
    all_trigrams = []
    for article_id, text in articles_data:
        if not text:
            all_trigrams.append({'article_id': article_id, 'trigrams': []})
            continue

        try:
            # Preprocess and tokenize
            processed_text = preprocess_text(text)
            tokens = nltk.word_tokenize(processed_text)

            # Load stopwords to filter trigrams
            stopwords = load_indonesian_stopwords()

            # Generate trigrams, then count only the meaningful ones
            trigrams_list = list(ngrams(tokens, 3))
            trigram_counter = Counter()
            for trigram in trigrams_list:
                trigram_str = ' '.join(trigram)
                # Skip trigrams made up entirely of stopwords
                if all(word in stopwords for word in trigram):
                    continue
                # Skip trigrams containing words shorter than 2 characters
                if any(len(word) < 2 for word in trigram):
                    continue
                trigram_counter[trigram_str] += 1

            # Keep the top 20 trigrams per article
            trigrams_data = []
            for trigram, freq in trigram_counter.most_common(20):
                trigrams_data.append({'trigram': trigram, 'frekuensi': freq})

            all_trigrams.append({'article_id': article_id, 'trigrams': trigrams_data})
        except Exception as e:
            print(f"Error extracting trigrams for article {article_id}: {e}")
            all_trigrams.append({'article_id': article_id, 'trigrams': []})

    return all_trigrams

# ==================== FUNGSI TF-IDF ANALYSIS ====================
def calculate_tfidf_batch(articles_data):
    """Calculate TF-IDF scores for a batch of articles using Indonesian stopwords."""
    if not articles_data:
        return []

    try:
        # Prepare data
        article_ids = []
        processed_texts = []
        for article_id, text in articles_data:
            if text and len(text.strip()) > 10:
                processed_text = preprocess_text(text)
                article_ids.append(article_id)
                processed_texts.append(processed_text)

        if not processed_texts:
            return []

        # Load the Indonesian stopword list
        stopwords = load_indonesian_stopwords()

        # TF-IDF with Indonesian stopwords
        vectorizer = TfidfVectorizer(
            max_features=100,
            stop_words=list(stopwords),  # use the Indonesian stopword list
            ngram_range=(1, 2)
        )
        tfidf_matrix = vectorizer.fit_transform(processed_texts)
        feature_names = vectorizer.get_feature_names_out()

        all_tfidf_results = []
        for i, article_id in enumerate(article_ids):
            # TF-IDF scores for this document
            doc_scores = tfidf_matrix[i].toarray().flatten()

            # Pair each term with its score
            term_scores = []
            for j, score in enumerate(doc_scores):
                if score > 0:
                    term_scores.append((feature_names[j], score))

            # Sort by score (descending) and keep the top 15 terms
            term_scores.sort(key=lambda x: x[1], reverse=True)
            top_terms = term_scores[:15]

            tfidf_data = []
            for rank, (term, score) in enumerate(top_terms, 1):
                tfidf_data.append({
                    'term': term,
                    'tfidf_score': round(score, 6),
                    'rank': rank
                })

            all_tfidf_results.append({
                'article_id': article_id,
                'tfidf_terms': tfidf_data
            })

        return all_tfidf_results
    except Exception as e:
        print(f"Error calculating TF-IDF: {e}")
        return []

# ==================== FUNGSI DASHBOARD LANJUTAN ====================
def create_advanced_dashboard():
    """Build the dashboard with advanced analysis."""
    try:
        # Check whether the scraped data is available
        if not os.path.exists('scrapper_result/article_metadata.csv'):
            return "❌ Data belum tersedia. 
Silakan lakukan scraping terlebih dahulu.", None, None, None, None, None, None, None, None, None # Load data metadata_df = pd.read_csv('scrapper_result/article_metadata.csv') if metadata_df.empty: return "❌ Tidak ada data untuk dianalisis.", None, None, None, None, None, None, None, None, None # Perform advanced analysis progress = gr.Progress() result_msg, advanced_results, topic_viz, keyword_viz, concept_viz = perform_advanced_analysis( metadata_df, progress_callback=progress ) # Create basic visualizations sentiment_trend = create_sentiment_trend_chart() confidence_chart = create_confidence_chart() wordcloud_fig = create_wordcloud() top_trigrams = get_top_trigrams() ner_bubble = create_ner_bubble_chart() popular_topics = get_popular_topics() # Prepare output output = "## 📊 Dashboard Analisis Lanjutan\n\n" output += "### 🔍 Analisis Dasar\n" if sentiment_trend: output += "#### 📈 Trend Sentimen Harian\n" output += "Grafik menunjukkan jumlah artikel untuk setiap sentimen per tanggal.\n\n" if advanced_results: output += "### 🧠 Analisis Lanjutan\n" output += "#### 🎯 Topic Modeling\n" if 'topics' in advanced_results and advanced_results['topics']: topics = advanced_results['topics']['topics'] for topic in topics: output += f"- **Topik {topic['topic_id']}**: {topic['keywords']}\n" output += "\n" output += "#### 🔑 Keyword Extraction\n" if 'keywords' in advanced_results and advanced_results['keywords']: keyword_types = set([k['type'] for k in advanced_results['keywords']]) output += f"Metode: {', '.join(keyword_types)}\n\n" output += "#### 📝 Text Summarization\n" if 'summaries' in advanced_results and advanced_results['summaries']: valid_summaries = [s for s in advanced_results['summaries'] if s['compression_ratio'] > 0] avg_compression = np.mean([s['compression_ratio'] for s in valid_summaries]) if valid_summaries else 0 output += f"Rata-rata kompresi: {avg_compression:.1%}\n\n" output += "#### 💡 Concept Extraction\n" if 'concepts' in advanced_results and advanced_results['concepts']: top_concepts = sorted(advanced_results['concepts'], key=lambda x: x['frequency'], reverse=True)[:5] output += "Konsep teratas: " + ", ".join([c['concept'] for c in top_concepts]) + "\n\n" return (output, sentiment_trend, confidence_chart, wordcloud_fig, top_trigrams, ner_bubble, popular_topics, topic_viz, keyword_viz, concept_viz) except Exception as e: return f"❌ Error creating advanced dashboard: {str(e)}", None, None, None, None, None, None, None, None, None def get_advanced_analysis_data(): """Dapatkan data untuk tab analisis lanjutan""" try: data = {} # Load topic modeling results if os.path.exists('analisis/topic_modeling_results.csv'): data['topics'] = pd.read_csv('analisis/topic_modeling_results.csv') # Load keyword extraction results if os.path.exists('analisis/keyword_extraction_results.csv'): data['keywords'] = pd.read_csv('analisis/keyword_extraction_results.csv') # Load text summarization results if os.path.exists('analisis/text_summarization_results.csv'): data['summaries'] = pd.read_csv('analisis/text_summarization_results.csv') # Load concept extraction results if os.path.exists('analisis/concept_extraction_results.csv'): data['concepts'] = pd.read_csv('analisis/concept_extraction_results.csv') return data except Exception as e: print(f"Error loading advanced analysis data: {e}") return {} # ==================== FUNGSI ANALISIS SENTIMEN UTAMA ==================== def analyze_sentiment_comprehensive(start_date, end_date, progress=gr.Progress()): """Fungsi utama untuk analisis sentimen yang 
komprehensif""" # Load data yang difilter keyword_df, results_df, metadata_df = load_and_filter_data(start_date, end_date) if metadata_df.empty: return "Tidak ada data untuk dianalisis", None, None, None, None # Prepare articles data articles_data = [] for idx, row in metadata_df.iterrows(): article_id = row.get('article_id', '') title = row.get('judul', '') content = row.get('konten', '') # Gabungkan judul dan konten untuk analisis combined_text = f"{title}. {content}" if title else content articles_data.append((article_id, combined_text)) total_articles = len(articles_data) results = { 'sentiment': [], 'ner': [], 'trigrams': [], 'tfidf': [] } # Pre-load stopwords di awal progress(0.1, desc="Memuat stopwords Indonesia...") load_indonesian_stopwords() # 1. PREDIKSI SENTIMEN progress(0.2, desc="Memuat model sentiment...") load_sentiment_model() progress(0.3, desc="Melakukan prediksi sentimen...") sentiment_results = predict_sentiment_batch(articles_data) results['sentiment'] = sentiment_results # 2. NAMED ENTITY RECOGNITION (NER) progress(0.4, desc="Memuat model NER...") load_ner_model() progress(0.5, desc="Melakukan ekstraksi entitas...") ner_results = extract_entities_batch(articles_data) results['ner'] = ner_results # 3. TRIGRAM ANALYSIS progress(0.7, desc="Melakukan analisis trigram...") trigram_results = extract_trigrams_batch(articles_data) results['trigrams'] = trigram_results # 4. TF-IDF ANALYSIS progress(0.9, desc="Menghitung TF-IDF dengan stopwords...") tfidf_results = calculate_tfidf_batch(articles_data) results['tfidf'] = tfidf_results # Format results untuk output df_sentiment = format_sentiment_results(sentiment_results) df_ner = format_ner_results(ner_results) df_trigram = format_trigram_results(trigram_results) df_tfidf = format_tfidf_results(tfidf_results) # Simpan hasil analisis ke folder analisis save_analysis_results(df_sentiment, df_ner, df_trigram, df_tfidf) progress(1.0, desc="Analisis selesai!") # Buat summary summary = create_analysis_summary(results, total_articles) return summary, df_sentiment, df_ner, df_trigram, df_tfidf def format_sentiment_results(sentiment_results): """Format sentiment results untuk DataFrame""" data = [] for result in sentiment_results: data.append({ 'article_id': result['article_id'], 'sentimen': result['sentimen'], 'confidence': result['confidence'], 'explanation': result['explanation'], 'model_used': result['model_used'] }) return pd.DataFrame(data) def format_ner_results(ner_results): """Format NER results untuk DataFrame""" data = [] for result in ner_results: article_id = result['article_id'] for entity in result['entities']: data.append({ 'article_id': article_id, 'entity': entity['entity'], 'frekuensi': 1, # Each entity counted once per article 'entity_type': entity['entity_type'] }) return pd.DataFrame(data) def format_trigram_results(trigram_results): """Format trigram results untuk DataFrame""" data = [] for result in trigram_results: article_id = result['article_id'] for rank, trigram_data in enumerate(result['trigrams'], 1): data.append({ 'article_id': article_id, 'trigram': trigram_data['trigram'], 'frekuensi': trigram_data['frekuensi'], 'rank': rank }) return pd.DataFrame(data) def format_tfidf_results(tfidf_results): """Format TF-IDF results untuk DataFrame""" data = [] for result in tfidf_results: article_id = result['article_id'] for term_data in result['tfidf_terms']: data.append({ 'article_id': article_id, 'term': term_data['term'], 'tfidf_score': term_data['tfidf_score'], 'rank': term_data['rank'] }) return 
pd.DataFrame(data) def create_analysis_summary(results, total_articles): """Buat summary analisis""" sentiment_counts = Counter([r['sentimen'] for r in results['sentiment']]) total_entities = sum(len(r['entities']) for r in results['ner']) total_trigrams = sum(len(r['trigrams']) for r in results['trigrams']) total_tfidf_terms = sum(len(r['tfidf_terms']) for r in results['tfidf']) # Load stopwords info stopwords_count = len(load_indonesian_stopwords()) summary = f""" **📊 SUMMARY ANALISIS SENTIMEN** **Statistik Umum:** - Total Artikel: {total_articles} - Total Entitas: {total_entities} - Total Trigram: {total_trigrams} - Total Terms TF-IDF: {total_tfidf_terms} - Stopwords digunakan: {stopwords_count} kata **Distribusi Sentimen:** - Positif: {sentiment_counts['POSITIF']} ({sentiment_counts['POSITIF']/total_articles*100:.1f}%) - Negatif: {sentiment_counts['NEGATIF']} ({sentiment_counts['NEGATIF']/total_articles*100:.1f}%) - Netral: {sentiment_counts['NETRAL']} ({sentiment_counts['NETRAL']/total_articles*100:.1f}%) **Model yang Digunakan:** - Sentiment Analysis: {SENTIMENT_MODEL_NAME} - Named Entity Recognition: {NER_MODEL_NAME} - TF-IDF & Trigram: Custom implementation dengan stopwords Indonesia **Konfigurasi TF-IDF:** - Stopwords: Indonesia ({stopwords_count} kata) - N-gram range: (1, 2) - Max features: 100 **Hasil disimpan dalam folder 'analisis':** - sentiment_prediction.csv - ner_results.csv - trigram_results.csv - tfidf_results.csv """ return summary def save_analysis_results(df_sentiment, df_ner, df_trigram, df_tfidf): """Simpan hasil analisis ke folder analisis""" try: # Buat folder analisis jika belum ada os.makedirs('analisis', exist_ok=True) # Simpan masing-masing file df_sentiment.to_csv('analisis/sentiment_prediction.csv', index=False) df_ner.to_csv('analisis/ner_results.csv', index=False) df_trigram.to_csv('analisis/trigram_results.csv', index=False) df_tfidf.to_csv('analisis/tfidf_results.csv', index=False) print("✅ Hasil analisis berhasil disimpan ke folder 'analisis'") except Exception as e: print(f"❌ Error menyimpan hasil analisis: {e}") # ==================== FUNGSI SCRAPING ARTIKEL ==================== def clean_text(text): """Membersihkan teks dari karakter tidak diinginkan""" if not text: return "" text = re.sub(r'\s+', ' ', text) text = re.sub(r'[^\w\s.,!?;:()\-]', '', text) return text.strip() def extract_republika_article(url): """ Fungsi utama untuk scraping artikel Republika.co.id """ try: headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', } print(f"🔍 Mengakses URL: {url}") response = requests.get(url, headers=headers, timeout=15) response.raise_for_status() soup = BeautifulSoup(response.content, 'html.parser') # Generate article_id dari URL article_id = hashlib.md5(url.encode()).hexdigest()[:16] metadata = { 'article_id': article_id, 'judul': '', 'waktu_terbit': '', 'editor': '', 'konten': '', 'url': url, 'panjang_konten': 0 } main_content = soup.find('div', class_='main-content__left') if not main_content: return None, "Struktur halaman tidak dikenali. 
Tidak ditemukan div.main-content__left" # Extract title title_div = main_content.find('div', class_='max-card__title') if title_div: title_h1 = title_div.find('h1') metadata['judul'] = clean_text(title_h1.get_text()) if title_h1 else "Judul tidak ditemukan" else: title_h1 = main_content.find('h1') metadata['judul'] = clean_text(title_h1.get_text()) if title_h1 else "Judul tidak ditemukan" # Extract content article_content = main_content.find('div', class_='article-content') if article_content: konten_artikel = article_content.get_text(separator='\n', strip=True) else: konten_artikel = main_content.get_text(separator='\n', strip=True) metadata['konten'] = clean_text(konten_artikel) metadata['panjang_konten'] = len(metadata['konten']) return metadata, None except Exception as e: return None, f"Error: {str(e)}" # ==================== FUNGSI SCRAPING PENCARIAN ==================== def generate_search_id(keyword, startdate, enddate): """Generate unique search_id based on inputs (without page)""" input_str = f"{keyword}_{startdate}_{enddate}" return hashlib.md5(input_str.encode()).hexdigest()[:16] def scrape_republika_search(keyword, startdate, enddate, progress=gr.Progress()): """ Scrape all pages from Republika.co.id search until no more results """ all_results = [] page = 1 max_pages = 50 # Safety limit to prevent infinite loop status_msgs = [] while page <= max_pages: try: progress((page-1)/max_pages, desc=f"Scraping halaman {page}") q = quote_plus(keyword) url = f"https://republika.co.id/search/v3/all/{page}/?q={q}&latest_date=custom&startdate={startdate}&enddate={enddate}" headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', } print(f"🔍 Scraping page {page}: {url}") response = requests.get(url, headers=headers, timeout=15) response.raise_for_status() soup = BeautifulSoup(response.content, 'html.parser') selector = "#search > div.main-wrapper > main > div.main-content > div.container > div.results-section" results_section = soup.select_one(selector) if not results_section: fallback_selectors = [ 'div.results-section', '.results-section', 'main div.container div[class*="result"]', '.search-results' ] for sel in fallback_selectors: results_section = soup.select_one(sel) if results_section: print(f"✅ Found results with fallback: {sel}") break if not results_section: status_msgs.append(f"❌ Results section not found on page {page}. Stopping.") break items = [] item_selectors = [ 'div[class*="card"] a', 'article a', '.search-item', '.result-item', 'div.max-card' ] for sel in item_selectors: items = results_section.select(sel) if items: print(f"✅ Found {len(items)} items on page {page} with selector: {sel}") break else: items = results_section.find_all('a', href=re.compile(r'/berita/|/reads/')) if not items: status_msgs.append(f"✅ No more results on page {page}. 
Stopping.") break page_results = [] for item in items: title_elem = item.find(['h1', 'h2', 'h3', 'h4']) or item title = title_elem.get_text(strip=True) if not title or len(title) < 10: continue date_elem = item.find(class_=re.compile(r'date|time')) or item.find('span') date_text = date_elem.get_text(strip=True) if date_elem else "" date_match = re.search(r'(\d{1,2}\s+\w+\s+\d{4},\s+\d{1,2}:\d{2})', date_text) date = date_match.group(1) if date_match else "Date not found" href = item.get('href', '') if href.startswith('/'): full_url = urljoin("https://republika.co.id", href) else: full_url = href page_results.append({ 'title': title[:200], 'date': date, 'url': full_url }) all_results.extend(page_results) status_msgs.append(f"✅ Found {len(page_results)} results on page {page}") # Check for next page (look for pagination) next_page = soup.find('a', class_='next') or soup.find('a', text=re.compile(r'Next|Selanjutnya')) if not next_page: status_msgs.append("✅ No next page found. Stopping.") break page += 1 time.sleep(2) # Delay to avoid rate limiting except Exception as e: status_msgs.append(f"❌ Error on page {page}: {str(e)}. Stopping.") break return all_results, "\n".join(status_msgs) # ==================== FUNGSI UNTUK APPEND KE CSV ==================== def append_to_csv(df, filename): """Append DataFrame to existing CSV file or create new one""" if os.path.exists(filename): # File exists, append without header df.to_csv(filename, mode='a', header=False, index=False) print(f"✅ Data appended to {filename}") else: # File doesn't exist, create new with header df.to_csv(filename, index=False) print(f"✅ New file created: {filename}") # ==================== PROSES UTAMA SCRAPING ==================== def process_republika_search(keyword, startdate_str, enddate_str, progress=gr.Progress()): if not keyword.strip(): return "❌ Masukkan keyword pencarian!", None, None, None, None, None, None startdate = startdate_str or '2025-10-01' enddate = enddate_str or '2025-10-31' # Validasi format tanggal try: if startdate: datetime.strptime(startdate, '%Y-%m-%d') if enddate: datetime.strptime(enddate, '%Y-%m-%d') except ValueError: return "❌ Format tanggal harus YYYY-MM-DD!", None, None, None, None, None, None results_list, status = scrape_republika_search(keyword, startdate, enddate, progress) if not results_list: return "❌ Tidak ada hasil yang ditemukan!", None, None, None, None, None, None search_id = generate_search_id(keyword, startdate, enddate) timestamp_search = datetime.now().isoformat() num_results = len(results_list) results_json = json.dumps(results_list, ensure_ascii=False) df_keyword_search = pd.DataFrame([{ 'search_id': search_id, 'keyword': keyword, 'source_type': 'Republika Search', 'num_results': num_results, 'results': results_json, 'timestamp_search': timestamp_search }]) df_results = pd.DataFrame(results_list) # Scraping metadata artikel untuk setiap URL articles_metadata = [] for i, result in enumerate(results_list): progress((i)/len(results_list), desc=f"Scraping artikel {i+1}/{len(results_list)}") url = result['url'] metadata, error = extract_republika_article(url) if metadata: metadata['search_id'] = search_id metadata['article_id'] = hashlib.md5(url.encode()).hexdigest()[:16] metadata['timestamp_ekstraksi'] = datetime.now().isoformat() articles_metadata.append(metadata) time.sleep(2) # Delay else: print(f"⚠️ Failed to scrape {url}: {error}") df_metadata = pd.DataFrame(articles_metadata) # Simpan ke CSV dengan nama file spesifik (seperti sebelumnya) csv_dir = "scrapper_result" 
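    # Persistence layout implemented below: each search run is saved to three
    # per-search CSVs keyed by search_id (keyword_search_<id>.csv,
    # search_results_<id>.csv, article_metadata_<id>.csv), appended to the
    # cumulative keyword_search.csv / search_results.csv / article_metadata.csv
    # via append_to_csv(), and then mirrored to GitHub under scrapper_result/.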
os.makedirs(csv_dir, exist_ok=True) keyword_csv_path = os.path.join(csv_dir, f"keyword_search_{search_id}.csv") results_csv_path = os.path.join(csv_dir, f"search_results_{search_id}.csv") metadata_csv_path = os.path.join(csv_dir, f"article_metadata_{search_id}.csv") df_keyword_search.to_csv(keyword_csv_path, index=False) df_results.to_csv(results_csv_path, index=False) df_metadata.to_csv(metadata_csv_path, index=False) # TAMBAHAN: Append ke file CSV utama main_keyword_csv = os.path.join(csv_dir, "keyword_search.csv") main_results_csv = os.path.join(csv_dir, "search_results.csv") main_metadata_csv = os.path.join(csv_dir, "article_metadata.csv") # Append data ke file utama append_to_csv(df_keyword_search, main_keyword_csv) append_to_csv(df_results, main_results_csv) if not df_metadata.empty: append_to_csv(df_metadata, main_metadata_csv) # Upload ke GitHub try: timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") # Upload file individual upload_to_github( f"{SCRAPPER_RESULT_PATH}/keyword_search_{search_id}.csv", df_keyword_search, f"Scraping keyword search {search_id} - {timestamp}" ) upload_to_github( f"{SCRAPPER_RESULT_PATH}/search_results_{search_id}.csv", df_results, f"Scraping search results {search_id} - {timestamp}" ) if not df_metadata.empty: upload_to_github( f"{SCRAPPER_RESULT_PATH}/article_metadata_{search_id}.csv", df_metadata, f"Scraping article metadata {search_id} - {timestamp}" ) # Upload file utama (append) upload_to_github( f"{SCRAPPER_RESULT_PATH}/keyword_search.csv", df_keyword_search, f"Update keyword search utama - {timestamp}" ) upload_to_github( f"{SCRAPPER_RESULT_PATH}/search_results.csv", df_results, f"Update search results utama - {timestamp}" ) if not df_metadata.empty: upload_to_github( f"{SCRAPPER_RESULT_PATH}/article_metadata.csv", df_metadata, f"Update article metadata utama - {timestamp}" ) github_status = "✅ Data berhasil diupload ke GitHub" except Exception as e: github_status = f"❌ Gagal upload ke GitHub: {str(e)}" return f"✅ Scraping selesai! 
{github_status}", df_results, df_keyword_search, df_metadata, search_id, num_results, len(articles_metadata) # ==================== FUNGSI UNTUK TAB TINJAUAN DATA ==================== def load_and_filter_data(start_date, end_date): """Memuat dan memfilter data berdasarkan tanggal""" csv_dir = "scrapper_result" # Load data try: keyword_df = pd.read_csv(os.path.join(csv_dir, "keyword_search.csv")) except: keyword_df = pd.DataFrame() try: results_df = pd.read_csv(os.path.join(csv_dir, "search_results.csv")) except: results_df = pd.DataFrame() try: metadata_df = pd.read_csv(os.path.join(csv_dir, "article_metadata.csv")) # Pastikan article_id ada if 'article_id' not in metadata_df.columns and 'url' in metadata_df.columns: metadata_df['article_id'] = metadata_df['url'].apply( lambda x: hashlib.md5(str(x).encode()).hexdigest()[:16] ) except Exception as e: print(f"Error loading metadata: {e}") metadata_df = pd.DataFrame() return keyword_df, results_df, metadata_df # ==================== FUNGSI GITHUB ==================== def get_github_headers(): """Mendapatkan headers untuk request GitHub API""" headers = {"Accept": "application/vnd.github.v3+json"} if GITHUB_TOKEN: headers["Authorization"] = f"token {GITHUB_TOKEN}" return headers def github_api_request(endpoint, method="GET", data=None): """Membuat request ke GitHub API""" url = f"https://api.github.com/repos/{GITHUB_REPO}/{endpoint}" headers = get_github_headers() try: if method == "GET": response = requests.get(url, headers=headers) elif method == "PUT": response = requests.put(url, headers=headers, json=data) elif method == "POST": response = requests.post(url, headers=headers, json=data) response.raise_for_status() return response.json() if response.content else {} except Exception as e: print(f"Error GitHub API: {str(e)}") return None def upload_to_github(file_path, content, commit_message): """Upload file ke GitHub""" # Encode content to base64 if isinstance(content, pd.DataFrame): content = content.to_csv(index=False) if isinstance(content, str): content = content.encode('utf-8') content_b64 = base64.b64encode(content).decode('utf-8') # Get existing file SHA if exists sha = get_file_sha(file_path) data = { "message": commit_message, "content": content_b64, "branch": GITHUB_BRANCH } if sha: data["sha"] = sha endpoint = f"contents/{file_path}" result = github_api_request(endpoint, "PUT", data) return result is not None def get_file_sha(file_path): """Mendapatkan SHA hash file yang ada di GitHub""" endpoint = f"contents/{file_path}?ref={GITHUB_BRANCH}" result = github_api_request(endpoint) return result.get("sha") if result else None def sync_to_github(): """Sinkronisasi file lokal ke GitHub""" csv_dir = "scrapper_result" if not os.path.exists(csv_dir): return "Folder scrapper_result tidak ditemukan secara lokal" success_count = 0 total_count = 0 messages = [] for filename in os.listdir(csv_dir): if filename.endswith('.csv'): file_path = os.path.join(csv_dir, filename) github_file_path = f"{SCRAPPER_RESULT_PATH}/{filename}" try: with open(file_path, 'r', encoding='utf-8') as f: content = f.read() if upload_to_github(github_file_path, content, f"Sync {filename}"): success_count += 1 messages.append(f"✅ {filename} berhasil diupload ke GitHub") else: messages.append(f"❌ Gagal upload {filename} ke GitHub") total_count += 1 time.sleep(1) # Delay untuk menghindari rate limit except Exception as e: messages.append(f"❌ Error sync {filename}: {str(e)}") result_msg = f"{success_count}/{total_count} file berhasil disinkronisasi ke GitHub\n" result_msg 
+= "\n".join(messages) return result_msg def load_from_github(): """Load file dari GitHub ke lokal""" csv_dir = "scrapper_result" os.makedirs(csv_dir, exist_ok=True) success_count = 0 total_count = 0 messages = [] # Load dari folder scrapper_result files = list_github_files(SCRAPPER_RESULT_PATH) for filename in files: if filename.endswith('.csv'): github_file_path = f"{SCRAPPER_RESULT_PATH}/{filename}" local_file_path = os.path.join(csv_dir, filename) try: content = download_from_github(github_file_path) if content: with open(local_file_path, 'w', encoding='utf-8') as f: f.write(content) success_count += 1 messages.append(f"✅ {filename} berhasil didownload dari GitHub") else: messages.append(f"❌ Gagal download {filename} dari GitHub") total_count += 1 time.sleep(1) # Delay untuk menghindari rate limit except Exception as e: messages.append(f"❌ Error load {filename}: {str(e)}") result_msg = f"{success_count}/{total_count} file berhasil dimuat dari GitHub\n" result_msg += "\n".join(messages) return result_msg def list_github_files(folder_path): """Mendapatkan daftar file di folder GitHub""" endpoint = f"contents/{folder_path}?ref={GITHUB_BRANCH}" result = github_api_request(endpoint) if result and isinstance(result, list): return [item["name"] for item in result if item["type"] == "file"] return [] def download_from_github(file_path): """Download file dari GitHub""" endpoint = f"contents/{file_path}?ref={GITHUB_BRANCH}" result = github_api_request(endpoint) if result and "content" in result: content_b64 = result["content"] content = base64.b64decode(content_b64).decode('utf-8') return content return None # ==================== FUNGSI DASHBOARD ==================== def create_sentiment_trend_chart(): """Buat grafik trend sentimen per tanggal""" try: # Load data metadata_df = pd.read_csv('scrapper_result/article_metadata.csv') sentiment_df = pd.read_csv('analisis/sentiment_prediction.csv') if metadata_df.empty or sentiment_df.empty: return None # Merge data merged_df = sentiment_df.merge(metadata_df, on='article_id', how='left') # Extract date from waktu_terbit merged_df['tanggal'] = pd.to_datetime( merged_df['waktu_terbit'], errors='coerce', format='mixed' ).dt.date # Group by date and sentiment daily_sentiment = merged_df.groupby(['tanggal', 'sentimen']).size().unstack(fill_value=0) # Create plot fig = go.Figure() colors = {'POSITIF': 'green', 'NEGATIF': 'red', 'NETRAL': 'blue'} for sentiment in ['POSITIF', 'NEGATIF', 'NETRAL']: if sentiment in daily_sentiment.columns: fig.add_trace(go.Scatter( x=daily_sentiment.index, y=daily_sentiment[sentiment], mode='lines+markers', name=sentiment, line=dict(color=colors[sentiment], width=3), marker=dict(size=8) )) fig.update_layout( title='Trend Sentimen Harian', xaxis_title='Tanggal', yaxis_title='Jumlah Artikel', template='plotly_white', height=400 ) return fig except Exception as e: print(f"Error creating sentiment trend chart: {e}") return None def create_confidence_chart(): """Buat grafik rata-rata confidence per sentimen per tanggal""" try: # Load data metadata_df = pd.read_csv('scrapper_result/article_metadata.csv') sentiment_df = pd.read_csv('analisis/sentiment_prediction.csv') if metadata_df.empty or sentiment_df.empty: return None # Merge data merged_df = sentiment_df.merge(metadata_df, on='article_id', how='left') # Extract date from waktu_terbit merged_df['tanggal'] = pd.to_datetime( merged_df['waktu_terbit'], errors='coerce', format='mixed' ).dt.date # Calculate average confidence per date and sentiment avg_confidence = 
merged_df.groupby(['tanggal', 'sentimen'])['confidence'].mean().unstack(fill_value=0) # Create plot fig = go.Figure() colors = {'POSITIF': 'green', 'NEGATIF': 'red', 'NETRAL': 'blue'} for sentiment in ['POSITIF', 'NEGATIF', 'NETRAL']: if sentiment in avg_confidence.columns: fig.add_trace(go.Scatter( x=avg_confidence.index, y=avg_confidence[sentiment] * 100, # Convert to percentage mode='lines+markers', name=f'{sentiment} Confidence', line=dict(color=colors[sentiment], width=3, dash='dot'), marker=dict(size=6) )) fig.update_layout( title='Rata-rata Confidence Sentimen Harian (%)', xaxis_title='Tanggal', yaxis_title='Confidence (%)', template='plotly_white', height=400 ) return fig except Exception as e: print(f"Error creating confidence chart: {e}") return None def create_wordcloud(): """Buat wordcloud dari TF-IDF terms""" try: # Load TF-IDF data tfidf_df = pd.read_csv('analisis/tfidf_results.csv') if tfidf_df.empty: return None # Create word frequencies from TF-IDF scores word_freq = {} for _, row in tfidf_df.iterrows(): term = row['term'] score = row['tfidf_score'] word_freq[term] = word_freq.get(term, 0) + score # Create wordcloud wordcloud = WordCloud( width=800, height=400, background_color='white', colormap='viridis', max_words=50 ).generate_from_frequencies(word_freq) # Convert to plot fig, ax = plt.subplots(figsize=(10, 5)) ax.imshow(wordcloud, interpolation='bilinear') ax.axis('off') ax.set_title('WordCloud - Top 50 Terms TF-IDF', fontsize=16, pad=20) return fig except Exception as e: print(f"Error creating wordcloud: {e}") return None def get_top_trigrams(): """Dapatkan top 10 trigrams""" try: trigram_df = pd.read_csv('analisis/trigram_results.csv') if trigram_df.empty: return pd.DataFrame() # Aggregate trigrams across all articles top_trigrams = trigram_df.groupby('trigram')['frekuensi'].sum().nlargest(10).reset_index() top_trigrams['rank'] = range(1, len(top_trigrams) + 1) return top_trigrams except Exception as e: print(f"Error getting top trigrams: {e}") return pd.DataFrame() def create_ner_bubble_chart(): """Buat grafik bubble untuk top 10 NER""" try: ner_df = pd.read_csv('analisis/ner_results.csv') if ner_df.empty: return None # Aggregate entities top_entities = ner_df.groupby(['entity', 'entity_type']).size().reset_index(name='count') top_entities = top_entities.nlargest(10, 'count') # Create bubble chart fig = px.scatter( top_entities, x='entity_type', y='count', size='count', color='entity_type', hover_name='entity', size_max=60, title='Top 10 Named Entities - Bubble Chart' ) fig.update_layout( xaxis_title='Tipe Entitas', yaxis_title='Frekuensi', template='plotly_white', height=500 ) return fig except Exception as e: print(f"Error creating NER bubble chart: {e}") return None def get_popular_topics(): """Dapatkan top 10 topik populer dari TF-IDF""" try: tfidf_df = pd.read_csv('analisis/tfidf_results.csv') if tfidf_df.empty: return pd.DataFrame() # Aggregate terms by frequency topic_freq = tfidf_df.groupby('term').agg({ 'tfidf_score': 'sum', 'article_id': 'nunique' }).reset_index() topic_freq = topic_freq.nlargest(10, 'tfidf_score') topic_freq = topic_freq.rename(columns={ 'term': 'Topik', 'tfidf_score': 'Total TF-IDF Score', 'article_id': 'Jumlah Artikel' }) topic_freq['Rank'] = range(1, len(topic_freq) + 1) return topic_freq[['Rank', 'Topik', 'Total TF-IDF Score', 'Jumlah Artikel']] except Exception as e: print(f"Error getting popular topics: {e}") return pd.DataFrame() def create_dashboard(): """Buat dashboard dengan semua visualisasi""" try: # Check if analysis files 
exist if not os.path.exists('analisis/sentiment_prediction.csv'): return "❌ Data analisis belum tersedia. Silakan jalankan analisis sentimen terlebih dahulu." # Create all visualizations sentiment_trend = create_sentiment_trend_chart() confidence_chart = create_confidence_chart() wordcloud_fig = create_wordcloud() top_trigrams = get_top_trigrams() ner_bubble = create_ner_bubble_chart() popular_topics = get_popular_topics() # Prepare output output = "## 📊 Dashboard Analisis Sentimen\n\n" if sentiment_trend: output += "### 📈 Trend Sentimen Harian\n" output += "Grafik menunjukkan jumlah artikel untuk setiap sentimen per tanggal.\n\n" if confidence_chart: output += "### 📊 Rata-rata Confidence Sentimen\n" output += "Grafik menunjukkan rata-rata confidence (dalam %) untuk setiap sentimen per tanggal.\n\n" if wordcloud_fig: output += "### ☁️ WordCloud Top Terms\n" output += "Visualisasi kata-kata paling penting berdasarkan skor TF-IDF.\n\n" if not top_trigrams.empty: output += "### 🔤 Top 10 Trigram\n" output += "Trigram (3 kata berurutan) yang paling sering muncul.\n\n" if ner_bubble: output += "### 🏷️ Top 10 Named Entities\n" output += "Entitas yang paling sering diidentifikasi dengan ukuran bubble menunjukkan frekuensi.\n\n" if not popular_topics.empty: output += "### 🔥 Top 10 Topik Populer\n" output += "Topik-topik paling populer berdasarkan skor TF-IDF.\n\n" return output, sentiment_trend, confidence_chart, wordcloud_fig, top_trigrams, ner_bubble, popular_topics except Exception as e: return f"❌ Error creating dashboard: {str(e)}", None, None, None, None, None, None # ==================== INTERFACE GRADIO ==================== def create_gradio_interface(): """Membuat interface Gradio untuk aplikasi""" with gr.Blocks(title="🔍 Republika.co.id Search Scraper & Analisis Sentimen", theme=gr.themes.Soft()) as demo: gr.Markdown("# 🔍 Republika.co.id Search Scraper & Analisis Sentimen") with gr.Tab("📊 Scraping"): gr.Markdown(""" **Ekstrak hasil pencarian berdasarkan keyword ke dalam skema KeywordSearchResult & ArticleMetadata** **Fitur:** - ✅ Scraping semua halaman otomatis - ✅ Ekstraksi otomatis title, date, URL - ✅ Scraping metadata artikel untuk setiap hasil - ✅ Generate Search ID unik - ✅ Simpan ke CSV otomatis - ✅ **DATA DITAMBAHKAN KE FILE CSV UTAMA** (keyword_search.csv, search_results.csv, article_metadata.csv) - ✅ **SYNC OTOMATIS KE GITHUB** (https://github.com/abdfajar/republika_sentiner) """) with gr.Row(): with gr.Column(scale=1): keyword_input = gr.Textbox( label="🔑 Keyword Pencarian", value="MBG", placeholder="e.g., MBG" ) with gr.Row(): startdate_input = gr.Textbox( label="📅 Tanggal Mulai", value="2025-10-01", placeholder="YYYY-MM-DD" ) enddate_input = gr.Textbox( label="📅 Tanggal Selesai", value="2025-10-31", placeholder="YYYY-MM-DD" ) search_btn = gr.Button( "🚀 Cari & Scrap Semua Halaman", variant="primary" ) gr.Markdown("---") gr.Markdown("### GitHub Operations") with gr.Row(): sync_btn = gr.Button("📤 Sync ke GitHub") load_btn = gr.Button("📥 Load dari GitHub") gr.Markdown("---") gr.Markdown("### Contoh Pencarian") with gr.Row(): example_mbg = gr.Button("MBG (Oktober 2025)") example_prabowo = gr.Button("Prabowo (September 2025)") with gr.Column(scale=2): scraping_output = gr.Textbox(label="Status Scraping", lines=5, max_lines=10) with gr.Row(): search_id_output = gr.Textbox(label="💾 Search ID") num_results_output = gr.Number(label="📚 Total Hasil") num_articles_output = gr.Number(label="📑 Metadata Artikel") with gr.Tab("Hasil Pencarian"): results_table = gr.Dataframe(label="📋 Tabel 
Hasil Pencarian") with gr.Tab("Keyword Search"): keyword_table = gr.Dataframe(label="💾 Skema KeywordSearchResult") with gr.Tab("Metadata Artikel"): metadata_table = gr.Dataframe(label="📚 Tabel Metadata Artikel") with gr.Tab("📈 Tinjauan Data"): gr.Markdown("Menampilkan data dari file CSV yang telah di-scrap") with gr.Row(): start_date_filter = gr.Textbox(label="Tanggal Mulai", value="2025-10-01") end_date_filter = gr.Textbox(label="Tanggal Selesai", value="2025-10-31") load_data_btn = gr.Button("🔍 Muat Data", variant="secondary") with gr.Tab("Keyword Search"): keyword_data_table = gr.Dataframe(label="📋 Keyword Search Data") keyword_count = gr.Number(label="Jumlah Record") with gr.Tab("Search Results"): results_data_table = gr.Dataframe(label="📋 Search Results Data") results_count = gr.Number(label="Jumlah Record") with gr.Tab("Article Metadata"): metadata_data_table = gr.Dataframe(label="📋 Article Metadata Data") metadata_count = gr.Number(label="Jumlah Record") with gr.Tab("🧠 Analisis Sentimen"): gr.Markdown(""" Analisis sentimen pada artikel yang telah di-scrap menggunakan model AI: - **Sentiment Analysis**: IndoBERT model - **Named Entity Recognition**: BERT-base Indonesian NER - **Trigram Analysis**: Pola kata berurutan - **TF-IDF Analysis**: Kata kunci penting dengan stopwords Indonesia """) with gr.Row(): analysis_start_date = gr.Textbox(label="Tanggal Mulai Analisis", value="2025-10-01") analysis_end_date = gr.Textbox(label="Tanggal Selesai Analisis", value="2025-10-31") analyze_btn = gr.Button("🚀 Analisis Sentimen", variant="primary") analysis_output = gr.Textbox(label="Status Analisis", lines=5) with gr.Tab("Prediksi Sentimen"): sentiment_table = gr.Dataframe(label="📊 Prediksi Sentimen") with gr.Tab("Named Entity Recognition"): ner_table = gr.Dataframe(label="🏷️ Named Entity Recognition (NER)") with gr.Tab("Trigram Analysis"): trigram_table = gr.Dataframe(label="🔤 Trigram Analysis") with gr.Tab("TF-IDF Analysis"): tfidf_table = gr.Dataframe(label="📊 TF-IDF Analysis") # Dalam fungsi create_gradio_interface(), tambahkan tab baru: with gr.Tab("🧠 Analisis Lanjutan"): gr.Markdown(""" ## Analisis Teks Lanjutan **Fitur Analisis:** - 🎯 **Topic Modeling**: Identifikasi topik utama menggunakan LDA - 🔑 **Keyword Extraction**: Ekstraksi kata kunci dengan multiple methods - 📝 **Text Summarization**: Ringkasan otomatis artikel - 💡 **Concept Extraction**: Identifikasi konsep dan ide utama """) advanced_analyze_btn = gr.Button("🚀 Jalankan Analisis Lanjutan", variant="primary") advanced_analysis_output = gr.Markdown() with gr.Row(): with gr.Column(): topic_plot = gr.Plot(label="Topic Modeling Visualization") with gr.Column(): keyword_cloud_plot = gr.Plot(label="Keyword WordCloud") with gr.Row(): with gr.Column(): concept_network_plot = gr.Plot(label="Concept Network") with gr.Tab("Topic Modeling Results"): topic_table = gr.Dataframe(label="Hasil Topic Modeling") with gr.Tab("Keyword Extraction"): keyword_table = gr.Dataframe(label="Hasil Keyword Extraction") with gr.Tab("Text Summarization"): summary_table = gr.Dataframe(label="Hasil Text Summarization") with gr.Tab("Concept Extraction"): concept_table = gr.Dataframe(label="Hasil Concept Extraction") # Event handler untuk analisis lanjutan advanced_analyze_btn.click( fn=perform_advanced_analysis_wrapper, outputs=[advanced_analysis_output, topic_plot, keyword_cloud_plot, concept_network_plot, topic_table, keyword_table, summary_table, concept_table] ) def perform_advanced_analysis_wrapper(): """Wrapper function untuk analisis lanjutan""" # Load 
metadata metadata_df = pd.read_csv('scrapper_result/article_metadata.csv') if metadata_df.empty: return "❌ Tidak ada data untuk dianalisis", None, None, None, None, None, None, None # Perform analysis result_msg, results, topic_viz, keyword_viz, concept_viz = perform_advanced_analysis(metadata_df) # Prepare dataframes topic_df = pd.DataFrame(results['topics']['topics']) if results and 'topics' in results else pd.DataFrame() keyword_df = pd.DataFrame(results['keywords']) if results and 'keywords' in results else pd.DataFrame() summary_df = pd.DataFrame(results['summaries']) if results and 'summaries' in results else pd.DataFrame() concept_df = pd.DataFrame(results['concepts']) if results and 'concepts' in results else pd.DataFrame() return result_msg, topic_viz, keyword_viz, concept_viz, topic_df, keyword_df, summary_df, concept_df with gr.Tab("📊 Dashboard"): gr.Markdown(""" ## Dashboard Analisis Sentimen Visualisasi komprehensif hasil analisis sentimen dan insights dari data. """) dashboard_btn = gr.Button("🔄 Refresh Dashboard", variant="primary") dashboard_output = gr.Markdown() with gr.Row(): with gr.Column(): sentiment_trend_plot = gr.Plot(label="Trend Sentimen Harian") with gr.Column(): confidence_plot = gr.Plot(label="Rata-rata Confidence Sentimen") with gr.Row(): with gr.Column(): wordcloud_plot = gr.Plot(label="WordCloud Top Terms") with gr.Column(): ner_bubble_plot = gr.Plot(label="Top 10 Named Entities") with gr.Row(): with gr.Column(): trigram_table_dash = gr.Dataframe(label="Top 10 Trigram") with gr.Column(): topics_table = gr.Dataframe(label="Top 10 Topik Populer") # Event handlers untuk Tab Scraping search_btn.click( fn=process_republika_search, inputs=[keyword_input, startdate_input, enddate_input], outputs=[scraping_output, results_table, keyword_table, metadata_table, search_id_output, num_results_output, num_articles_output] ) sync_btn.click( fn=sync_to_github, outputs=scraping_output ) load_btn.click( fn=load_from_github, outputs=scraping_output ) example_mbg.click( fn=lambda: ["MBG", "2025-10-01", "2025-10-31"], outputs=[keyword_input, startdate_input, enddate_input] ) example_prabowo.click( fn=lambda: ["Prabowo", "2025-09-01", "2025-09-30"], outputs=[keyword_input, startdate_input, enddate_input] ) # Event handlers untuk Tab Tinjauan Data def load_data_for_review(start_date, end_date): keyword_df, results_df, metadata_df = load_and_filter_data(start_date, end_date) keyword_count_val = len(keyword_df) if not keyword_df.empty else 0 results_count_val = len(results_df) if not results_df.empty else 0 metadata_count_val = len(metadata_df) if not metadata_df.empty else 0 return ( keyword_df, keyword_count_val, results_df, results_count_val, metadata_df, metadata_count_val ) load_data_btn.click( fn=load_data_for_review, inputs=[start_date_filter, end_date_filter], outputs=[ keyword_data_table, keyword_count, results_data_table, results_count, metadata_data_table, metadata_count ] ) # Event handlers untuk Tab Analisis Sentimen analyze_btn.click( fn=analyze_sentiment_comprehensive, inputs=[analysis_start_date, analysis_end_date], outputs=[analysis_output, sentiment_table, ner_table, trigram_table, tfidf_table] ) # Event handlers untuk Tab Dashboard dashboard_btn.click( fn=create_dashboard, outputs=[dashboard_output, sentiment_trend_plot, confidence_plot, wordcloud_plot, trigram_table_dash, ner_bubble_plot, topics_table] ) gr.Markdown("---") gr.Markdown(f"Dibuat dengan ❤️ menggunakan Gradio | Scraping data dari Republika.co.id | GitHub: https://github.com/{GITHUB_REPO}") # Update 
# the main dashboard to include the advanced visualizations
        with gr.Tab("📊 Dashboard Lengkap"):
            gr.Markdown("""
            ## Dashboard Analisis Komprehensif

            Visualisasi lengkap hasil analisis dasar dan lanjutan.
            """)

            full_dashboard_btn = gr.Button("🔄 Refresh Dashboard Lengkap", variant="primary")
            full_dashboard_output = gr.Markdown()

            # Basic analysis visualizations
            with gr.Row():
                with gr.Column():
                    sentiment_trend_full = gr.Plot(label="Trend Sentimen Harian")
                with gr.Column():
                    confidence_plot_full = gr.Plot(label="Rata-rata Confidence Sentimen")

            with gr.Row():
                with gr.Column():
                    wordcloud_plot_full = gr.Plot(label="WordCloud Top Terms")
                with gr.Column():
                    ner_bubble_plot_full = gr.Plot(label="Top 10 Named Entities")

            # Advanced analysis visualizations
            with gr.Row():
                with gr.Column():
                    topic_plot_full = gr.Plot(label="Topic Modeling")
                with gr.Column():
                    keyword_cloud_plot_full = gr.Plot(label="Keyword Extraction")

            with gr.Row():
                with gr.Column():
                    concept_network_plot_full = gr.Plot(label="Concept Extraction")
                with gr.Column():
                    trigram_table_full = gr.Dataframe(label="Top 10 Trigram")

            with gr.Row():
                with gr.Column():
                    topics_table_full = gr.Dataframe(label="Top 10 Topik Populer")

            # Event handler for the full dashboard
            full_dashboard_btn.click(
                fn=create_advanced_dashboard,
                outputs=[
                    full_dashboard_output,
                    sentiment_trend_full, confidence_plot_full,
                    wordcloud_plot_full, trigram_table_full,
                    ner_bubble_plot_full, topics_table_full,
                    topic_plot_full, keyword_cloud_plot_full, concept_network_plot_full
                ]
            )

    return demo

# ==================== FUNGSI UTAMA ====================
if __name__ == "__main__":
    # Pre-load models and stopwords when the app starts
    print("Pre-loading models and stopwords...")
    load_sentiment_model()
    load_ner_model()
    load_indonesian_stopwords()

    demo = create_gradio_interface()
    demo.launch()
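
# ---------------------------------------------------------------------------
# Usage sketch: composes the scraping and batch-analysis helpers above outside
# the Gradio UI, e.g. for a quick smoke test from a Python shell. The function
# name and the URL argument are illustrative assumptions (any reachable
# Republika article page); models and stopwords are loaded lazily by the
# helpers themselves.
# ---------------------------------------------------------------------------
def run_single_article_analysis(url):
    """Scrape one article and run sentiment, NER, trigram and TF-IDF on it."""
    metadata, error = extract_republika_article(url)
    if error or not metadata:
        print(f"Scraping failed: {error}")
        return None

    # The batch helpers expect a list of (article_id, text) tuples, mirroring
    # analyze_sentiment_comprehensive().
    combined_text = f"{metadata['judul']}. {metadata['konten']}"
    articles_data = [(metadata['article_id'], combined_text)]

    return {
        'sentiment': predict_sentiment_batch(articles_data),
        'entities': extract_entities_batch(articles_data),
        'trigrams': extract_trigrams_batch(articles_data),
        'tfidf': calculate_tfidf_batch(articles_data),
    }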