import pandas as pd
import numpy as np
import re
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction import text
import gensim
from gensim import corpora
from gensim.models import LdaModel
# Removed: problematic import
# from gensim.summarization import summarize as gensim_summarize
from transformers import pipeline
import torch
# Removed: imports that require OMP
# from keybert import KeyBERT
# from yake import KeywordExtractor
import spacy
from collections import defaultdict
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from wordcloud import WordCloud
import io
import base64
import os

# Download required NLTK data
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

try:
    # Newer NLTK releases also need the punkt_tab resource for sent_tokenize
    nltk.data.find('tokenizers/punkt_tab')
except LookupError:
    nltk.download('punkt_tab')

try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')


class AdvancedTextAnalysis:
    def __init__(self):
        self.sentiment_analyzer = None
        self.summarizer = None
        # Removed: problematic models
        # self.keybert_model = None
        # self.nlp = None
        self.stop_words_id = None

        # Load Indonesian stopwords
        self._load_indonesian_stopwords()

    def _load_indonesian_stopwords(self):
        """Load Indonesian stopwords."""
        try:
            # Basic Indonesian stopwords from NLTK
            self.stop_words_id = set(stopwords.words('indonesian'))
        except LookupError:
            # Fallback stopwords
            self.stop_words_id = {
                'yang', 'dan', 'di', 'dengan', 'ini', 'itu', 'dari', 'dalam',
                'untuk', 'pada', 'ke', 'tidak', 'akan', 'ada', 'adalah', 'atau',
                'juga', 'bahwa', 'sebagai', 'dapat', 'oleh', 'karena'
            }

    def preprocess_text(self, text):
        """Preprocess text for analysis."""
        if not text or pd.isna(text):
            return ""

        # Remove punctuation, collapse whitespace, lowercase
        text = re.sub(r'[^\w\s]', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        text = text.strip().lower()
        return text

    def topic_modeling_lda(self, texts, num_topics=5, num_words=10):
        """Topic modeling using LDA."""
        try:
            # Preprocess texts
            processed_texts = [self.preprocess_text(text) for text in texts]

            # Create document-term matrix
            vectorizer = CountVectorizer(
                max_features=1000,
                stop_words=list(self.stop_words_id),
                min_df=2,
                max_df=0.95
            )
            dtm = vectorizer.fit_transform(processed_texts)
            feature_names = vectorizer.get_feature_names_out()

            # Apply LDA
            lda = LatentDirichletAllocation(
                n_components=num_topics,
                random_state=42,
                max_iter=10
            )
            lda.fit(dtm)

            # Extract the top words per topic
            topics = []
            for topic_idx, topic in enumerate(lda.components_):
                top_words_idx = topic.argsort()[-num_words:][::-1]
                top_words = [feature_names[i] for i in top_words_idx]
                topic_words = ", ".join(top_words)
                topics.append({
                    'topic_id': topic_idx + 1,
                    'keywords': topic_words,
                    'top_words': top_words,
                    'topic_weight': round(float(topic.sum()), 2)
                })

            # Get the topic distribution for each document
            topic_distribution = lda.transform(dtm)
            doc_topics = []
            for i, dist in enumerate(topic_distribution):
                dominant_topic = dist.argmax()
                doc_topics.append({
                    'doc_id': i,
                    'dominant_topic': dominant_topic + 1,
                    'topic_confidence': round(float(dist[dominant_topic]), 3),
                    'topic_distribution': dist.tolist()
                })

            return {
                'topics': topics,
                'document_topics': doc_topics,
                'model': lda,
                'vectorizer': vectorizer
            }
        except Exception as e:
            print(f"Error in topic modeling: {e}")
            return None

    def extract_keywords_yake(self, texts, num_keywords=10):
        """Extract keywords (YAKE fallback): simple frequency-based extraction since YAKE was removed."""
        try:
            all_keywords = []
            for i, text in enumerate(texts):
                if not text or len(text.strip()) < 50:
                    continue

                processed_text = self.preprocess_text(text)

                # Simple frequency-based keyword extraction as a fallback
                words = processed_text.split()
                word_freq = Counter(words)

                # Remove stopwords and short words
                filtered_words = {
                    word: freq for word, freq in word_freq.items()
                    if word not in self.stop_words_id and len(word) > 2
                }

                # Get top keywords
                top_keywords = sorted(filtered_words.items(), key=lambda x: x[1], reverse=True)[:num_keywords]

                for keyword, freq in top_keywords:
                    score = freq / len(words)  # Simple frequency-based score
                    all_keywords.append({
                        'doc_id': i,
                        'keyword': keyword,
                        'score': round(score, 4),
                        'type': 'FREQUENCY'
                    })

            return all_keywords
        except Exception as e:
            print(f"Error in YAKE keyword extraction: {e}")
            return []

    def extract_keywords_tfidf(self, texts, num_keywords=10):
        """Extract keywords using TF-IDF."""
        try:
            processed_texts = [self.preprocess_text(text) for text in texts]

            vectorizer = TfidfVectorizer(
                max_features=100,
                stop_words=list(self.stop_words_id),
                ngram_range=(1, 2)
            )
            tfidf_matrix = vectorizer.fit_transform(processed_texts)
            feature_names = vectorizer.get_feature_names_out()

            all_keywords = []
            for i, doc_vector in enumerate(tfidf_matrix):
                scores = doc_vector.toarray().flatten()
                top_indices = scores.argsort()[-num_keywords:][::-1]

                for idx in top_indices:
                    if scores[idx] > 0:
                        all_keywords.append({
                            'doc_id': i,
                            'keyword': feature_names[idx],
                            'score': round(float(scores[idx]), 4),
                            'type': 'TF-IDF'
                        })

            return all_keywords
        except Exception as e:
            print(f"Error in TF-IDF keyword extraction: {e}")
            return []

    def text_summarization(self, texts, ratio=0.3):
        """Text summarization using extractive methods - simplified version."""
        try:
            summaries = []
            for i, text in enumerate(texts):
                if not text or len(text.strip()) < 200:
                    summaries.append({
                        'doc_id': i,
                        'summary': 'Teks terlalu pendek untuk diringkas',
                        'original_length': len(text) if text else 0,
                        'summary_length': 0,
                        'compression_ratio': 0
                    })
                    continue

                try:
                    # Simple extractive summarization: take the first few sentences
                    sentences = sent_tokenize(text)
                    num_sentences = max(1, int(len(sentences) * ratio))
                    summary = ' '.join(sentences[:num_sentences])

                    summaries.append({
                        'doc_id': i,
                        'summary': summary,
                        'original_length': len(text),
                        'summary_length': len(summary),
                        'compression_ratio': round(len(summary) / len(text), 2)
                    })
                except Exception:
                    # Fallback: naive sentence split if NLTK tokenization fails
                    sentences = [s for s in text.split('. ') if s]
                    if len(sentences) > 3:
                        summary = '. '.join(sentences[:3]) + '.'
                    else:
                        summary = text[:500] + '...' if len(text) > 500 else text

                    summaries.append({
                        'doc_id': i,
                        'summary': summary,
                        'original_length': len(text),
                        'summary_length': len(summary),
                        'compression_ratio': round(len(summary) / len(text), 2)
                    })

            return summaries
        except Exception as e:
            print(f"Error in text summarization: {e}")
            return []

    def concept_extraction(self, texts, min_freq=2):
        """Extract concepts using frequency analysis and pattern matching."""
        try:
            # Preprocess texts
            processed_texts = [self.preprocess_text(text) for text in texts]

            # Extract noun phrases (simple pattern-based)
            concepts = []
            concept_patterns = [
                r'\b(\w+\s+\w+\s+\w+)\b',  # 3-word phrases
                r'\b(\w+\s+\w+)\b',        # 2-word phrases
            ]

            for pattern in concept_patterns:
                for i, text in enumerate(processed_texts):
                    matches = re.findall(pattern, text)
                    for match in matches:
                        # Skip phrases made up entirely of stopwords
                        words = match.split()
                        if not all(word in self.stop_words_id for word in words):
                            concepts.append({
                                'doc_id': i,
                                'concept': match,
                                'length': len(words),
                                'type': 'noun_phrase'
                            })

            # Frequency analysis
            concept_freq = Counter([c['concept'] for c in concepts])

            # Filter by frequency and prepare results
            concept_results = []
            for concept, freq in concept_freq.items():
                if freq >= min_freq:
                    concept_results.append({
                        'concept': concept,
                        'frequency': freq,
                        'type': 'noun_phrase'
                    })

            # Sort by frequency and return the top 50 concepts
            concept_results.sort(key=lambda c: c['frequency'], reverse=True)
            return concept_results[:50]
        except Exception as e:
            print(f"Error in concept extraction: {e}")
            return []

    def create_topic_visualization(self, topics):
        """Create a visualization for topic modeling."""
        try:
            if not topics:
                return None

            # Prepare data for visualization
            topic_data = []
            for topic in topics:
                words = topic['top_words'][:5]  # Top 5 words per topic
                for rank, word in enumerate(words):
                    topic_data.append({
                        'topic': f'Topik {topic["topic_id"]}',
                        'word': word,
                        # Rank-based weight (per-word LDA weights are not carried through)
                        'importance': len(words) - rank
                    })

            df = pd.DataFrame(topic_data)

            # Create bar chart
            fig = px.bar(
                df, x='word', y='importance', color='topic',
                title='Distribusi Kata dalam Topik',
                labels={'word': 'Kata', 'importance': 'Importance'},
                height=400
            )
            fig.update_layout(
                xaxis_tickangle=-45,
                showlegend=True,
                template='plotly_white'
            )
            return fig
        except Exception as e:
            print(f"Error creating topic visualization: {e}")
            return None

    def create_keyword_cloud(self, keywords):
        """Create a word cloud for the extracted keywords."""
        try:
            if not keywords:
                return None

            # Aggregate keyword scores into word frequencies
            word_freq = {}
            for kw in keywords:
                word = kw['keyword']
                score = kw['score']
                word_freq[word] = word_freq.get(word, 0) + score

            # Create word cloud
            wordcloud = WordCloud(
                width=800,
                height=400,
                background_color='white',
                colormap='viridis',
                max_words=50
            ).generate_from_frequencies(word_freq)

            # Render to a matplotlib figure
            fig, ax = plt.subplots(figsize=(10, 5))
            ax.imshow(wordcloud, interpolation='bilinear')
            ax.axis('off')
            ax.set_title('WordCloud - Keywords Extraction', fontsize=16, pad=20)
            return fig
        except Exception as e:
            print(f"Error creating keyword cloud: {e}")
            return None

    def create_concept_network(self, concepts):
        """Create a network-style visualization for concepts."""
        try:
            if not concepts:
                return None

            # Prepare data: keep the 15 most frequent concepts
            concept_df = pd.DataFrame(concepts)
            top_concepts = concept_df.nlargest(15, 'frequency')

            # Create bubble chart
            fig = px.scatter(
                top_concepts,
                x='frequency',
                y=[1] * len(top_concepts),  # Dummy y-axis
                size='frequency',
                color='frequency',
                hover_name='concept',
                size_max=60,
                title='Top 15 Concepts - Frequency Distribution'
            )
            fig.update_layout(
                xaxis_title='Frekuensi',
                yaxis_title='',
                template='plotly_white',
                height=400,
                showlegend=False
            )
            fig.update_yaxes(showticklabels=False)
            return fig
        except Exception as e:
            print(f"Error creating concept network: {e}")
            return None


def perform_advanced_analysis(metadata_df, progress_callback=None):
    """Main function for running the advanced analysis."""
    try:
        analyzer = AdvancedTextAnalysis()

        # Prepare texts: combine title and content for each article
        texts = []
        for _, row in metadata_df.iterrows():
            title = row.get('judul', '')
            content = row.get('konten', '')
            title = title if pd.notna(title) else ''
            content = content if pd.notna(content) else ''
            combined_text = f"{title}. {content}" if title else content
            texts.append(combined_text)

        if not texts:
            return "Tidak ada teks untuk dianalisis", None, None, None, None

        results = {}

        # 1. Topic modeling
        if progress_callback:
            progress_callback(0.2, "Melakukan Topic Modeling...")
        topic_results = analyzer.topic_modeling_lda(texts, num_topics=5, num_words=8)
        results['topics'] = topic_results

        # 2. Keyword extraction
        if progress_callback:
            progress_callback(0.4, "Melakukan Keyword Extraction...")
        yake_keywords = analyzer.extract_keywords_yake(texts, num_keywords=5)
        tfidf_keywords = analyzer.extract_keywords_tfidf(texts, num_keywords=5)
        all_keywords = yake_keywords + tfidf_keywords
        results['keywords'] = all_keywords

        # 3. Text summarization
        if progress_callback:
            progress_callback(0.6, "Melakukan Text Summarization...")
        summaries = analyzer.text_summarization(texts, ratio=0.3)
        results['summaries'] = summaries

        # 4. Concept extraction
        if progress_callback:
            progress_callback(0.8, "Melakukan Concept Extraction...")
        concepts = analyzer.concept_extraction(texts, min_freq=2)
        results['concepts'] = concepts

        # Create visualizations
        if progress_callback:
            progress_callback(0.9, "Membuat visualisasi...")
        topic_viz = analyzer.create_topic_visualization(topic_results['topics'] if topic_results else [])
        keyword_viz = analyzer.create_keyword_cloud(all_keywords)
        concept_viz = analyzer.create_concept_network(concepts)

        # Save results to CSV
        save_advanced_analysis_results(results)

        if progress_callback:
            progress_callback(1.0, "Analisis lanjutan selesai!")

        return "✅ Analisis lanjutan berhasil!", results, topic_viz, keyword_viz, concept_viz
    except Exception as e:
        error_msg = f"❌ Error dalam analisis lanjutan: {str(e)}"
        print(error_msg)
        return error_msg, None, None, None, None


def save_advanced_analysis_results(results):
    """Save the advanced analysis results to CSV files."""
    try:
        os.makedirs('analisis', exist_ok=True)

        # Save topics
        if 'topics' in results and results['topics']:
            topics_df = pd.DataFrame(results['topics']['topics'])
            topics_df.to_csv('analisis/topic_modeling_results.csv', index=False)

        # Save keywords
        if 'keywords' in results and results['keywords']:
            keywords_df = pd.DataFrame(results['keywords'])
            keywords_df.to_csv('analisis/keyword_extraction_results.csv', index=False)

        # Save summaries
        if 'summaries' in results and results['summaries']:
            summaries_df = pd.DataFrame(results['summaries'])
            summaries_df.to_csv('analisis/text_summarization_results.csv', index=False)

        # Save concepts
        if 'concepts' in results and results['concepts']:
            concepts_df = pd.DataFrame(results['concepts'])
            concepts_df.to_csv('analisis/concept_extraction_results.csv', index=False)

        print("✅ Hasil analisis lanjutan disimpan ke folder 'analisis'")
    except Exception as e:
        print(f"❌ Error menyimpan hasil analisis lanjutan: {e}")


# Wrapper function so the main app can call the analysis without extra error handling
def perform_advanced_analysis_wrapper():
    """Wrapper function for the advanced analysis."""
    try:
        # Load metadata
        metadata_df = pd.read_csv('scrapper_result/article_metadata.csv')

        if metadata_df.empty:
            return "❌ Tidak ada data untuk dianalisis", None, None, None, None, None, None, None

        # Perform analysis
        result_msg, results, topic_viz, keyword_viz, concept_viz = perform_advanced_analysis(metadata_df)

        # Prepare dataframes for display
        topic_df = pd.DataFrame(results['topics']['topics']) if results and results.get('topics') else pd.DataFrame()
        keyword_df = pd.DataFrame(results['keywords']) if results and 'keywords' in results else pd.DataFrame()
        summary_df = pd.DataFrame(results['summaries']) if results and 'summaries' in results else pd.DataFrame()
        concept_df = pd.DataFrame(results['concepts']) if results and 'concepts' in results else pd.DataFrame()

        return result_msg, topic_viz, keyword_viz, concept_viz, topic_df, keyword_df, summary_df, concept_df
    except Exception as e:
        return f"❌ Error: {str(e)}", None, None, None, pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
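# Minimal usage sketch (not part of the app): running this module directly on a small,
# hypothetical in-memory DataFrame with the 'judul'/'konten' columns the pipeline expects.
# The real application is assumed to call perform_advanced_analysis_wrapper() from its UI
# layer instead, reading the scraped metadata CSV from disk.
if __name__ == '__main__':
    sample_df = pd.DataFrame({
        'judul': ['Contoh artikel ekonomi', 'Contoh artikel teknologi'],
        'konten': [
            'Pemerintah mengumumkan kebijakan ekonomi baru untuk mendorong pertumbuhan. '
            'Kebijakan ini mencakup insentif pajak dan dukungan untuk usaha kecil di berbagai daerah.',
            'Perusahaan teknologi lokal meluncurkan platform pendidikan daring. '
            'Platform ini menyediakan materi belajar interaktif untuk siswa sekolah menengah.'
        ]
    })
    # Run the full pipeline; results are also written to the 'analisis' folder as CSV files.
    msg, results, topic_viz, keyword_viz, concept_viz = perform_advanced_analysis(sample_df)
    print(msg)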