abdfajar707 committed · verified · Commit d868235 · Parent(s): 2cfa172

Update advanced_analysis.py

Files changed (1): advanced_analysis.py (+53 -22)
advanced_analysis.py CHANGED
@@ -11,11 +11,13 @@ from sklearn.feature_extraction import text
 import gensim
 from gensim import corpora
 from gensim.models import LdaModel
- from gensim.summarization import summarize as gensim_summarize
+ # Remove the problematic import
+ # from gensim.summarization import summarize as gensim_summarize
 from transformers import pipeline
 import torch
- from keybert import KeyBERT
- from yake import KeywordExtractor
+ # Remove the imports that require OMP
+ # from keybert import KeyBERT
+ # from yake import KeywordExtractor
 import spacy
 from collections import defaultdict
 import matplotlib.pyplot as plt
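A guarded import is a common alternative to commenting the lines out: it keeps the optional backends usable in environments where they still load. A minimal sketch, not part of this commit (the HAS_* flags are hypothetical names):

    # Optional dependencies behind feature flags (hypothetical HAS_* names).
    try:
        # gensim.summarization was removed in gensim 4.0
        from gensim.summarization import summarize as gensim_summarize
        HAS_GENSIM_SUMMARIZE = True
    except ImportError:
        HAS_GENSIM_SUMMARIZE = False

    try:
        # keybert/yake load OMP-backed native libraries that can fail at import time
        from keybert import KeyBERT
        from yake import KeywordExtractor
        HAS_KEYBERT_YAKE = True
    except (ImportError, OSError):
        HAS_KEYBERT_YAKE = False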
@@ -24,6 +26,7 @@ import plotly.graph_objects as go
 from wordcloud import WordCloud
 import io
 import base64
+ import os
 
 # Download NLTK data
 try:
@@ -40,8 +43,9 @@ class AdvancedTextAnalysis:
     def __init__(self):
         self.sentiment_analyzer = None
         self.summarizer = None
-         self.keybert_model = None
-         self.nlp = None
+         # Remove the problematic models
+         # self.keybert_model = None
+         # self.nlp = None
         self.stop_words_id = None
 
         # Load Indonesian stopwords
@@ -138,32 +142,34 @@ class AdvancedTextAnalysis:
 
     def extract_keywords_yake(self, texts, num_keywords=10):
         """
-         Extract keywords using YAKE
+         Extract keywords using YAKE - fallback version
         """
         try:
-             keyword_extractor = KeywordExtractor(
-                 lan="id",
-                 n=2,  # n-gram size
-                 dedupLim=0.8,
-                 dedupFunc='seqm',
-                 windowsSize=1,
-                 top=num_keywords
-             )
-
             all_keywords = []
             for i, text in enumerate(texts):
                 if not text or len(text.strip()) < 50:
                     continue
 
                 processed_text = self.preprocess_text(text)
-                 keywords = keyword_extractor.extract_keywords(processed_text)
 
-                 for score, keyword in keywords:
+                 # Simple frequency-based keyword extraction as fallback
+                 words = processed_text.split()
+                 word_freq = Counter(words)
+
+                 # Remove stopwords and short words
+                 filtered_words = {word: freq for word, freq in word_freq.items()
+                                   if word not in self.stop_words_id and len(word) > 2}
+
+                 # Get top keywords
+                 top_keywords = sorted(filtered_words.items(), key=lambda x: x[1], reverse=True)[:num_keywords]
+
+                 for keyword, freq in top_keywords:
+                     score = freq / len(words)  # Simple frequency-based score
                     all_keywords.append({
                         'doc_id': i,
                         'keyword': keyword,
                         'score': round(score, 4),
-                         'type': 'YAKE'
+                         'type': 'FREQUENCY'
                     })
 
             return all_keywords
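The fallback scores a word by its raw count over the document's token total. Note that it calls Counter, which the imports visible in this diff don't add (only defaultdict is imported from collections); the sketch below is self-contained, with frequency_keywords as a hypothetical standalone helper mirroring the committed loop:

    from collections import Counter

    def frequency_keywords(text, stop_words, num_keywords=10):
        """Frequency-based fallback: score = word count / total tokens."""
        words = text.lower().split()
        if not words:
            return []
        counts = Counter(words)
        # Mirror the committed filter: drop stopwords and tokens of <= 2 chars
        filtered = {w: f for w, f in counts.items()
                    if w not in stop_words and len(w) > 2}
        top = sorted(filtered.items(), key=lambda kv: kv[1], reverse=True)[:num_keywords]
        return [{'keyword': w, 'score': round(f / len(words), 4), 'type': 'FREQUENCY'}
                for w, f in top]

    # Tiny illustration with an abbreviated Indonesian stopword set:
    print(frequency_keywords("analisis data dan analisis teks untuk penelitian",
                             {'dan', 'untuk', 'yang', 'di'}, num_keywords=3))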
@@ -210,7 +216,7 @@ class AdvancedTextAnalysis:
 
     def text_summarization(self, texts, ratio=0.3):
         """
-         Text summarization using extractive methods
+         Text summarization using extractive methods - simplified version
         """
         try:
             summaries = []
@@ -227,8 +233,10 @@ class AdvancedTextAnalysis:
                     continue
 
                 try:
-                     # Use gensim for summarization
-                     summary = gensim_summarize(text, ratio=ratio)
+                     # Simple extractive summarization: take the first few sentences
+                     sentences = sent_tokenize(text)
+                     num_sentences = max(1, int(len(sentences) * ratio))
+                     summary = ' '.join(sentences[:num_sentences])
 
                     summaries.append({
                         'doc_id': i,
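The replacement is lead-based extraction: keep the first ratio share of sentences instead of gensim's TextRank ranking, trading salience ranking for zero extra dependencies. A standalone sketch of the same idea (assumes NLTK's punkt data is downloaded, which the file's NLTK setup already handles):

    from nltk.tokenize import sent_tokenize

    def lead_summary(text, ratio=0.3):
        """Lead-based extractive summary: first share of sentences, at least one."""
        sentences = sent_tokenize(text)
        num_sentences = max(1, int(len(sentences) * ratio))
        return ' '.join(sentences[:num_sentences])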
@@ -516,4 +524,27 @@ def save_advanced_analysis_results(results):
         print("✅ Hasil analisis lanjutan disimpan ke folder 'analisis'")
 
     except Exception as e:
-         print(f"❌ Error menyimpan hasil analisis lanjutan: {e}")
+         print(f"❌ Error menyimpan hasil analisis lanjutan: {e}")
+
+ # Add a dummy wrapper function to avoid errors in the main app
+ def perform_advanced_analysis_wrapper():
+     """Wrapper function for the advanced analysis"""
+     try:
+         # Load metadata
+         metadata_df = pd.read_csv('scrapper_result/article_metadata.csv')
+
+         if metadata_df.empty:
+             return "❌ Tidak ada data untuk dianalisis", None, None, None, None, None, None, None
+
+         # Perform analysis
+         result_msg, results, topic_viz, keyword_viz, concept_viz = perform_advanced_analysis(metadata_df)
+
+         # Prepare dataframes
+         topic_df = pd.DataFrame(results['topics']['topics']) if results and 'topics' in results else pd.DataFrame()
+         keyword_df = pd.DataFrame(results['keywords']) if results and 'keywords' in results else pd.DataFrame()
+         summary_df = pd.DataFrame(results['summaries']) if results and 'summaries' in results else pd.DataFrame()
+         concept_df = pd.DataFrame(results['concepts']) if results and 'concepts' in results else pd.DataFrame()
+
+         return result_msg, topic_viz, keyword_viz, concept_viz, topic_df, keyword_df, summary_df, concept_df
+     except Exception as e:
+         return f"❌ Error: {str(e)}", None, None, None, pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
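The wrapper always returns eight values (a message, three figures, four DataFrames), so a UI can unpack it unconditionally; note that the empty-metadata branch fills the DataFrame slots with None while the exception branch uses empty DataFrames. A hypothetical caller guarding for both:

    import pandas as pd

    # Hypothetical caller of the committed wrapper; names match the return order.
    (msg, topic_viz, keyword_viz, concept_viz,
     topic_df, keyword_df, summary_df, concept_df) = perform_advanced_analysis_wrapper()
    print(msg)
    if isinstance(topic_df, pd.DataFrame) and not topic_df.empty:
        print(topic_df.head())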