import pandas as pd
import numpy as np
import re
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction import text
import gensim
from gensim import corpora
from gensim.models import LdaModel
# Removed: problematic import
# from gensim.summarization import summarize as gensim_summarize
from transformers import pipeline
import torch
# Removed: imports that require OMP
# from keybert import KeyBERT
# from yake import KeywordExtractor
import spacy
from collections import defaultdict
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from wordcloud import WordCloud
import io
import base64
import os

# Download required NLTK data
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

try:
    # Newer NLTK releases also need the punkt_tab resource for sent_tokenize
    nltk.data.find('tokenizers/punkt_tab')
except LookupError:
    nltk.download('punkt_tab')

try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')


class AdvancedTextAnalysis:
    def __init__(self):
        self.sentiment_analyzer = None
        self.summarizer = None
        # Removed: problematic models
        # self.keybert_model = None
        # self.nlp = None
        self.stop_words_id = None

        # Load Indonesian stopwords
        self._load_indonesian_stopwords()

    def _load_indonesian_stopwords(self):
        """Load Indonesian stopwords."""
        try:
            # Basic Indonesian stopwords from NLTK
            self.stop_words_id = set(stopwords.words('indonesian'))
        except LookupError:
            # Fallback stopwords
            self.stop_words_id = {
                'yang', 'dan', 'di', 'dengan', 'ini', 'itu', 'dari', 'dalam',
                'untuk', 'pada', 'ke', 'tidak', 'akan', 'ada', 'adalah', 'atau',
                'juga', 'bahwa', 'sebagai', 'dapat', 'oleh', 'karena'
            }

    def preprocess_text(self, text):
        """Preprocess text for analysis."""
        if not text or pd.isna(text):
            return ""

        # Remove punctuation, collapse whitespace, lowercase
        text = re.sub(r'[^\w\s]', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        text = text.strip().lower()
        return text

    def topic_modeling_lda(self, texts, num_topics=5, num_words=10):
        """Topic modeling using LDA."""
        try:
            # Preprocess texts
            processed_texts = [self.preprocess_text(text) for text in texts]

            # Create document-term matrix
            vectorizer = CountVectorizer(
                max_features=1000,
                stop_words=list(self.stop_words_id),
                min_df=2,
                max_df=0.95
            )
            dtm = vectorizer.fit_transform(processed_texts)
            feature_names = vectorizer.get_feature_names_out()

            # Apply LDA
            lda = LatentDirichletAllocation(
                n_components=num_topics,
                random_state=42,
                max_iter=10
            )
            lda.fit(dtm)

            # Extract the top words per topic
            topics = []
            for topic_idx, topic in enumerate(lda.components_):
                top_words_idx = topic.argsort()[-num_words:][::-1]
                top_words = [feature_names[i] for i in top_words_idx]
                topic_words = ", ".join(top_words)
                topics.append({
                    'topic_id': topic_idx + 1,
                    'keywords': topic_words,
                    'top_words': top_words,
                    'topic_weight': round(float(topic.sum()), 2)
                })

            # Get the topic distribution for each document
            topic_distribution = lda.transform(dtm)
            doc_topics = []
            for i, dist in enumerate(topic_distribution):
                dominant_topic = dist.argmax()
                doc_topics.append({
                    'doc_id': i,
                    'dominant_topic': dominant_topic + 1,
                    'topic_confidence': round(float(dist[dominant_topic]), 3),
                    'topic_distribution': dist.tolist()
                })

            return {
                'topics': topics,
                'document_topics': doc_topics,
                'model': lda,
                'vectorizer': vectorizer
            }
        except Exception as e:
            print(f"Error in topic modeling: {e}")
            return None

    def extract_keywords_yake(self, texts, num_keywords=10):
        """Extract keywords (YAKE fallback): simple frequency-based extraction since YAKE was removed."""
        try:
            all_keywords = []
            for i, text in enumerate(texts):
                if not text or len(text.strip()) < 50:
                    continue

                processed_text = self.preprocess_text(text)

                # Simple frequency-based keyword extraction as a fallback
                words = processed_text.split()
                word_freq = Counter(words)

                # Remove stopwords and short words
                filtered_words = {
                    word: freq for word, freq in word_freq.items()
                    if word not in self.stop_words_id and len(word) > 2
                }

                # Get top keywords
                top_keywords = sorted(filtered_words.items(), key=lambda x: x[1], reverse=True)[:num_keywords]

                for keyword, freq in top_keywords:
                    score = freq / len(words)  # Simple frequency-based score
                    all_keywords.append({
                        'doc_id': i,
                        'keyword': keyword,
                        'score': round(score, 4),
                        'type': 'FREQUENCY'
                    })

            return all_keywords
        except Exception as e:
            print(f"Error in YAKE keyword extraction: {e}")
            return []

    def extract_keywords_tfidf(self, texts, num_keywords=10):
        """Extract keywords using TF-IDF."""
        try:
            processed_texts = [self.preprocess_text(text) for text in texts]

            vectorizer = TfidfVectorizer(
                max_features=100,
                stop_words=list(self.stop_words_id),
                ngram_range=(1, 2)
            )
            tfidf_matrix = vectorizer.fit_transform(processed_texts)
            feature_names = vectorizer.get_feature_names_out()

            all_keywords = []
            for i, doc_vector in enumerate(tfidf_matrix):
                scores = doc_vector.toarray().flatten()
                top_indices = scores.argsort()[-num_keywords:][::-1]

                for idx in top_indices:
                    if scores[idx] > 0:
                        all_keywords.append({
                            'doc_id': i,
                            'keyword': feature_names[idx],
                            'score': round(float(scores[idx]), 4),
                            'type': 'TF-IDF'
                        })

            return all_keywords
        except Exception as e:
            print(f"Error in TF-IDF keyword extraction: {e}")
            return []

    def text_summarization(self, texts, ratio=0.3):
        """Text summarization using extractive methods - simplified version."""
        try:
            summaries = []
            for i, text in enumerate(texts):
                if not text or len(text.strip()) < 200:
                    summaries.append({
                        'doc_id': i,
                        'summary': 'Teks terlalu pendek untuk diringkas',
                        'original_length': len(text) if text else 0,
                        'summary_length': 0,
                        'compression_ratio': 0
                    })
                    continue

                try:
                    # Simple extractive summarization: take the first few sentences
                    sentences = sent_tokenize(text)
                    num_sentences = max(1, int(len(sentences) * ratio))
                    summary = ' '.join(sentences[:num_sentences])

                    summaries.append({
                        'doc_id': i,
                        'summary': summary,
                        'original_length': len(text),
                        'summary_length': len(summary),
                        'compression_ratio': round(len(summary) / len(text), 2)
                    })
                except Exception:
                    # Fallback: naive sentence split if NLTK tokenization fails
                    sentences = [s for s in text.split('. ') if s]
                    if len(sentences) > 3:
                        summary = '. '.join(sentences[:3]) + '.'
                    else:
                        summary = text[:500] + '...' if len(text) > 500 else text

                    summaries.append({
                        'doc_id': i,
                        'summary': summary,
                        'original_length': len(text),
                        'summary_length': len(summary),
                        'compression_ratio': round(len(summary) / len(text), 2)
                    })

            return summaries
        except Exception as e:
            print(f"Error in text summarization: {e}")
            return []

    def concept_extraction(self, texts, min_freq=2):
        """Extract concepts using frequency analysis and pattern matching."""
        try:
            # Preprocess texts
            processed_texts = [self.preprocess_text(text) for text in texts]

            # Extract noun phrases (simple pattern-based)
            concepts = []
            concept_patterns = [
                r'\b(\w+\s+\w+\s+\w+)\b',  # 3-word phrases
                r'\b(\w+\s+\w+)\b',        # 2-word phrases
            ]

            for pattern in concept_patterns:
                for i, text in enumerate(processed_texts):
                    matches = re.findall(pattern, text)
                    for match in matches:
                        # Skip phrases made up entirely of stopwords
                        words = match.split()
                        if not all(word in self.stop_words_id for word in words):
                            concepts.append({
                                'doc_id': i,
                                'concept': match,
                                'length': len(words),
                                'type': 'noun_phrase'
                            })

            # Frequency analysis
            concept_freq = Counter([c['concept'] for c in concepts])

            # Filter by frequency and prepare results
            concept_results = []
            for concept, freq in concept_freq.items():
                if freq >= min_freq:
                    concept_results.append({
                        'concept': concept,
                        'frequency': freq,
                        'type': 'noun_phrase'
                    })

            # Sort by frequency and return the top 50 concepts
            concept_results.sort(key=lambda c: c['frequency'], reverse=True)
            return concept_results[:50]
        except Exception as e:
            print(f"Error in concept extraction: {e}")
            return []

    def create_topic_visualization(self, topics):
        """Create a visualization for topic modeling."""
        try:
            if not topics:
                return None

            # Prepare data for visualization
            topic_data = []
            for topic in topics:
                words = topic['top_words'][:5]  # Top 5 words per topic
                for rank, word in enumerate(words):
                    topic_data.append({
                        'topic': f'Topik {topic["topic_id"]}',
                        'word': word,
                        # Rank-based weight (per-word LDA weights are not carried through)
                        'importance': len(words) - rank
                    })

            df = pd.DataFrame(topic_data)

            # Create bar chart
            fig = px.bar(
                df, x='word', y='importance', color='topic',
                title='Distribusi Kata dalam Topik',
                labels={'word': 'Kata', 'importance': 'Importance'},
                height=400
            )
            fig.update_layout(
                xaxis_tickangle=-45,
                showlegend=True,
                template='plotly_white'
            )
            return fig
        except Exception as e:
            print(f"Error creating topic visualization: {e}")
            return None

    def create_keyword_cloud(self, keywords):
        """Create a word cloud for the extracted keywords."""
        try:
            if not keywords:
                return None

            # Aggregate keyword scores into word frequencies
            word_freq = {}
            for kw in keywords:
                word = kw['keyword']
                score = kw['score']
                word_freq[word] = word_freq.get(word, 0) + score

            # Create word cloud
            wordcloud = WordCloud(
                width=800,
                height=400,
                background_color='white',
                colormap='viridis',
                max_words=50
            ).generate_from_frequencies(word_freq)

            # Render to a matplotlib figure
            fig, ax = plt.subplots(figsize=(10, 5))
            ax.imshow(wordcloud, interpolation='bilinear')
            ax.axis('off')
            ax.set_title('WordCloud - Keywords Extraction', fontsize=16, pad=20)
            return fig
        except Exception as e:
            print(f"Error creating keyword cloud: {e}")
            return None

    def create_concept_network(self, concepts):
        """Create a network-style visualization for concepts."""
        try:
            if not concepts:
                return None

            # Prepare data: keep the 15 most frequent concepts
            concept_df = pd.DataFrame(concepts)
            top_concepts = concept_df.nlargest(15, 'frequency')

            # Create bubble chart
            fig = px.scatter(
                top_concepts,
                x='frequency',
                y=[1] * len(top_concepts),  # Dummy y-axis
                size='frequency',
                color='frequency',
                hover_name='concept',
                size_max=60,
                title='Top 15 Concepts - Frequency Distribution'
            )
            fig.update_layout(
                xaxis_title='Frekuensi',
                yaxis_title='',
                template='plotly_white',
                height=400,
                showlegend=False
            )
            fig.update_yaxes(showticklabels=False)
            return fig
        except Exception as e:
            print(f"Error creating concept network: {e}")
            return None


def perform_advanced_analysis(metadata_df, progress_callback=None):
    """Main function for running the advanced analysis."""
    try:
        analyzer = AdvancedTextAnalysis()

        # Prepare texts: combine title and content for each article
        texts = []
        for _, row in metadata_df.iterrows():
            title = row.get('judul', '')
            content = row.get('konten', '')
            title = title if pd.notna(title) else ''
            content = content if pd.notna(content) else ''
            combined_text = f"{title}. {content}" if title else content
            texts.append(combined_text)

        if not texts:
            return "Tidak ada teks untuk dianalisis", None, None, None, None

        results = {}

        # 1. Topic modeling
        if progress_callback:
            progress_callback(0.2, "Melakukan Topic Modeling...")
        topic_results = analyzer.topic_modeling_lda(texts, num_topics=5, num_words=8)
        results['topics'] = topic_results

        # 2. Keyword extraction
        if progress_callback:
            progress_callback(0.4, "Melakukan Keyword Extraction...")
        yake_keywords = analyzer.extract_keywords_yake(texts, num_keywords=5)
        tfidf_keywords = analyzer.extract_keywords_tfidf(texts, num_keywords=5)
        all_keywords = yake_keywords + tfidf_keywords
        results['keywords'] = all_keywords

        # 3. Text summarization
        if progress_callback:
            progress_callback(0.6, "Melakukan Text Summarization...")
        summaries = analyzer.text_summarization(texts, ratio=0.3)
        results['summaries'] = summaries

        # 4. Concept extraction
        if progress_callback:
            progress_callback(0.8, "Melakukan Concept Extraction...")
        concepts = analyzer.concept_extraction(texts, min_freq=2)
        results['concepts'] = concepts

        # Create visualizations
        if progress_callback:
            progress_callback(0.9, "Membuat visualisasi...")
        topic_viz = analyzer.create_topic_visualization(topic_results['topics'] if topic_results else [])
        keyword_viz = analyzer.create_keyword_cloud(all_keywords)
        concept_viz = analyzer.create_concept_network(concepts)

        # Save results to CSV
        save_advanced_analysis_results(results)

        if progress_callback:
            progress_callback(1.0, "Analisis lanjutan selesai!")

        return "✅ Analisis lanjutan berhasil!", results, topic_viz, keyword_viz, concept_viz
    except Exception as e:
        error_msg = f"❌ Error dalam analisis lanjutan: {str(e)}"
        print(error_msg)
        return error_msg, None, None, None, None


def save_advanced_analysis_results(results):
    """Save the advanced analysis results to CSV files."""
    try:
        os.makedirs('analisis', exist_ok=True)

        # Save topics
        if 'topics' in results and results['topics']:
            topics_df = pd.DataFrame(results['topics']['topics'])
            topics_df.to_csv('analisis/topic_modeling_results.csv', index=False)

        # Save keywords
        if 'keywords' in results and results['keywords']:
            keywords_df = pd.DataFrame(results['keywords'])
            keywords_df.to_csv('analisis/keyword_extraction_results.csv', index=False)

        # Save summaries
        if 'summaries' in results and results['summaries']:
            summaries_df = pd.DataFrame(results['summaries'])
            summaries_df.to_csv('analisis/text_summarization_results.csv', index=False)

        # Save concepts
        if 'concepts' in results and results['concepts']:
            concepts_df = pd.DataFrame(results['concepts'])
            concepts_df.to_csv('analisis/concept_extraction_results.csv', index=False)

        print("✅ Hasil analisis lanjutan disimpan ke folder 'analisis'")
    except Exception as e:
        print(f"❌ Error menyimpan hasil analisis lanjutan: {e}")


# Wrapper function so the main app can call the analysis without extra error handling
def perform_advanced_analysis_wrapper():
    """Wrapper function for the advanced analysis."""
    try:
        # Load metadata
        metadata_df = pd.read_csv('scrapper_result/article_metadata.csv')

        if metadata_df.empty:
            return "❌ Tidak ada data untuk dianalisis", None, None, None, None, None, None, None

        # Perform analysis
        result_msg, results, topic_viz, keyword_viz, concept_viz = perform_advanced_analysis(metadata_df)

        # Prepare dataframes for display
        topic_df = pd.DataFrame(results['topics']['topics']) if results and results.get('topics') else pd.DataFrame()
        keyword_df = pd.DataFrame(results['keywords']) if results and 'keywords' in results else pd.DataFrame()
        summary_df = pd.DataFrame(results['summaries']) if results and 'summaries' in results else pd.DataFrame()
        concept_df = pd.DataFrame(results['concepts']) if results and 'concepts' in results else pd.DataFrame()

        return result_msg, topic_viz, keyword_viz, concept_viz, topic_df, keyword_df, summary_df, concept_df
    except Exception as e:
        return f"❌ Error: {str(e)}", None, None, None, pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
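# Minimal usage sketch (not part of the app): running this module directly on a small,
# hypothetical in-memory DataFrame with the 'judul'/'konten' columns the pipeline expects.
# The real application is assumed to call perform_advanced_analysis_wrapper() from its UI
# layer instead, reading the scraped metadata CSV from disk.
if __name__ == '__main__':
    sample_df = pd.DataFrame({
        'judul': ['Contoh artikel ekonomi', 'Contoh artikel teknologi'],
        'konten': [
            'Pemerintah mengumumkan kebijakan ekonomi baru untuk mendorong pertumbuhan. '
            'Kebijakan ini mencakup insentif pajak dan dukungan untuk usaha kecil di berbagai daerah.',
            'Perusahaan teknologi lokal meluncurkan platform pendidikan daring. '
            'Platform ini menyediakan materi belajar interaktif untuk siswa sekolah menengah.'
        ]
    })
    # Run the full pipeline; results are also written to the 'analisis' folder as CSV files.
    msg, results, topic_viz, keyword_viz, concept_viz = perform_advanced_analysis(sample_df)
    print(msg)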