"""
Aqeedah RAG Retrieval System
============================

This module implements a hybrid retrieval system combining BM25 (sparse) and 
dense embeddings (FAISS) for Arabic Islamic theology (Aqeedah) question-answering.

The system loads pre-computed embeddings from HuggingFace and performs efficient
semantic search over a corpus of authenticated Islamic scholarly texts.

Author: Abdullah Alamodi
Dataset: https://huggingface.co/datasets/abdullah-alamodi/aqeedah-rag-dataset
"""

import re
import numpy as np
import torch
import pyarabic.araby as araby
from rank_bm25 import BM25Okapi
from transformers import AutoTokenizer, AutoModel
from datasets import load_dataset
from tqdm import tqdm


# --- Normalization Functions ---

def normalize_for_bm25(text):
    """
    Normalize Arabic text aggressively so it can be whitespace-tokenized for BM25.

    The text goes through three pyarabic passes (hamza normalization,
    diacritic stripping, tatweel stripping), then punctuation is replaced by
    spaces and runs of whitespace are collapsed.

    Args:
        text (str): Arabic text to normalize

    Returns:
        str: Normalized text with single-space separators and no leading or
            trailing whitespace
    """
    # Orthographic folding, applied in a fixed order: hamza forms first,
    # then diacritics, then tatweel (elongation).
    for fold in (araby.normalize_hamza, araby.strip_diacritics, araby.strip_tatweel):
        text = fold(text)

    # Punctuation glued to a word would create a distinct BM25 token
    # (e.g. a word followed by the Arabic question mark), so every listed
    # mark becomes a space rather than being deleted outright.
    without_punctuation = re.sub(r"[؟\?\.,،؛:!\"'\(\)\[\]\-–—…«»ـ]", " ", text)

    # Squeeze whitespace runs to one space and trim both ends.
    collapsed = re.sub(r"\s+", " ", without_punctuation)
    return collapsed.strip()


def normalize_for_dense(text):
    """
    Apply the light normalization used before dense encoding.

    Only pyarabic's hamza normalization is applied; no diacritic, tatweel or
    punctuation stripping is performed here.

    Args:
        text (str): Arabic text to normalize

    Returns:
        str: The hamza-normalized text
    """
    return araby.normalize_hamza(text)


# --- Helper function for pooling embeddings ---

def _average_pool(last_hidden_states, attention_mask):
    """
    Mean-pool token embeddings over dimension 1, ignoring masked positions.

    Positions where ``attention_mask`` is zero are zeroed out before summing,
    and the sum is divided by the number of unmasked positions. A row with no
    unmasked positions yields a zero vector instead of NaN.

    Args:
        last_hidden_states: Model output hidden states
            (assumed shape ``(batch, seq_len, hidden)`` -- TODO confirm)
        attention_mask: Attention mask tensor, non-zero where a token is real
            (assumed shape ``(batch, seq_len)`` -- TODO confirm)

    Returns:
        torch.Tensor: Pooled embeddings, one vector per batch row
    """
    keep = attention_mask[..., None].bool()
    summed = last_hidden_states.masked_fill(~keep, 0.0).sum(dim=1)

    # Clamp the token count to at least 1: an all-zero mask row would
    # otherwise divide 0 by 0 and propagate NaN into every later similarity.
    token_counts = attention_mask.sum(dim=1)[..., None].clamp(min=1)
    return summed / token_counts


# --- HybridRetriever Class ---

class HybridRetriever:
    """
    Hybrid retrieval system combining BM25 and dense embeddings.

    On construction this class loads the dataset's ``train`` split, builds an
    in-memory BM25 index over the normalized ``content`` field, loads the
    query-encoder model, and attaches a FAISS index named ``"embeddings"`` to
    the dataset. ``search`` then merges the BM25 and FAISS rankings with
    reciprocal rank fusion.

    Args:
        embedding_model (str): HuggingFace model name for query encoding
        dataset_name (str): HuggingFace dataset name
        batch_size (int): Batch size for encoding; stored on the instance but
            not used by the code in this class
        use_gpu (bool): Whether to use GPU if available
    """

    # Smoothing constant of reciprocal rank fusion; 60 is the value commonly
    # used in the literature and keeps low ranks from dominating the score.
    _RRF_K = 60

    def __init__(
        self,
        embedding_model="aubmindlab/bert-base-arabertv02",
        dataset_name="abdullah-alamodi/aqeedah-rag-dataset",
        batch_size=32,
        use_gpu=True
    ):
        self.dataset_name = dataset_name
        self.batch_size = batch_size
        self.device = torch.device("cuda" if (torch.cuda.is_available() and use_gpu) else "cpu")

        print(f"🚀 Initializing HybridRetriever")
        print(f"   Device: {self.device}")
        print(f"   Embedding model: {embedding_model}")
        print(f"   Dataset: {dataset_name}")

        # --- Load Dataset from HuggingFace ---
        print(f"\n📥 Loading dataset from HuggingFace...")
        self.dataset = load_dataset(dataset_name, split="train")
        # Plain-dict copy of each row, in dataset order, so that position i in
        # self.documents corresponds to row i of self.dataset.
        self.documents = [
            {
                "content": doc["content"],
                "meta": doc["meta"]
            }
            for doc in self.dataset
        ]
        print(f"   ✓ Loaded {len(self.documents)} documents")

        # --- Load Dense Model ---
        print(f"\n🤖 Loading embedding model...")
        self.dense_tokenizer = AutoTokenizer.from_pretrained(embedding_model)
        self.dense_model = AutoModel.from_pretrained(embedding_model).to(self.device).eval()
        print(f"   ✓ Model loaded on {self.device}")

        # --- Create BM25 Index ---
        print(f"\n🔍 Building BM25 index...")
        bm25_corpus = [normalize_for_bm25(doc['content']) for doc in self.documents]
        tokenized_corpus = [doc.split() for doc in tqdm(bm25_corpus, desc="   Tokenizing")]
        self.bm25 = BM25Okapi(tokenized_corpus)
        print(f"   ✓ BM25 index created")

        # --- Attach FAISS Index ---
        print(f"\n⚡ Loading FAISS index...")
        try:
            # Try a previously serialized index first. The second argument is
            # a file path, so this looks for "embeddings_index" relative to
            # the current working directory.
            self.dataset.load_faiss_index("embeddings", "embeddings_index")
            print(f"   ✓ FAISS index loaded from HuggingFace")
        except Exception as e:
            # Deliberate best-effort fallback: any failure to load the saved
            # index is reported and the index is rebuilt from the dataset's
            # "embeddings" column. Errors from the rebuild itself propagate.
            print(f"   ⚠️  Could not load FAISS index from HF: {e}")
            print(f"   📊 Creating FAISS index from embeddings column...")
            self.dataset.add_faiss_index(column="embeddings")
            print(f"   ✓ FAISS index created")

        print(f"\n✅ HybridRetriever initialized successfully!\n")

    def search(self, query, top_k=5):
        """
        Perform hybrid search combining BM25 and dense retrieval.

        Up to ``top_k`` candidates are taken from BM25 and up to ``top_k``
        from the FAISS index. The two rankings are merged with reciprocal
        rank fusion, so documents found by both retrievers, or ranked high by
        either, come first. Documents with identical ``content`` are returned
        only once.

        Args:
            query (str): Arabic query text
            top_k (int): Number of candidates requested from each retriever

        Returns:
            list: Retrieved documents (dicts with ``content`` and ``meta``),
                most relevant first, at most ``2 * top_k`` entries
        """
        bm25_ranking = self._bm25_ranking(query, top_k)
        dense_ranking = self._dense_ranking(query, top_k)
        fused_indices = self._fuse_rankings([bm25_ranking, dense_ranking], self._RRF_K)

        results = []
        seen_contents = set()
        doc_count = len(self.documents)
        for idx in fused_indices:
            # Guard against an index that refers to rows outside self.documents.
            if not 0 <= idx < doc_count:
                continue
            doc = self.documents[idx]
            content = doc['content']
            if content in seen_contents:
                continue
            seen_contents.add(content)
            results.append(doc)

        return results[:top_k * 2]  # Return up to 2x top_k for better coverage

    def _bm25_ranking(self, query, top_k):
        """Return up to ``top_k`` corpus indices ranked by BM25 score, best first."""
        tokens = normalize_for_bm25(query).split()
        if not tokens:
            # Nothing to match on; every score would be 0.
            return []

        scores = self.bm25.get_scores(tokens)
        best_first = np.argsort(scores)[::-1][:top_k]
        # A score of 0 means no query term contributed, so such documents are
        # not real keyword hits and would only add arbitrary rows to the result.
        return [int(i) for i in best_first if scores[i] > 0]

    def _encode_query(self, query):
        """Encode ``query`` into an L2-normalized 1-D numpy embedding."""
        # NOTE(review): the "query: " prefix with mean pooling looks like the
        # E5-style convention -- confirm it matches the model and the prefix
        # used to produce the dataset's stored "embeddings" column.
        norm_query_dense = f"query: {normalize_for_dense(query)}"
        inputs = self.dense_tokenizer(
            [norm_query_dense],
            max_length=512,
            padding=True,
            truncation=True,
            return_tensors='pt'
        ).to(self.device)

        with torch.no_grad():
            outputs = self.dense_model(**inputs)

        query_embedding = _average_pool(outputs.last_hidden_state, inputs['attention_mask'])
        query_embedding = torch.nn.functional.normalize(query_embedding, p=2, dim=1)
        return query_embedding.cpu().numpy()[0]

    def _dense_ranking(self, query, top_k):
        """Return up to ``top_k`` dataset row indices from the FAISS index, best first."""
        query_embedding = self._encode_query(query)
        # search() yields row indices directly, which avoids mapping retrieved
        # rows back to positions by comparing content strings.
        _, indices = self.dataset.search("embeddings", query_embedding, k=top_k)
        # Negative indices mark slots for which no neighbour was found.
        return [int(i) for i in indices if i >= 0]

    @staticmethod
    def _fuse_rankings(rankings, k=60):
        """
        Merge several best-first index rankings with reciprocal rank fusion.

        Each index scores ``sum(1 / (k + rank))`` over the rankings it appears
        in (ranks start at 1). Indices are returned by descending score; ties
        keep the order in which the indices were first seen.
        """
        fused = {}
        for ranking in rankings:
            for rank, idx in enumerate(ranking, start=1):
                fused[idx] = fused.get(idx, 0.0) + 1.0 / (k + rank)
        return sorted(fused, key=fused.get, reverse=True)


# --- Example Usage ---

if __name__ == "__main__":
    # Demo run: build the retriever once, then issue a few sample questions
    # and print a short summary of every hit.
    retriever = HybridRetriever()

    sample_queries = (
        "ما معنى شهادة أن لا إله إلا الله وأن محمدا رسول الله صلى الله عليه وسلم؟",
        "ما الفرق بين الإسلام والإيمان؟",
        "ما هي أركان الإيمان؟",
    )

    for query in sample_queries:
        # Banner around the query text.
        print(f"\n{'='*80}")
        print(f"❓ Query: {query}")
        print(f"{'='*80}")

        hits = retriever.search(query, top_k=3)

        # One numbered entry per hit: source document, author, paragraph,
        # and the first 150 characters of the passage.
        for i, result in enumerate(hits, start=1):
            print(f"\n[{i}] {result['meta']['doc_name']}")
            print(f"    Author: {result['meta']['author_name']}")
            print(f"    Paragraph: {result['meta']['paragraph_number']}")
            print(f"    Preview: {result['content'][:150]}...")