""" Aqeedah RAG Retrieval System ============================ This module implements a hybrid retrieval system combining BM25 (sparse) and dense embeddings (FAISS) for Arabic Islamic theology (Aqeedah) question-answering. The system loads pre-computed embeddings from HuggingFace and performs efficient semantic search over a corpus of authenticated Islamic scholarly texts. Author: Abdullah Alamodi Dataset: https://huggingface.co/datasets/abdullah-alamodi/aqeedah-rag-dataset """ import re import numpy as np import torch import pyarabic.araby as araby from rank_bm25 import BM25Okapi from transformers import AutoTokenizer, AutoModel from datasets import load_dataset from tqdm import tqdm # --- Normalization Functions --- def normalize_for_bm25(text): """ Aggressive normalization for keyword matching. Args: text (str): Arabic text to normalize Returns: str: Normalized text suitable for BM25 indexing """ text = araby.normalize_hamza(text) # Normalize alef, hamza text = araby.strip_diacritics(text) # Strip all diacritics text = araby.strip_tatweel(text) # Strip tatweel (elongation) # Remove punctuation that harms BM25 token matching, especially question marks. # Keep alphanumeric and Arabic letters; replace punctuation with a space. # This removes Arabic question mark '؟' and ASCII '?', commas, dots, parentheses, etc. text = re.sub(r"[؟\?\.,،؛:!\"'\(\)\[\]\-–—…«»ـ]", " ", text) # Collapse multiple whitespace into a single space and strip ends text = re.sub(r"\s+", " ", text).strip() return text def normalize_for_dense(text): """ Light normalization for dense embeddings, preserves diacritics. Args: text (str): Arabic text to normalize Returns: str: Lightly normalized text suitable for dense embeddings """ text = araby.normalize_hamza(text) # Only normalize alef, hamza return text # --- Helper function for pooling embeddings --- def _average_pool(last_hidden_states, attention_mask): """ Performs average pooling on the last hidden states, respecting the attention mask. Args: last_hidden_states: Model output hidden states attention_mask: Attention mask tensor Returns: torch.Tensor: Pooled embeddings """ last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0) return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None] # --- HybridRetriever Class --- class HybridRetriever: """ Hybrid retrieval system combining BM25 and dense embeddings. This class loads the Aqeedah dataset from HuggingFace, creates a BM25 index, and loads pre-computed FAISS embeddings for efficient hybrid search. Args: embedding_model (str): HuggingFace model name for query encoding dataset_name (str): HuggingFace dataset name batch_size (int): Batch size for encoding (if needed) use_gpu (bool): Whether to use GPU if available """ def __init__( self, embedding_model="aubmindlab/bert-base-arabertv02", dataset_name="abdullah-alamodi/aqeedah-rag-dataset", batch_size=32, use_gpu=True ): self.dataset_name = dataset_name self.batch_size = batch_size self.device = torch.device("cuda" if (torch.cuda.is_available() and use_gpu) else "cpu") print(f"🚀 Initializing HybridRetriever") print(f" Device: {self.device}") print(f" Embedding model: {embedding_model}") print(f" Dataset: {dataset_name}") # --- Load Dataset from HuggingFace --- print(f"\n📥 Loading dataset from HuggingFace...") self.dataset = load_dataset(dataset_name, split="train") self.documents = [ { "content": doc["content"], "meta": doc["meta"] } for doc in self.dataset ] print(f" ✓ Loaded {len(self.documents)} documents") # --- Load Dense Model --- print(f"\n🤖 Loading embedding model...") self.dense_tokenizer = AutoTokenizer.from_pretrained(embedding_model) self.dense_model = AutoModel.from_pretrained(embedding_model).to(self.device).eval() print(f" ✓ Model loaded on {self.device}") # --- Create BM25 Index --- print(f"\n🔍 Building BM25 index...") bm25_corpus = [normalize_for_bm25(doc['content']) for doc in self.documents] tokenized_corpus = [doc.split() for doc in tqdm(bm25_corpus, desc=" Tokenizing")] self.bm25 = BM25Okapi(tokenized_corpus) print(f" ✓ BM25 index created") # --- Load FAISS Index from HuggingFace --- print(f"\n⚡ Loading FAISS index...") try: # Try to load pre-computed index from dataset self.dataset.load_faiss_index("embeddings", "embeddings_index") print(f" ✓ FAISS index loaded from HuggingFace") except Exception as e: print(f" ⚠️ Could not load FAISS index from HF: {e}") print(f" 📊 Creating FAISS index from embeddings column...") # Create index from embeddings column if not available self.dataset.add_faiss_index(column="embeddings") print(f" ✓ FAISS index created") print(f"\n✅ HybridRetriever initialized successfully!\n") def search(self, query, top_k=5): """ Perform hybrid search combining BM25 and dense retrieval. Args: query (str): Arabic query text top_k (int): Number of top results to return Returns: list: List of retrieved documents with metadata """ # A. Sparse Search (BM25) norm_query_bm25 = normalize_for_bm25(query) tokenized_query = norm_query_bm25.split() bm25_scores = self.bm25.get_scores(tokenized_query) bm25_top_k_indices = np.argsort(bm25_scores)[::-1][:top_k] # B. Dense Search (FAISS) norm_query_dense = f"query: {normalize_for_dense(query)}" inputs = self.dense_tokenizer( [norm_query_dense], max_length=512, padding=True, truncation=True, return_tensors='pt' ).to(self.device) with torch.no_grad(): outputs = self.dense_model(**inputs) query_embedding = _average_pool(outputs.last_hidden_state, inputs['attention_mask']) query_embedding = torch.nn.functional.normalize(query_embedding, p=2, dim=1) query_embedding = query_embedding.cpu().numpy()[0] # Search using HuggingFace dataset's FAISS index scores, retrieved = self.dataset.get_nearest_examples( "embeddings", query_embedding, k=top_k ) # Get FAISS indices (retrieve actual indices from the dataset) # Since get_nearest_examples returns the actual data, we need to find indices faiss_contents = retrieved['content'] faiss_indices = [] for content in faiss_contents: # Find index of this content in original documents for idx, doc in enumerate(self.documents): if doc['content'] == content: faiss_indices.append(idx) break # C. Combine results (union of BM25 and FAISS) fused_indices = list(set(bm25_top_k_indices.tolist()) | set(faiss_indices)) # Return unique results results = [] seen_contents = set() for idx in fused_indices: if idx < len(self.documents): content = self.documents[idx]['content'] if content not in seen_contents: results.append(self.documents[idx]) seen_contents.add(content) return results[:top_k * 2] # Return up to 2x top_k for better coverage # --- Example Usage --- if __name__ == "__main__": # Initialize retriever retriever = HybridRetriever() # Example queries test_queries = [ "ما معنى شهادة أن لا إله إلا الله وأن محمدا رسول الله صلى الله عليه وسلم؟", "ما الفرق بين الإسلام والإيمان؟", "ما هي أركان الإيمان؟" ] for query in test_queries: print(f"\n{'='*80}") print(f"❓ Query: {query}") print(f"{'='*80}") results = retriever.search(query, top_k=3) for i, result in enumerate(results, 1): print(f"\n[{i}] {result['meta']['doc_name']}") print(f" Author: {result['meta']['author_name']}") print(f" Paragraph: {result['meta']['paragraph_number']}") print(f" Preview: {result['content'][:150]}...")