Spaces:

abdullah-alamodi
/

aqeedah-ai

Running

abdullah-alamodi committed 9 days ago

Commit

6801b16

1 Parent(s): 29c6541

Add regex import and enhance normalization function for BM25

Files changed (1) hide show

retrieval.py CHANGED Viewed

@@ -12,6 +12,7 @@ Author: Abdullah Alamodi
 Dataset: https://huggingface.co/datasets/abdullah-alamodi/aqeedah-rag-dataset
 """
 import numpy as np
 import torch
 import pyarabic.araby as araby
@@ -36,6 +37,14 @@ def normalize_for_bm25(text):
     text = araby.normalize_hamza(text)   # Normalize alef, hamza
     text = araby.strip_diacritics(text)  # Strip all diacritics
     text = araby.strip_tatweel(text)     # Strip tatweel (elongation)
     return text

 Dataset: https://huggingface.co/datasets/abdullah-alamodi/aqeedah-rag-dataset
 """
+import re
 import numpy as np
 import torch
 import pyarabic.araby as araby
     text = araby.normalize_hamza(text)   # Normalize alef, hamza
     text = araby.strip_diacritics(text)  # Strip all diacritics
     text = araby.strip_tatweel(text)     # Strip tatweel (elongation)
+    # Remove punctuation that harms BM25 token matching, especially question marks.
+    # Keep alphanumeric and Arabic letters; replace punctuation with a space.
+    # This removes Arabic question mark '؟' and ASCII '?', commas, dots, parentheses, etc.
+    text = re.sub(r"[؟\?\.,،؛:!\"'\(\)\[\]\-–—…«»ـ]", " ", text)
+    # Collapse multiple whitespace into a single space and strip ends
+    text = re.sub(r"\s+", " ", text).strip()
     return text