abdullah-alamodi committed on
Commit
6801b16
·
1 Parent(s): 29c6541

Add regex import and enhance normalization function for BM25

Browse files
Files changed (1) hide show
  1. retrieval.py +9 -0
retrieval.py CHANGED
@@ -12,6 +12,7 @@ Author: Abdullah Alamodi
12
  Dataset: https://huggingface.co/datasets/abdullah-alamodi/aqeedah-rag-dataset
13
  """
14
 
 
15
  import numpy as np
16
  import torch
17
  import pyarabic.araby as araby
@@ -36,6 +37,14 @@ def normalize_for_bm25(text):
36
  text = araby.normalize_hamza(text) # Normalize alef, hamza
37
  text = araby.strip_diacritics(text) # Strip all diacritics
38
  text = araby.strip_tatweel(text) # Strip tatweel (elongation)
 
 
 
 
 
 
 
 
39
  return text
40
 
41
 
 
12
  Dataset: https://huggingface.co/datasets/abdullah-alamodi/aqeedah-rag-dataset
13
  """
14
 
15
+ import re
16
  import numpy as np
17
  import torch
18
  import pyarabic.araby as araby
 
37
  text = araby.normalize_hamza(text) # Normalize alef, hamza
38
  text = araby.strip_diacritics(text) # Strip all diacritics
39
  text = araby.strip_tatweel(text) # Strip tatweel (elongation)
40
+
41
+ # Remove punctuation that harms BM25 token matching, especially question marks.
42
+ # Keep alphanumeric and Arabic letters; replace punctuation with a space.
43
+ # This removes Arabic question mark '؟' and ASCII '?', commas, dots, parentheses, etc.
44
+ text = re.sub(r"[؟\?\.,،؛:!\"'\(\)\[\]\-–—…«»ـ]", " ", text)
45
+
46
+ # Collapse multiple whitespace into a single space and strip ends
47
+ text = re.sub(r"\s+", " ", text).strip()
48
  return text
49
 
50