Spaces:
Running
Running
Commit
·
6801b16
1
Parent(s):
29c6541
Add regex import and enhance normalization function for BM25
Browse files — retrieval.py (+9 -0)
retrieval.py
CHANGED
|
@@ -12,6 +12,7 @@ Author: Abdullah Alamodi
|
|
| 12 |
Dataset: https://huggingface.co/datasets/abdullah-alamodi/aqeedah-rag-dataset
|
| 13 |
"""
|
| 14 |
|
|
|
|
| 15 |
import numpy as np
|
| 16 |
import torch
|
| 17 |
import pyarabic.araby as araby
|
|
@@ -36,6 +37,14 @@ def normalize_for_bm25(text):
|
|
| 36 |
text = araby.normalize_hamza(text) # Normalize alef, hamza
|
| 37 |
text = araby.strip_diacritics(text) # Strip all diacritics
|
| 38 |
text = araby.strip_tatweel(text) # Strip tatweel (elongation)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
return text
|
| 40 |
|
| 41 |
|
|
|
|
| 12 |
Dataset: https://huggingface.co/datasets/abdullah-alamodi/aqeedah-rag-dataset
|
| 13 |
"""
|
| 14 |
|
| 15 |
+
import re
|
| 16 |
import numpy as np
|
| 17 |
import torch
|
| 18 |
import pyarabic.araby as araby
|
|
|
|
| 37 |
text = araby.normalize_hamza(text) # Normalize alef, hamza
|
| 38 |
text = araby.strip_diacritics(text) # Strip all diacritics
|
| 39 |
text = araby.strip_tatweel(text) # Strip tatweel (elongation)
|
| 40 |
+
|
| 41 |
+
# Remove punctuation that harms BM25 token matching, especially question marks.
|
| 42 |
+
# Keep alphanumeric and Arabic letters; replace punctuation with a space.
|
| 43 |
+
# This removes Arabic question mark '؟' and ASCII '?', commas, dots, parentheses, etc.
|
| 44 |
+
text = re.sub(r"[؟\?\.,،؛:!\"'\(\)\[\]\-–—…«»ـ]", " ", text)
|
| 45 |
+
|
| 46 |
+
# Collapse multiple whitespace into a single space and strip ends
|
| 47 |
+
text = re.sub(r"\s+", " ", text).strip()
|
| 48 |
return text
|
| 49 |
|
| 50 |
|