Spaces:
Runtime error
Runtime error
comments
Browse files
- client.py +3 -1
- preprocessing.py +4 -2
client.py
CHANGED
|
@@ -20,11 +20,12 @@ class HybridClient:
|
|
| 20 |
|
| 21 |
def create(self, collection: str):
|
| 22 |
if not self.qdrant_client.collection_exists(collection):
|
| 23 |
-
self.create_collection(
|
| 24 |
collection_name=collection,
|
| 25 |
vectors_config=self.qdrant_client.get_fastembed_vector_params(),
|
| 26 |
sparse_vectors_config=self.qdrant_client.get_fastembed_sparse_vector_params(),
|
| 27 |
)
|
|
|
|
| 28 |
return collection
|
| 29 |
return None
|
| 30 |
|
|
@@ -39,6 +40,7 @@ class HybridClient:
|
|
| 39 |
metadata=chunks,
|
| 40 |
parallel=0,
|
| 41 |
)
|
|
|
|
| 42 |
|
| 43 |
def search(self, collection, text: str, limit: int = 10):
|
| 44 |
search_result = self.qdrant_client.query(
|
|
|
|
| 20 |
|
| 21 |
def create(self, collection: str):
|
| 22 |
if not self.qdrant_client.collection_exists(collection):
|
| 23 |
+
self.qdrant_client.create_collection(
|
| 24 |
collection_name=collection,
|
| 25 |
vectors_config=self.qdrant_client.get_fastembed_vector_params(),
|
| 26 |
sparse_vectors_config=self.qdrant_client.get_fastembed_sparse_vector_params(),
|
| 27 |
)
|
| 28 |
+
print(f"--- {collection} collection created")
|
| 29 |
return collection
|
| 30 |
return None
|
| 31 |
|
|
|
|
| 40 |
metadata=chunks,
|
| 41 |
parallel=0,
|
| 42 |
)
|
| 43 |
+
print("--- pdf inserted")
|
| 44 |
|
| 45 |
def search(self, collection, text: str, limit: int = 10):
|
| 46 |
search_result = self.qdrant_client.query(
|
preprocessing.py
CHANGED
|
@@ -36,7 +36,7 @@ def majority_element(spans, param):
|
|
| 36 |
|
| 37 |
|
| 38 |
def clean_text(text):
|
| 39 |
-
|
| 40 |
words = text.split()
|
| 41 |
unique_words = OrderedDict.fromkeys(words)
|
| 42 |
cleaned_text = " ".join(unique_words)
|
|
@@ -88,6 +88,7 @@ def get_chunks(doc):
|
|
| 88 |
|
| 89 |
|
| 90 |
def process_activities(chunks):
|
|
|
|
| 91 |
# activities = []
|
| 92 |
i = 0
|
| 93 |
while i < len(chunks):
|
|
@@ -112,11 +113,12 @@ def process_activities(chunks):
|
|
| 112 |
return chunks
|
| 113 |
|
| 114 |
|
| 115 |
-
def
|
| 116 |
if buffer:
|
| 117 |
doc = pymupdf.open(stream=path, filetype="pdf")
|
| 118 |
else:
|
| 119 |
doc = pymupdf.open(path)
|
| 120 |
chunks = get_chunks(doc)
|
| 121 |
chunks = process_activities(chunks)
|
|
|
|
| 122 |
return chunks
|
|
|
|
| 36 |
|
| 37 |
|
| 38 |
def clean_text(text):
|
| 39 |
+
"""Cleans repeated text (OCR error)"""
|
| 40 |
words = text.split()
|
| 41 |
unique_words = OrderedDict.fromkeys(words)
|
| 42 |
cleaned_text = " ".join(unique_words)
|
|
|
|
| 88 |
|
| 89 |
|
| 90 |
def process_activities(chunks):
|
| 91 |
+
"""Groups lines of 'Activity' together"""
|
| 92 |
# activities = []
|
| 93 |
i = 0
|
| 94 |
while i < len(chunks):
|
|
|
|
| 113 |
return chunks
|
| 114 |
|
| 115 |
|
| 116 |
+
def index_pdf(path, buffer=False):
|
| 117 |
if buffer:
|
| 118 |
doc = pymupdf.open(stream=path, filetype="pdf")
|
| 119 |
else:
|
| 120 |
doc = pymupdf.open(path)
|
| 121 |
chunks = get_chunks(doc)
|
| 122 |
chunks = process_activities(chunks)
|
| 123 |
+
print("--- pdf indexed")
|
| 124 |
return chunks
|