Spaces:

omkar334
/

agentic_rag

Runtime error

omkar334 committed on Sep 27, 2024

Commit

2ef8487

1 Parent(s): 76bc633

comments

Files changed (2) hide show

client.py CHANGED Viewed

@@ -20,11 +20,12 @@ class HybridClient:
     def create(self, collection: str):
         if not self.qdrant_client.collection_exists(collection):
-            self.create_collection(
                 collection_name=collection,
                 vectors_config=self.qdrant_client.get_fastembed_vector_params(),
                 sparse_vectors_config=self.qdrant_client.get_fastembed_sparse_vector_params(),
             )
             return collection
         return None
@@ -39,6 +40,7 @@ class HybridClient:
             metadata=chunks,
             parallel=0,
         )
     def search(self, collection, text: str, limit: int = 10):
         search_result = self.qdrant_client.query(

     def create(self, collection: str):
         if not self.qdrant_client.collection_exists(collection):
+            self.qdrant_client.create_collection(
                 collection_name=collection,
                 vectors_config=self.qdrant_client.get_fastembed_vector_params(),
                 sparse_vectors_config=self.qdrant_client.get_fastembed_sparse_vector_params(),
             )
+            print(f"--- {collection} collection created")
             return collection
         return None
             metadata=chunks,
             parallel=0,
         )
+        print("--- pdf inserted")
     def search(self, collection, text: str, limit: int = 10):
         search_result = self.qdrant_client.query(

preprocessing.py CHANGED Viewed

@@ -36,7 +36,7 @@ def majority_element(spans, param):
 def clean_text(text):
-    print("Cleaning = ", text)
     words = text.split()
     unique_words = OrderedDict.fromkeys(words)
     cleaned_text = " ".join(unique_words)
@@ -88,6 +88,7 @@ def get_chunks(doc):
 def process_activities(chunks):
     # activities = []
     i = 0
     while i < len(chunks):
@@ -112,11 +113,12 @@ def process_activities(chunks):
     return chunks
-def embed_pdf(path, buffer=False):
     if buffer:
         doc = pymupdf.open(stream=path, filetype="pdf")
     else:
         doc = pymupdf.open(path)
     chunks = get_chunks(doc)
     chunks = process_activities(chunks)
     return chunks

 def clean_text(text):
+    """Cleans repeated text (OCR error)"""
     words = text.split()
     unique_words = OrderedDict.fromkeys(words)
     cleaned_text = " ".join(unique_words)
 def process_activities(chunks):
+    """Groups lines of 'Activity' together"""
     # activities = []
     i = 0
     while i < len(chunks):
     return chunks
+def index_pdf(path, buffer=False):
     if buffer:
         doc = pymupdf.open(stream=path, filetype="pdf")
     else:
         doc = pymupdf.open(path)
     chunks = get_chunks(doc)
     chunks = process_activities(chunks)
+    print("--- pdf indexed")
     return chunks