farjadmalik committed
Commit 657ce3b · 0 Parent(s)

Initial commit: Iqbal Poetry RAG system

.gitignore ADDED
@@ -0,0 +1,37 @@
1
+ # === Python virtual environments ===
2
+ .env_iqbal_rag/
3
+
4
+ # === Python cache and compiled files ===
5
+ __pycache__/
6
+ *.pyc
7
+ *.pyo
8
+ *.pyd
9
+
10
+ # === Dataset (do not track poetry data) ===
11
+ data/
12
+
13
+ # === Not Implemented Features ===
14
+ sft/
15
+
16
+
17
+ # === Environment variable/config files ===
18
+ .env
19
+
20
+ # === Logs ===
21
+ *.log
22
+
23
+ # === IDE/editor folders ===
24
+ .vscode/
25
+ .idea/
26
+
27
+ # === OS generated files ===
28
+ .DS_Store
29
+ Thumbs.db
30
+
31
+ # === Test and coverage outputs ===
32
+ .pytest_cache/
33
+ .coverage
34
+ htmlcov/
35
+
36
+ # === Jupyter Notebook checkpoints ===
37
+ .ipynb_checkpoints/
README.md ADDED
@@ -0,0 +1,101 @@
1
+ # Iqbal Poetry RAG System
2
+
3
+ A Retrieval-Augmented Generation (RAG) system for exploring and querying the poetry of Allama Iqbal. This project leverages vector search and large language models (LLMs) to answer questions about Iqbal's poetry, providing relevant poem excerpts as context.
4
+
5
+ ---
6
+
7
+ ## Features
8
+
9
+ - **Semantic Search**: Retrieve the most relevant poems for a given question using vector embeddings.
10
+ - **LLM-Powered Answers**: Generate answers using a language model, grounded in retrieved poem context.
11
+ - **Configurable**: Easily adjust retrieval thresholds, model settings, and data sources.
12
+ - **Error Handling**: Robust error management for a smoother user experience.
13
+ - **(Optional) Feedback Logging**: Log user feedback for continuous improvement.
14
+
15
+ ---
16
+
17
+ ## Installation
18
+
19
+ ### Prerequisites
20
+
21
+ - Python 3.9+
22
+ - [uv](https://github.com/astral-sh/uv) (a fast Python package installer and drop-in replacement for pip)
23
+
24
+ ### 1. Clone the repository
25
+
26
+ ```bash
27
+ git clone https://github.com/yourusername/iqbal_poetry_rag.git
28
+ cd iqbal_poetry_rag
29
+ ```
30
+
31
+ ### 2. Install dependencies
32
+
33
+ ```bash
34
+ uv pip install -r requirements.txt
35
+ ```
36
+
37
+ ---
38
+
39
+ ## Usage
40
+
41
+ 1. **Prepare your data**: Place your poems JSON file at the path specified in `app/config.py` (`JSON_FILE_PATH`).
42
+ 2. **Run the main application** (example, adjust as needed):
43
+
44
+ ```bash
45
+ python run.py
46
+ ```
47
+
48
+ 3. **Query the system**: Enter your question about Iqbal's poetry and receive contextually grounded answers.
49
+
50
+ ---
51
+
52
+ ## Project Structure
53
+
54
+ ```bash
55
+ iqbal_poetry_rag/
56
+
57
+ ├── app/
+ │   ├── RAGSystem.py            # Main RAG system class
+ │   ├── gradio_interface.py     # Gradio chat interface
+ │   └── config.py               # Configuration (thresholds, file paths, etc.)
+ ├── rag/
+ │   ├── embeddings.py           # Embedding model initialization
+ │   ├── vector_store.py         # Vector store initialization and building
+ │   ├── retriever.py            # Retriever configuration
+ │   └── llm.py                  # LLM initialization and prompt management
+ ├── utils/
+ │   ├── error_handling.py       # Error handling decorators
+ │   ├── dataset_downloader.py   # Dataset download from source repositories
+ │   ├── dataset_curator.py      # Dataset curation into RAG-ready JSON
+ │   └── feedback_logger.py      # (Optional) Feedback logging
+ ├── tests/
+ │   └── inspector.py            # Chroma vector store inspection utilities
+ ├── run.py                      # Entry point for the application
+ ├── requirements.txt            # Project dependencies
+ └── README.md                   # This file
73
+ ```
74
+
75
+ ---
76
+
77
+ ## Configuration
78
+
79
+ Edit `app/config.py` to set:
80
+
81
+ - `SCORE_THRESHOLD`: Minimum similarity score for retrieved poems.
82
+ - `JSON_FILE_PATH`: Path to your poems data file.
83
+
84
+ ---
85
+
86
+ ## Contributing
87
+
88
+ Contributions are welcome! Please open issues or submit pull requests for improvements or bug fixes.
89
+
90
+ ---
91
+
92
+ ## License
93
+
94
+ [MIT License](LICENSE)
95
+
96
+ ---
97
+
98
+ ## Acknowledgements
99
+
100
+ - Inspired by the poetry of Allama Iqbal.
101
+ - Built with Python, vector search, and LLM technologies.
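
For readers wiring up their own data: the loader in `rag/vector_store.py` and the curator in `utils/dataset_curator.py` share a flat per-poem record. A rough sketch of one entry in `iqbal_poems_rag.json` follows (field names taken from the curator's `_flatten_for_rag`; all values are invented for illustration):

```python
# Illustrative only -- ids, titles and text are placeholders, not real dataset entries.
example_record = {
    "poem_id": "001_01",
    "book_id": "001",
    "book_title": "Example Book Title",
    "section_id": 1,
    "section_title": "Example Section",
    "text_blocks": ["First translated verse ...", "Second translated verse ..."],
    "full_text": "English description ...\n\nFirst translated verse ...\nSecond translated verse ...",
    "phrases": ["khudi: selfhood"],
}
```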
app/RAGSystem.py ADDED
@@ -0,0 +1,50 @@
1
+ """Main RAG system implementation."""
2
+
3
+
4
+ from rag.vector_store import initialize_vector_store, build_vector_store_from_json
5
+ from rag.retriever import configure_retriever
6
+ from rag.llm import initialize_llm, get_rag_prompt
7
+ # from utils.feedback_logger import FeedbackLogger
8
+ from utils.error_handling import handle_rag_error
9
+ from app.config import SCORE_THRESHOLD, JSON_FILE_PATH
10
+
11
+ class IqbalRAGSystem:
12
+ """Manages the RAG system for Iqbal's poetry."""
13
+
14
+ def __init__(self):
15
+ """Initialize the RAG system components."""
16
+ # Build or load vector store
17
+ self.vector_store = build_vector_store_from_json(JSON_FILE_PATH)
18
+ self.retriever = configure_retriever(self.vector_store)
19
+ self.llm = initialize_llm()
20
+ self.prompt = get_rag_prompt()
21
+ self.chain = self.prompt | self.llm
22
+ # self.feedback_logger = FeedbackLogger()
23
+
24
+ @handle_rag_error
25
+ def query_rag(self, question):
26
+ """Process a query through the RAG system."""
27
+ print(f"**********************************************************************")
28
+ print(f"query_rag: {question}")
29
+ docs = self.retriever.invoke(question, config={'score_threshold': SCORE_THRESHOLD})
30
+ if not docs:
31
+ return "No relevant poems found", []
32
+
33
+ print(f"docs: {docs}")
34
+ context = "\n\n".join(doc.page_content for doc in docs)
35
+ print(f"context: {context}")
36
+ context_ids = [doc.metadata.get("poem_id", "") for doc in docs]
37
+ print(f"context_ids: {context_ids}")
38
+
39
+ response = self.chain.invoke({
40
+ 'context': context,
41
+ 'question': question
42
+ })
43
+ print(f"response: {response}")
44
+ print(f"**********************************************************************")
45
+
46
+ return response, context_ids
47
+
48
+ # def log_feedback(self, query, response, feedback, comment, context_ids):
49
+ # """Log user feedback."""
50
+ # self.feedback_logger.log_feedback(query, response, feedback, comment, context_ids)
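
As a quick orientation, a minimal driver for this class could look like the sketch below; it assumes the Chroma store, the Ollama embedding model, and the Hugging Face chat model configured in `app/config.py` are available locally:

```python
from app.RAGSystem import IqbalRAGSystem

rag = IqbalRAGSystem()  # builds (or reloads) the vector store and the prompt | llm chain
answer, poem_ids = rag.query_rag("Explain Iqbal's concept of Khudi")
print(answer)     # LLM answer grounded in the retrieved poems
print(poem_ids)   # poem_id metadata of the supporting documents
```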
app/__init__.py ADDED
File without changes
app/config.py ADDED
@@ -0,0 +1,38 @@
1
+ """Configuration settings for the Iqbal Poetry RAG application."""
2
+
3
+ import os
4
+ import gradio as gr
5
+ from dotenv import load_dotenv
6
+
7
+ # Load environment variables
8
+ load_dotenv()
9
+
10
+ # Application settings
11
+ APP_NAME = "RAG-Iqbal: Q&A with Allama Iqbal based on a poetry dataset"
12
+ DEBUG_MODE = os.getenv("DEBUG_MODE", "False").lower() == "true"
13
+
14
+ # API Keys
15
+ PERPLEXITY_API_KEY = os.getenv("PERPLEXITY_API_KEY")
16
+
17
+ # Paths
18
+ BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
19
+ CHROMA_DB_DIR = os.path.join(BASE_DIR, "data", "iqbalchroma_db")
20
+ FEEDBACK_DIR = os.path.join(BASE_DIR, "data", "feedback")
21
+ JSON_FILE_PATH = os.path.join(BASE_DIR, "data", "processed_data", "iqbal_poems_rag.json")
22
+
23
+ # Create necessary directories
24
+ os.makedirs(os.path.dirname(JSON_FILE_PATH), exist_ok=True)
25
+ os.makedirs(CHROMA_DB_DIR, exist_ok=True)
26
+ os.makedirs(FEEDBACK_DIR, exist_ok=True)
27
+
28
+ # RAG settings
29
+ EMBEDDING_MODEL = "llama3" # This should match your Ollama model name exactly
30
+ LLM_MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" # "microsoft/phi-2"
31
+ RETRIEVER_K = 5
32
+ RETRIEVER_FETCH_K = 20
33
+ RETRIEVER_LAMBDA_MULT = 0.75
34
+ SCORE_THRESHOLD = 0.65
35
+
36
+ # Gradio settings
37
+ GRADIO_THEME = gr.themes.Soft()  # uses the gradio import at the top of this module
38
+ GRADIO_SERVER_PORT = int(os.getenv("PORT", 7860))
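
Since `load_dotenv()` is called here, these settings can be supplied from a `.env` file in the project root. A hedged example (the variable names are the ones read above; the API key is a placeholder):

```
PERPLEXITY_API_KEY=pplx-xxxxxxxxxxxxxxxx
DEBUG_MODE=false
PORT=7860
```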
app/gradio_interface.py ADDED
@@ -0,0 +1,138 @@
1
+ """Gradio interface for the Iqbal Poetry RAG application."""
2
+
3
+ import gradio as gr
4
+ from app.config import APP_NAME, GRADIO_THEME, GRADIO_SERVER_PORT
+
+ # Module-level RAG system handle; set by launch_gradio_app() before the UI is used.
+ rag_system = None
5
+
6
+ def process_query(question, history):
+     """Process user query through the RAG system."""
+     try:
+         response, context_ids = rag_system.query_rag(question)
+         return response, context_ids
+     except Exception as e:
+         error_message = f"Error processing query: {str(e)}"
+         print(f"Error in process_query: {error_message}")  # Log error for debugging
+         # Return the same (message, ids) shape as the success path so callers can unpack it
+         return error_message, []
15
+
16
+ def handle_feedback(question, response, feedback, comment):
17
+ """Handle user feedback submission."""
18
+ if not question or not response:
19
+ return "Feedback not logged - missing data"
20
+
21
+ # Since feedback logging is commented out in IqbalRAGSystem, just return success
22
+ return "Feedback logging is currently disabled"
23
+
24
+ def create_gradio_interface():
25
+ """Create and configure the Gradio interface."""
26
+ with gr.Blocks(theme=GRADIO_THEME, title=APP_NAME) as app:
27
+ gr.Markdown(f"# {APP_NAME}")
28
+ gr.Markdown("""
29
+ Welcome to the Iqbal Poetry RAG system! Ask questions about Iqbal's philosophical poetry.
30
+ The system will search through Iqbal's poems and provide relevant answers based on the content.
31
+ """)
32
+
33
+ chatbot = gr.Chatbot(
34
+ label="Conversation",
35
+ height=500,
36
+ type="messages", # Explicitly set type to 'messages'
37
+ show_label=True,
38
+ container=True
39
+ )
40
+
41
+ with gr.Row():
42
+ with gr.Column(scale=4):
43
+ input_question = gr.Textbox(
44
+ label="Your Question",
45
+ placeholder="Ask about philosophical concepts in Iqbal's poetry...",
46
+ lines=3,
47
+ show_label=True
48
+ )
49
+
50
+ with gr.Column(scale=1):
51
+ submit_btn = gr.Button("Submit", variant="primary")
52
+
53
+ with gr.Accordion("Provide Feedback", open=False):
54
+ feedback_rating = gr.Radio(
55
+ ["Helpful", "Partially Helpful", "Incorrect"],
56
+ label="Feedback Rating",
57
+ value="Helpful"
58
+ )
59
+
60
+ feedback_comment = gr.Textbox(
61
+ label="Additional Comments",
62
+ placeholder="Provide detailed feedback...",
63
+ lines=3
64
+ )
65
+
66
+ feedback_btn = gr.Button("Submit Feedback")
67
+ feedback_status = gr.Markdown()
68
+
69
+ gr.Examples(
70
+ examples=[
71
+ "Explain Iqbal's concept of Khudi",
72
+ "Analyze the symbolism in 'The Himalayas' poem",
73
+ "Compare Iqbal's view of nature with Romantic poets",
74
+ "What is Iqbal's view on Western materialism?",
75
+ "Discuss the influence of Rumi on Iqbal's philosophy",
76
+ "What are the main themes in 'The Secrets of the Self'?",
77
+ "How does Iqbal view the relationship between God and man?"
78
+ ],
79
+ inputs=input_question
80
+ )
81
+
82
+ def user_input(user_message, history):
83
+ if not user_message.strip():
84
+ return "", history
85
+ # Convert to new message format
86
+ history = history + [{"role": "user", "content": user_message}]
87
+ return "", history
88
+
89
+ def bot_response(history):
90
+ user_message = history[-1]["content"]
91
+ bot_message, _ = process_query(user_message, history)
92
+ # Convert to new message format
93
+ history = history + [{"role": "assistant", "content": bot_message}]
94
+ return history
95
+
96
+ submit_btn.click(
97
+ user_input,
98
+ [input_question, chatbot],
99
+ [input_question, chatbot],
100
+ queue=False
101
+ ).then(
102
+ bot_response,
103
+ chatbot,
104
+ chatbot
105
+ )
106
+
107
+ input_question.submit(
108
+ user_input,
109
+ [input_question, chatbot],
110
+ [input_question, chatbot],
111
+ queue=False
112
+ ).then(
113
+ bot_response,
114
+ chatbot,
115
+ chatbot
116
+ )
117
+
118
+ feedback_btn.click(
119
+ fn=handle_feedback,
120
+ inputs=[input_question, chatbot, feedback_rating, feedback_comment],
121
+ outputs=feedback_status
122
+ )
123
+
124
+ return app
125
+
126
+ def launch_gradio_app(system=None):
127
+ """Launch the Gradio application."""
128
+ global rag_system
129
+ if system is not None:
130
+ globals()['rag_system'] = system
131
+
132
+ app = create_gradio_interface()
133
+ app.launch(
134
+ server_name="0.0.0.0",
135
+ server_port=GRADIO_SERVER_PORT,
136
+ share=False,
137
+ show_error=True # Show detailed error messages in the UI
138
+ )
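
Because the `Chatbot` is created with `type="messages"`, the `history` list passed between `user_input` and `bot_response` holds role/content dictionaries rather than `(user, bot)` tuples. After one exchange it would look roughly like this (the assistant text is illustrative):

```python
history = [
    {"role": "user", "content": "Explain Iqbal's concept of Khudi"},
    {"role": "assistant", "content": "Khudi, or selfhood, is ..."},
]
```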
rag/__init__.py ADDED
File without changes
rag/embeddings.py ADDED
@@ -0,0 +1,17 @@
1
+ """Embedding functionality for the RAG system."""
2
+
3
+ import os
4
+ from langchain_ollama import OllamaEmbeddings
5
+ from app.config import EMBEDDING_MODEL
6
+
7
+ def get_embeddings():
8
+ """Initialize and return the embedding model."""
9
+ try:
10
+ return OllamaEmbeddings(
11
+ model=EMBEDDING_MODEL,
12
+ base_url="http://localhost:11434" # Explicitly set the base URL
13
+ )
14
+ except Exception as e:
15
+ print(f"Warning: Failed to initialize OllamaEmbeddings: {e}")
16
+ # Fallback to a different embedding method if needed
17
+ raise
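
The comment above mentions falling back to another embedding method when Ollama is unavailable. One possible sketch, not part of this commit, uses a local sentence-transformers model via `langchain_huggingface` (assumes that package and a downloadable model are available):

```python
# Hypothetical fallback -- not wired into get_embeddings() in this commit.
from langchain_huggingface import HuggingFaceEmbeddings

def get_fallback_embeddings():
    """Return a CPU-friendly local embedding model when the Ollama server is down."""
    return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
```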
rag/llm.py ADDED
@@ -0,0 +1,57 @@
1
+ """LLM configuration for the RAG system."""
2
+
3
+ from langchain_community.chat_models import ChatPerplexity
4
+ from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
5
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
6
+ from langchain_community.llms import HuggingFacePipeline
7
+ from app.config import PERPLEXITY_API_KEY, LLM_MODEL
8
+
9
+ def initialize_llm():
10
+ """Initialize and return the LLM."""
11
+ # return ChatPerplexity(
12
+ # pplx_api_key=PERPLEXITY_API_KEY,
13
+ # model=LLM_MODEL,
14
+ # temperature=0.2,
15
+ # max_tokens=1024
16
+ # )
17
+
18
+ tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL, trust_remote_code=True)
19
+ model = AutoModelForCausalLM.from_pretrained(
20
+ LLM_MODEL,
21
+ trust_remote_code=True,
22
+ device_map="auto",
23
+ torch_dtype="auto" # enables FP16 if available
24
+ )
25
+
26
+ pipe = pipeline(
27
+ "text-generation",
28
+ model=model,
29
+ tokenizer=tokenizer,
30
+ do_sample=True,
31
+ max_new_tokens=1024,
32
+ temperature=0.7,
33
+ top_p=0.9,
34
+ repetition_penalty=1.1,
35
+ return_full_text=False
36
+ )
37
+
38
+ return HuggingFacePipeline(pipeline=pipe)
39
+
40
+
41
+ def get_rag_prompt():
42
+ """Return the RAG prompt template."""
43
+ # return ChatPromptTemplate.from_template(
44
+ # """You are an expert on Allama Iqbal's poetry based on the context. Use only the context to answer and cite his verses in the answer.
45
+ # Context: {context}
46
+ # Question: {question}
47
+ # Answer in structured Markdown:"""
48
+ # )
49
+
50
+ return PromptTemplate.from_template(
51
+ """You are an expert on Allama Iqbal's poetry. Use the provided context to answer the question.
52
+ ### CONTEXT:{context}
53
+ ### QUESTION:{question}
54
+ ### ANSWER (Markdown formatted with poetic citations):
55
+ """
56
+ )
57
+
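
To inspect what the text-generation pipeline actually receives, the template can be rendered on its own. A small check along these lines (context and question are placeholders):

```python
from rag.llm import get_rag_prompt

prompt = get_rag_prompt()
print(prompt.format(
    context="Poem ID: 001\n\nBook: Example Book\n\nText:\n<poem text>",
    question="Explain Iqbal's concept of Khudi",
))
```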
rag/retriever.py ADDED
@@ -0,0 +1,14 @@
1
+ """Retriever configuration for the RAG system."""
2
+
3
+ from app.config import RETRIEVER_K, RETRIEVER_FETCH_K, RETRIEVER_LAMBDA_MULT
4
+
5
+ def configure_retriever(vector_store):
6
+ """Configure and return the retriever."""
7
+ return vector_store.as_retriever(
8
+ search_type="mmr",
9
+ search_kwargs={
10
+ 'k': RETRIEVER_K,
11
+ 'fetch_k': RETRIEVER_FETCH_K,
12
+ 'lambda_mult': RETRIEVER_LAMBDA_MULT
13
+ }
14
+ )
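
With `search_type="mmr"`, the retriever first fetches `fetch_k` candidates by similarity and then selects `k` of them via maximal marginal relevance, where `lambda_mult` closer to 1.0 favours relevance and closer to 0.0 favours diversity. A hedged usage sketch (assumes an already-built store):

```python
from rag.vector_store import initialize_vector_store
from rag.retriever import configure_retriever

retriever = configure_retriever(initialize_vector_store())
docs = retriever.invoke("Iqbal's view of Western materialism")
for doc in docs:
    print(doc.metadata.get("poem_id"), doc.page_content[:80])
```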
rag/vector_store.py ADDED
@@ -0,0 +1,70 @@
1
+ """Vector store management for the RAG system."""
2
+
3
+ import json
4
+ import os
5
+ from langchain_chroma import Chroma # Updated import
6
+ from langchain_core.documents import Document
7
+ from app.config import CHROMA_DB_DIR
8
+ from rag.embeddings import get_embeddings
9
+
10
+ def initialize_vector_store():
11
+ """Initialize and return the Chroma vector store."""
12
+ embeddings = get_embeddings()
13
+ return Chroma(
14
+ persist_directory=CHROMA_DB_DIR,
15
+ embedding_function=embeddings
16
+ )
17
+
18
+ def build_vector_store_from_json(json_file_path):
19
+ """Build and persist a vector store from JSON data."""
20
+ # Check if vector store already exists
21
+ if os.path.exists(CHROMA_DB_DIR) and os.listdir(CHROMA_DB_DIR):
22
+ print(f"Vector store already exists at {CHROMA_DB_DIR}. Skipping creation.")
23
+ return initialize_vector_store()
24
+
25
+ print(f"Building vector store from {json_file_path}...")
26
+ embeddings = get_embeddings()
27
+
28
+ # Load JSON data
29
+ with open(json_file_path, 'r', encoding='utf-8') as f:
30
+ poems_data = json.load(f)
31
+
32
+ # Convert to documents
33
+ documents = []
34
+ for poem in poems_data:
35
+ # Create content with all available information
36
+ content_parts = []
37
+ if poem.get('poem_id'):
38
+ content_parts.append(f"Poem ID: {poem['poem_id']}")
39
+ if poem.get('book_title'):
40
+ content_parts.append(f"Book: {poem['book_title']}")
41
+ if poem.get('full_text'):
42
+ content_parts.append(f"Text:\n{poem['full_text']}")
43
+
44
+ # Create metadata
45
+ metadata = {
46
+ "poem_id": poem.get("poem_id", ""),
47
+ "book_id": poem.get("book_id", ""),
48
+ "book_title": poem.get("book_title", "Unknown")
49
+ }
50
+
51
+ # Create document
52
+ doc = Document(
53
+ page_content="\n\n".join(content_parts),
54
+ metadata=metadata
55
+ )
56
+
57
+ documents.append(doc)
58
+
59
+ print(f"Creating vector store with {len(documents)} documents...")
60
+
61
+ # Create vector store
62
+ os.makedirs(CHROMA_DB_DIR, exist_ok=True)
63
+ vector_store = Chroma.from_documents(
64
+ documents=documents,
65
+ embedding=embeddings,
66
+ persist_directory=CHROMA_DB_DIR
67
+ )
68
+
69
+ print(f"Vector store created and persisted at {CHROMA_DB_DIR}")
70
+ return vector_store
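
For reference, each poem ends up as a single LangChain `Document`. With the fields assembled above, a typical instance looks roughly like the following (values illustrative):

```python
from langchain_core.documents import Document

doc = Document(
    page_content="Poem ID: 001_01\n\nBook: Example Book Title\n\nText:\n<English translation of the poem>",
    metadata={"poem_id": "001_01", "book_id": "001", "book_title": "Example Book Title"},
)
```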
requirements.txt ADDED
Binary file (6.76 kB).
 
run.py ADDED
@@ -0,0 +1,76 @@
1
+ """Entry point for the Iqbal Poetry RAG application."""
2
+
3
+ import os
4
+ import sys
5
+ import requests
6
+ import subprocess
7
+ from app.RAGSystem import IqbalRAGSystem
8
+ from app.gradio_interface import launch_gradio_app
9
+ from app.config import JSON_FILE_PATH
10
+
11
+ def check_ollama_availability():
12
+ """Check if Ollama is running and has the required models."""
13
+ try:
14
+ # First check if Ollama service is running
15
+ response = requests.get("http://localhost:11434/api/tags", timeout=5)
16
+
17
+ if response.status_code == 200:
18
+ # Check available models
19
+ available_models = [model["name"].lower() for model in response.json().get("models", [])]
20
+
21
+ if not any(name.startswith("llama3") for name in available_models):  # names may carry tags, e.g. "llama3:latest"
22
+ print("Warning: llama3 model not found in Ollama.")
23
+ # Try to check again with list command
24
+ try:
25
+ result = subprocess.run(["ollama", "list"], capture_output=True, text=True)
26
+ if "llama3" in result.stdout.lower():
27
+ print("Model found via command line. Proceeding...")
28
+ return True
29
+ else:
30
+ print("Please install it with: ollama pull llama3")
31
+ return False
32
+ except Exception:
33
+ print("Could not check models via command line.")
34
+ return False
35
+ return True
36
+ else:
37
+ print(f"Warning: Ollama is running but API call failed with status {response.status_code}.")
38
+ return False
39
+ except requests.exceptions.ConnectionError:
40
+ print("Error: Ollama server is not running. Please start Ollama with:")
41
+ print("ollama serve")
42
+ return False
43
+ except Exception as e:
44
+ print(f"Error checking Ollama: {str(e)}")
45
+ return False
46
+
47
+ def check_data_file():
48
+ """Check if the required data file exists."""
49
+ if not os.path.exists(JSON_FILE_PATH):
50
+ print(f"Error: Required data file not found at {JSON_FILE_PATH}")
51
+ print("Please ensure the iqbal_poems_rag.json file is present in the data/processed_data directory.")
52
+ return False
53
+ return True
54
+
55
+ if __name__ == "__main__":
56
+ # Check if data file exists
57
+ if not check_data_file():
58
+ sys.exit(1)
59
+
60
+ # Check if Ollama is available
61
+ ollama_available = check_ollama_availability()
62
+ if not ollama_available:
63
+ print("Warning: Proceeding without verifying Ollama setup...")
64
+
65
+ try:
66
+ # Initialize RAG system (this will build the vector store if needed)
67
+ print("Initializing Iqbal Poetry RAG system...")
68
+ rag_system = IqbalRAGSystem()
69
+
70
+ # Launch Gradio app
71
+ print("Launching Gradio interface...")
72
+ print(f"Access the application at http://localhost:{os.getenv('PORT', 7860)}")
73
+ launch_gradio_app(system=rag_system)
74
+ except Exception as e:
75
+ print(f"Error: Failed to start the application: {str(e)}")
76
+ sys.exit(1)
tests/inspector.py ADDED
@@ -0,0 +1,130 @@
1
+ import os
2
+ import json
3
+ from langchain_chroma import Chroma
4
+ from langchain_core.documents import Document
5
+ from pprint import pprint
6
+ import pandas as pd
7
+
8
+ class ChromaVectorStoreInspector:
9
+ def __init__(self, persist_dir, embeddings):
10
+ """
11
+ Initialize inspector with existing Chroma store
12
+
13
+ Args:
14
+ persist_dir (str): Directory where Chroma data is stored
15
+ embeddings: Embeddings model used in the original store
16
+ """
17
+ self.vector_store = Chroma(
18
+ persist_directory=persist_dir,
19
+ embedding_function=embeddings
20
+ )
21
+ self.collection = self.vector_store._collection
22
+
23
+ def get_store_metadata(self):
24
+ """Get critical metadata about the vector store"""
25
+ return {
26
+ "document_count": self._get_document_count(),
27
+ "embedding_function": str(self.vector_store._embedding_function),
28
+ "persist_directory": self.vector_store._persist_directory,
29
+ "collection_name": self.collection.name
30
+ }
31
+
32
+ def _get_document_count(self):
33
+ """Get total number of documents in the collection"""
34
+ return self.collection.count()
35
+
36
+ def sample_documents(self, n=5):
37
+ """Retrieve sample documents with metadata"""
38
+ results = self.collection.get(limit=n)
39
+ return [
40
+ {
41
+ "id": doc_id,
42
+ "metadata": meta,
43
+ "content": doc[:200] + "..." if len(doc) > 200 else doc
44
+ }
45
+ for doc_id, meta, doc in zip(
46
+ results["ids"],
47
+ results["metadatas"],
48
+ results["documents"]
49
+ )
50
+ ]
51
+
52
+ def analyze_metadata(self):
53
+ """Analyze metadata distribution patterns"""
54
+ results = self.collection.get()
55
+ print(results)
56
+ # df = pd.DataFrame(results["metadatas"])
57
+
58
+ # analysis = {}
59
+ # if not df.empty:
60
+ # analysis["metadata_fields"] = list(df.columns)
61
+ # analysis["book_title_distribution"] = df["book_title"].value_counts().to_dict()
62
+ # analysis["missing_values"] = df.isna().sum().to_dict()
63
+
64
+ # return analysis
65
+
66
+ def test_semantic_search(self, query, k=3):
67
+ """Test the vector search functionality"""
68
+ results = self.vector_store.similarity_search(query, k=k)
69
+ return [
70
+ {
71
+ "content": doc.page_content[:150] + "...",
72
+ "metadata": doc.metadata,
73
+ "score": doc.metadata.get("score", 0.0)
74
+ }
75
+ for doc in results
76
+ ]
77
+
78
+ def full_health_check(self):
79
+ """Comprehensive store verification report"""
80
+ return {
81
+ "metadata": self.get_store_metadata(),
82
+ "sample_documents": self.sample_documents(),
83
+ "metadata_analysis": self.analyze_metadata(),
84
+ "search_test": self.test_semantic_search("philosophical concepts")
85
+ }
86
+
87
+ def verify_against_source(self, json_path):
88
+ """Verify vector store contents against source JSON"""
89
+ with open(json_path, "r") as f:
90
+ source_data = json.load(f)
91
+
92
+ source_ids = {p["poem_id"] for p in source_data}
93
+ stored_ids = set(self.collection.get()["ids"])
94
+
95
+ return {
96
+ "source_count": len(source_ids),
97
+ "stored_count": len(stored_ids),
98
+ "missing_in_store": source_ids - stored_ids,
99
+ "extra_in_store": stored_ids - source_ids
100
+ }
101
+
102
+ # Usage example
103
+ if __name__ == "__main__":
104
+ from rag.embeddings import get_embeddings # Your existing embeddings setup
105
+ from app.config import CHROMA_DB_DIR # Your config
106
+
107
+ # Initialize inspector
108
+ inspector = ChromaVectorStoreInspector(
109
+ persist_dir=CHROMA_DB_DIR,
110
+ embeddings=get_embeddings()
111
+ )
112
+
113
+ # print("\n=== Vector Store Metadata ===")
114
+ # pprint(inspector.get_store_metadata())
115
+
116
+ # print("\n=== Document Samples ===")
117
+ # pprint(inspector.sample_documents())
118
+
119
+ # print("\n=== Metadata Analysis ===")
120
+ # pprint(inspector.analyze_metadata())
121
+
122
+ # print("\n=== Source Verification ===")
123
+ # verification = inspector.verify_against_source("data/processed_data/iqbal_poems_rag.json")
124
+ # pprint(verification)
125
+
126
+ # print("\n=== Search Test Results ===")
127
+ # pprint(inspector.test_semantic_search("lost"))
128
+
129
+ print("\n=== Full Health Check ===")
130
+ pprint(inspector.full_health_check())
utils/__init__.py ADDED
@@ -0,0 +1,15 @@
1
+ # Internal imports
2
+ from .dataset_curator import DatasetCurator
3
+ from .dataset_downloader import DatasetDownloader
4
+
5
+
6
+ # Download the dataset from github
7
+ # downloader = DatasetDownloader(output_dir="data")
8
+ # downloader.download_from_github(source_name="github_iqbal_demystified")
9
+
10
+ # Process the dataset into a single file
11
+ # curator = DatasetCurator(data_path="data", output_dir="data/processed_data")
12
+ # dataset = curator.process_dataset(source="github_iqbal_demystified")
13
+ # print(f"Dataset processing complete. Stats:")
14
+ # print(f"- Books: {dataset['metadata']['total_books']}")
15
+ # print(f"- Poems: {dataset['metadata']['total_poems']}")
utils/dataset_curator.py ADDED
@@ -0,0 +1,386 @@
1
+ # Standard library imports
2
+ import os
3
+ import json
4
+ import logging
5
+ from pathlib import Path
6
+ from typing import Dict, List, Any, Optional
7
+
8
+ # Third-party imports
9
+ import yaml
10
+ from tqdm import tqdm
11
+
12
+ # Configure logging
13
+ logging.basicConfig(
14
+ level=logging.DEBUG,
15
+ format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
16
+ handlers=[
17
+ logging.FileHandler("dataset_curation.log", encoding="utf-8"),
18
+ logging.StreamHandler()
19
+ ]
20
+ )
21
+ logger = logging.getLogger("DatasetCurator")
22
+
23
+
24
+ class DatasetCurator:
25
+ """
26
+ A robust dataset curator for processing Allama Iqbal's poetry collection
27
+ with nested YAML structures into a flattened JSON format optimized for RAG.
28
+
29
+ Features:
30
+ - Hierarchical data flattening
31
+ - Multilingual support (with English focus)
32
+ - Nested structure resolution
33
+ - Metadata preservation
34
+ - Data validation and error handling
35
+ """
36
+
37
+ def __init__(self, data_path: str, output_dir: str):
38
+ """
39
+ Initialize the curator with validated paths
40
+
41
+ Args:
42
+ data_root (str): Root directory containing 'lists' and 'poems' folders
43
+ output_dir (str): Directory for saving processed datasets
44
+ """
45
+ self.data_root = Path(data_path)
46
+ self.output_dir = Path(output_dir)
47
+ self.output_dir.mkdir(parents=True, exist_ok=True)
48
+
49
+ self.dataset = {
50
+ "metadata": {
51
+ "total_books": 0,
52
+ "total_poems": 0
53
+ },
54
+ "books": [],
55
+ "poems": []
56
+ }
57
+
58
+
59
+ def process_dataset(self, source: str = 'github_iqbal_demystified'):
60
+ """
61
+ Process the dataset based on the source.
62
+ """
63
+ if source == 'github_iqbal_demystified':
64
+ self.data_root = self.data_root / source
65
+ self.dataset = self.process_github_iqbal_demystified()
66
+ logger.info(f"Dataset processed successfully")
67
+ # logger.debug(f"Dataset: {self.dataset}")
68
+ else:
69
+ raise ValueError(f"Unsupported source: {self.source}")
70
+
71
+ # Save the dataset to various formats
72
+ self._save_dataset()
73
+
74
+ return self.dataset
75
+
76
+
77
+ def process_github_iqbal_demystified(self):
78
+ """
79
+ Main processing pipeline with error handling and progress tracking
80
+ """
81
+ try:
82
+ book_files = sorted((self.data_root / "lists").glob("List_*.yaml"))
83
+ logger.info(f"Found {len(book_files)} book files to process")
84
+
85
+ for book_file in tqdm(book_files, desc="Processing books"):
86
+ book_data = self._load_yaml(book_file)
87
+ book_id = book_file.stem.split("_")[-1]
88
+ processed_book = self._process_book(book_id, book_data)
89
+ self.dataset["books"].append(processed_book)
90
+
91
+ poems = self._process_poems(book_id, processed_book)
92
+ self.dataset["poems"].extend(poems)
93
+ self.dataset["metadata"]["total_poems"] += len(poems)
94
+ self.dataset["metadata"]["total_books"] = len(self.dataset["books"])
95
+
96
+ return self.dataset
97
+
98
+ except Exception as e:
99
+ logger.error(f"Processing failed: {str(e)}")
100
+ return None
101
+
102
+ def _process_book(self, book_id: str, raw_data: Dict) -> Dict:
103
+ """
104
+ Process book metadata with nested section structure
105
+
106
+ Args:
107
+ book_id (str): Unique identifier for the book
108
+ raw_data (Dict): Raw YAML data from list file
109
+
110
+ Returns:
111
+ Dict: Processed book structure with flattened metadata
112
+ """
113
+ book_structure = {
114
+ "id": book_id,
115
+ "titles": {},
116
+ "sections": [],
117
+ "metadata": {"total_sections": 0, "total_poems": 0}
118
+ }
119
+
120
+ # Process multilingual titles
121
+ for title_entry in raw_data.get("name", []):
122
+ lang = title_entry.get("lang", "unknown")
123
+ book_structure["titles"][lang] = title_entry.get("text", "")
124
+ if lang == 'en':
125
+ book_structure['primary_title'] = title_entry.get("text", "Unknown")
126
+
127
+ # Process sections
128
+ current_section = None
129
+ for section_data in raw_data.get("sections", []):
130
+ if "sectionName" in section_data:
131
+ if current_section:
132
+ book_structure["sections"].append(current_section)
133
+ book_structure["metadata"]["total_sections"] += 1
134
+
135
+ current_section = {
136
+ "id": len(book_structure["sections"]) + 1,
137
+ "titles": {},
138
+ "poems": [],
139
+ "poem_ids": [],
140
+ "metadata": {"total_poems": 0}
141
+ }
142
+
143
+ for name_entry in section_data["sectionName"]:
144
+ lang = name_entry.get("lang", "unknown")
145
+ current_section["titles"][lang] = name_entry.get("text", "")
146
+
147
+ if "poems" in section_data and current_section:
148
+ poems = self._process_poem_metadata(section_data["poems"])
149
+ poem_ids = [poem['id'] for poem in poems]
150
+ current_section["poems"].extend(poems)
151
+ current_section["poem_ids"].extend(poem_ids)
152
+ current_section["metadata"]["total_poems"] += len(poems)
153
+
154
+ if current_section:
155
+ book_structure["sections"].append(current_section)
156
+ book_structure["metadata"]["total_sections"] += 1
157
+
158
+ book_structure["metadata"]["total_poems"] = sum(
159
+ len(s["poems"]) for s in book_structure["sections"]
160
+ )
161
+ return book_structure
162
+
163
+
164
+ def _process_poem_metadata(self, poems: List[Dict]) -> List[Dict]:
165
+ """
166
+ Flatten poem metadata from nested structure
167
+
168
+ Args:
169
+ poems (List[Dict]): Raw poem metadata entries
170
+
171
+ Returns:
172
+ List[Dict]: Processed poem metadata
173
+ """
174
+ processed = []
175
+ for poem in poems:
176
+ processed_poem = {
177
+ "id": poem.get("id", ""),
178
+ "titles": {},
179
+ "metadata": {"languages": []} # Changed from set to list
180
+ }
181
+
182
+ for title_entry in poem.get("poemName", []):
183
+ lang = title_entry.get("lang", "unknown")
184
+ processed_poem["titles"][lang] = title_entry.get("text", "")
185
+ if lang not in processed_poem["metadata"]["languages"]:
186
+ processed_poem["metadata"]["languages"].append(lang)
187
+
188
+ processed.append(processed_poem)
189
+ return processed
190
+
191
+
192
+ def _process_poems(self, book_id: str, book_data: Dict) -> List[Dict]:
193
+ """
194
+ Process poem content files with validation and error handling
195
+
196
+ Args:
197
+ book_id (str): Parent book identifier
198
+ book_data (Dict): Processed book structure
199
+
200
+ Returns:
201
+ List[Dict]: Processed poems with flattened content
202
+ """
203
+ poems = []
204
+ book_name = book_data.get("primary_title", f"book_{book_id}")
205
+ sections = book_data.get("sections", [])
206
+ poem_dir = self.data_root / "poems" / book_id
207
+
208
+ if not poem_dir.exists():
209
+ logger.warning(f"Missing poem directory for book: {book_id}:{book_name}")
210
+ return []
211
+
212
+ for poem_file in poem_dir.glob("*.yaml"):
213
+ try:
214
+ poem_id = poem_file.stem
215
+ raw_data = self._load_yaml(poem_file)
216
+
217
+ # Create the generator expression, broken for readability
218
+ sectioninfo_generator = (
219
+ (section_info.get('id'), section_info.get('titles', {}).get('en'))
220
+ for section_info in sections
221
+ if poem_id in section_info.get('poem_ids', [])
222
+ )
223
+ # Use next() with the generator and a default tuple
224
+ section_id, section_name = next(sectioninfo_generator, (None, None))
225
+ # Create poem structure
226
+ poem = {
227
+ "id": poem_id,
228
+ "book_id": book_id,
229
+ "book_title": book_name,
230
+ "section_id": section_id,
231
+ "section_title": section_name,
232
+ "metadata": {"languages": []},
233
+ "content": {"descriptions": {}, "verses": []}
234
+ }
235
+
236
+ # Process descriptions
237
+ for desc_entry in raw_data.get("description", []):
238
+ lang = desc_entry.get("lang", "unknown")
239
+ poem["content"]["descriptions"][lang] = desc_entry.get("text", "")
240
+ if lang not in poem["metadata"]["languages"]:
241
+ poem["metadata"]["languages"].append(lang)
242
+
243
+ # Process verses with language detection
244
+ for verse in raw_data.get("sher", []):
245
+ processed_verse = self._process_verse(verse)
246
+ poem["content"]["verses"].append(processed_verse)
247
+ # Detect verse languages
248
+ for content in verse.get("sherContent", []):
249
+ lang = content.get("lang", "unknown")
250
+ if lang not in poem["metadata"]["languages"]:
251
+ poem["metadata"]["languages"].append(lang)
252
+
253
+ # Flatten structure with complete English detection
254
+ rag_poem = self._flatten_for_rag(poem)
255
+ if rag_poem: # Only add if English content exists
256
+ poems.append(rag_poem)
257
+ except Exception as e:
258
+ logger.error(f"Failed processing poem {poem_id}: {str(e)}")
259
+
260
+ return poems
261
+
262
+ def _process_verse(self, verse: Dict) -> Dict:
263
+ """
264
+ Process individual verse with multilingual content
265
+
266
+ Args:
267
+ verse (Dict): Raw verse data from YAML
268
+
269
+ Returns:
270
+ Dict: Processed verse structure
271
+ """
272
+ processed = {
273
+ "id": verse.get("id", ""),
274
+ "content": {},
275
+ "notes": []
276
+ }
277
+
278
+ for content_entry in verse.get("sherContent", []):
279
+ lang = content_entry.get("lang", "unknown")
280
+ processed["content"][lang] = {
281
+ "text": content_entry.get("text", ""),
282
+ "notes": [self._process_note(n) for n in content_entry.get("notes", [])]
283
+ }
284
+
285
+ return processed
286
+
287
+
288
+ def _process_note(self, note: Dict) -> Dict:
289
+ """
290
+ Standardize phrase/note structure
291
+
292
+ Args:
293
+ note (Dict): Raw note data
294
+
295
+ Returns:
296
+ Dict: Processed note structure
297
+ """
298
+ return {
299
+ "phrase": note.get("phrase", ""),
300
+ "meaning": note.get("meaning", ""),
301
+ "occurrences": note.get("occurrence", 1)
302
+ }
303
+
304
+
305
+ def _flatten_for_rag(self, poem: Dict) -> Dict:
306
+ """
307
+ Transform poem structure into RAG-optimized format
308
+
309
+ Args:
310
+ poem (Dict): Original poem structure
311
+
312
+ Returns:
313
+ Dict: Flattened structure with combined text fields
314
+ """
315
+ rag_poem = {
316
+ "poem_id": poem["id"],
317
+ "book_id": poem["book_id"],
318
+ "book_title": poem["book_title"],
319
+ "section_id": poem["section_id"],
320
+ "section_title": poem["section_title"],
321
+ "text_blocks": [],
322
+ "full_text": ""
323
+ }
324
+
325
+ # Extract English content from all sources
326
+ en_content = {
327
+ "descriptions": poem["content"]["descriptions"].get("en", ""),
328
+ "verses": [],
329
+ "phrases": []
330
+ }
331
+
332
+ # Process verses
333
+ for verse in poem["content"]["verses"]:
334
+ if "en" in verse["content"]:
335
+ en_content["verses"].append(verse["content"]["en"]["text"])
336
+ en_content["phrases"].extend(
337
+ f"{note['phrase']}: {note['meaning']}"
338
+ for note in verse["content"]["en"].get("notes", [])
339
+ )
340
+
341
+ # Build full text if English content exists
342
+ if en_content["verses"]:
343
+ rag_poem["full_text"] = "\n\n".join([
344
+ en_content["descriptions"],
345
+ "\n".join(en_content["verses"])
346
+ ])
347
+ rag_poem["text_blocks"] = en_content["verses"]
348
+ rag_poem["phrases"] = en_content["phrases"]
349
+ return rag_poem
350
+
351
+ logger.warning(f"No English content found for poem {poem['id']}")
352
+ return None
353
+
354
+
355
+ def _save_dataset(self):
356
+ """Save datasets with proper serialization checks"""
357
+ base_path = self.output_dir / "iqbal_poems"
358
+
359
+ # Save full dataset
360
+ with open(f"{base_path}_full.json", "w", encoding="utf-8") as f:
361
+ json.dump(self.dataset, f, ensure_ascii=True, indent=2)
362
+
363
+ # Save RAG-optimized poems (only those with English content)
364
+ rag_data = [p for p in self.dataset["poems"] if p is not None]
365
+
366
+ with open(f"{base_path}_rag.json", "w", encoding="utf-8") as f:
367
+ json.dump(rag_data, f, ensure_ascii=True, indent=2)
368
+
369
+ logger.info(f"Saved {len(rag_data)} RAG-ready poems")
370
+
371
+ def _load_yaml(self, path: Path) -> Dict:
372
+ """
373
+ Safe YAML loader with validation
374
+
375
+ Args:
376
+ path (Path): Path to YAML file
377
+
378
+ Returns:
379
+ Dict: Parsed YAML content
380
+ """
381
+ try:
382
+ with open(path, "r", encoding="utf-8") as f:
383
+ return yaml.safe_load(f)
384
+ except Exception as e:
385
+ logger.error(f"Failed loading YAML from {path}: {str(e)}")
386
+ raise
utils/dataset_downloader.py ADDED
@@ -0,0 +1,183 @@
1
+ # External imports
2
+ import os
3
+ import requests
4
+ import yaml
5
+ import time
6
+ import logging
7
+
8
+ from bs4 import BeautifulSoup
9
+ from pathlib import Path
10
+ from tqdm import tqdm
11
+
12
+
13
+ # Configure logging
14
+ logging.basicConfig(
15
+ level=logging.INFO,
16
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
17
+ handlers=[
18
+ logging.FileHandler("dataset_downloader.log"),
19
+ logging.StreamHandler()
20
+ ]
21
+ )
22
+ logger = logging.getLogger("DatasetDownloader")
23
+
24
+
25
+ # Constants at module level
26
+ SOURCES = {
27
+ "github_iqbal_demystified": "https://raw.githubusercontent.com/AzeemGhumman/iqbal-demystified-dataset/master/data",
28
+ "iqbal_cyberlibrary": "https://iqbalcyberlibrary.net",
29
+ "allama_iqbal_poetry": "https://blogs.library.mcgill.ca/islamicstudieslibrary/allama-iqbal-poetry-%DA%A9%D9%84%D8%A7%D9%85-%D8%B9%D9%84%D8%A7%D9%85%DB%81-%D9%85%D8%AD%D9%85%D8%AF-%D8%A7%D9%82%D8%A8%D8%A7%D9%84/",
30
+ "iqbal_review": "https://www.allamaiqbal.com/publications/journals/review/",
31
+ "rekhta": "https://www.rekhta.org/poets/allama-iqbal/ghazals"
32
+ }
33
+
34
+
35
+ class DatasetDownloader:
36
+ """
37
+ A class to download dataset from various sources.
38
+ """
39
+
40
+ def __init__(self, output_dir: str = "data", number_of_books: int = 11, max_workers: int = 5) -> None:
41
+ """Initialize the dataset downloader with configuration parameters.
42
+
43
+ Args:
44
+ output_dir (str): Directory to store the downloaded files. Defaults to "data".
45
+ max_workers (int): Maximum number of concurrent workers. Defaults to 5.
46
+ """
47
+ if max_workers < 1:
48
+ raise ValueError("max_workers must be at least 1")
49
+
50
+ self.output_dir = Path(output_dir)
51
+ self.max_workers = max_workers
52
+ self.number_of_books = number_of_books
53
+ # Constant variables
54
+ self.sources = SOURCES
55
+
56
+
57
+ def download_from_github(self, source_name: str = "github_iqbal_demystified"):
58
+ """Download dataset from GitHub."""
59
+ logger.info("Downloading dataset from GitHub")
60
+
61
+ # Check if the source name is valid
62
+ if source_name not in self.sources:
63
+ raise ValueError(f"Source name {source_name} not found in sources")
64
+
65
+ # Get the source name and base url
66
+ base_url = self.sources[source_name]
67
+ folders = ["lists", "poems"]
68
+
69
+ # Create the folders for the source
70
+ for folder in folders:
71
+ output_path = self.output_dir / source_name / folder
72
+ os.makedirs(output_path, exist_ok=True)
73
+
74
+ # Fetch the list metadata from the GitHub repository
75
+ book_ids = self._download_github_lists(source_name, base_url, folder="lists")
76
+ # Fetch the poems from the GitHub repository
77
+ poem_ids = self._download_github_poems(source_name, base_url, folder="poems", book_ids=book_ids)
78
+
79
+ logger.info(f"Completed fetching data from Iqbal Demystified GitHub repository. Total poems fetched: {len(poem_ids)}")
80
+
81
+
82
+ def _download_github_lists(self, source_name: str, base_url: str, folder: str) -> list:
83
+ """Fetch the list metadata from the GitHub repository."""
84
+
85
+ logger.info(f"Fetching book metadata from {folder} folder")
86
+
87
+ book_ids = []
88
+ # Fetch the metadata for each book along with the poems
89
+ for index in tqdm(range(self.number_of_books), desc="Fetching book metadata"):
90
+ book_id = f"{index+1:03}"
91
+ # Create the output path for the book
92
+ output_path = self.output_dir / source_name / folder / f"List_{book_id}.yaml"
93
+ # Fetch the metadata for the book using requests
94
+ metadata_url = f"{base_url}/lists/List_{book_id}.yaml"
95
+ # Skip if already downloaded
96
+ if output_path.exists():
97
+ logger.debug(f"List_{book_id}.yaml already exists, skipping download")
98
+ book_ids.append(book_id)
99
+ continue
100
+
101
+ try:
102
+ response = requests.get(metadata_url)
103
+ response.raise_for_status()
104
+ if response.status_code == 200:
105
+ with open(output_path, "w", encoding="utf-8") as f:
106
+ f.write(response.text)
107
+
108
+ book_ids.append(book_id)
109
+ logger.info(f"Successfully fetched List_{book_id}.yaml")
110
+ except Exception as e:
111
+ logger.error(f"Error fetching metadata for {book_id}: {e}")
112
+
113
+ # Respect rate limits
114
+ time.sleep(0.5)
115
+
116
+ logger.info(f"Fetched {len(book_ids)} book lists")
117
+ return book_ids
118
+
119
+
120
+ def _download_github_poems(self, source_name: str, base_url: str, folder: str, book_ids: list) -> list:
121
+ """Fetch the poems from the GitHub repository."""
122
+ # List to store the fetched poems
123
+ fetched_poems = []
124
+ # Fetch the poems for each book by first reading the list metadata and then fetching the poems
125
+ for id in tqdm(book_ids, desc=f"Fetching books metadata, poems and shers"):
126
+ metadata_path = self.output_dir / source_name / "lists" / f"List_{id}.yaml"
127
+ if not metadata_path.exists():
128
+ logger.error(f"Metadata file for book {id} does not exist")
129
+ continue
130
+
131
+ # Create directory for this book's poems
132
+ poems_path = self.output_dir / source_name / folder / id
133
+ os.makedirs(poems_path, exist_ok=True)
134
+
135
+ # Load and parse the list file
136
+ try:
137
+ with open(metadata_path, "r", encoding="utf-8") as f:
138
+ book_metadata = yaml.safe_load(f)
139
+ except Exception as e:
140
+ logger.error(f"Error parsing list file for book {id}: {str(e)}")
141
+ continue
142
+
143
+ # Extract all poem IDs from the list
144
+ poem_ids = []
145
+ for section in book_metadata.get('sections', []):
146
+ if 'poems' in section:
147
+ for poem in section['poems']:
148
+ if 'id' in poem:
149
+ poem_ids.append(poem['id'])
150
+
151
+ # Fetch each poem
153
+ for poem_id in tqdm(poem_ids, desc=f"Fetching poems for book {id}"):
154
+ poem_url = f"{base_url}/poems/{id}/{poem_id}.yaml"
155
+ output_path = poems_path / f"{poem_id}.yaml"
156
+
157
+ # Skip if already downloaded
158
+ if output_path.exists():
159
+ logger.debug(f"Poem {poem_id} already exists, skipping download")
160
+ fetched_poems.append(poem_id)
161
+ continue
162
+
163
+ try:
164
+ response = requests.get(poem_url, timeout=10)
165
+
166
+ if response.status_code == 200:
167
+ with open(output_path, "w", encoding="utf-8") as f:
168
+ f.write(response.text)
169
+
170
+ fetched_poems.append(poem_id)
171
+ logger.debug(f"Successfully fetched poem {poem_id}")
172
+ else:
173
+ logger.warning(f"Failed to fetch poem {poem_id}: {response.status_code}")
174
+
175
+ # Respect rate limits
176
+ time.sleep(0.5)
177
+
178
+ except Exception as e:
179
+ logger.error(f"Error fetching poem {poem_id}: {str(e)}")
180
+
181
+ logger.info(f"Fetched {len(fetched_poems)} poems for book {id}")
182
+ return fetched_poems
183
+
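
The commented-out lines in `utils/__init__.py` describe the intended data pipeline; spelled out as a runnable sketch (paths match the defaults used in this repository):

```python
from utils.dataset_downloader import DatasetDownloader
from utils.dataset_curator import DatasetCurator

# 1. Download the raw YAML book lists and poems from the GitHub source.
downloader = DatasetDownloader(output_dir="data")
downloader.download_from_github(source_name="github_iqbal_demystified")

# 2. Flatten them into the RAG-ready JSON expected by app/config.py.
curator = DatasetCurator(data_path="data", output_dir="data/processed_data")
dataset = curator.process_dataset(source="github_iqbal_demystified")
print(f"Books: {dataset['metadata']['total_books']}, Poems: {dataset['metadata']['total_poems']}")
```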
utils/error_handling.py ADDED
@@ -0,0 +1,15 @@
1
+ """Error handling utilities."""
2
+
3
+ import logging
4
+
5
+ logger = logging.getLogger(__name__)
6
+
7
+ def handle_rag_error(func):
8
+ """Decorator for handling RAG system errors."""
9
+ def wrapper(*args, **kwargs):
10
+ try:
11
+ return func(*args, **kwargs)
12
+ except Exception as e:
13
+ logger.error(f"Error in RAG system: {str(e)}")
14
+ return f"An error occurred: {str(e)}", []
15
+ return wrapper
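
Note that the failure path returns a `(message, [])` tuple, matching what callers of `IqbalRAGSystem.query_rag` unpack. A tiny illustration:

```python
from utils.error_handling import handle_rag_error

@handle_rag_error
def flaky_query(question):
    raise RuntimeError("vector store unavailable")  # simulated failure

answer, poem_ids = flaky_query("anything")
print(answer)     # "An error occurred: vector store unavailable"
print(poem_ids)   # []
```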