# Iqbal_Poetry_RAG/utils/dataset_downloader.py
# Standard library imports
import logging
import os
import time
from pathlib import Path

# Third-party imports
import requests
import yaml
from bs4 import BeautifulSoup
from tqdm import tqdm
# Configure logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
handlers=[
logging.FileHandler("dataset_downloader.log"),
logging.StreamHandler()
]
)
logger = logging.getLogger("DatasetDownloader")
# Constants at module level
SOURCES = {
"github_iqbal_demystified": "https://raw.githubusercontent.com/AzeemGhumman/iqbal-demystified-dataset/master/data",
"iqbal_cyberlibrary": "https://iqbalcyberlibrary.net",
"allama_iqbal_poetry": "https://blogs.library.mcgill.ca/islamicstudieslibrary/allama-iqbal-poetry-%DA%A9%D9%84%D8%A7%D9%85-%D8%B9%D9%84%D8%A7%D9%85%DB%81-%D9%85%D8%AD%D9%85%D8%AF-%D8%A7%D9%82%D8%A8%D8%A7%D9%84/",
"iqbal_review": "https://www.allamaiqbal.com/publications/journals/review/",
"rekhta": "https://www.rekhta.org/poets/allama-iqbal/ghazals"
}
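# NOTE: only "github_iqbal_demystified" is consumed by DatasetDownloader below;
# the remaining URLs are candidate sources kept for future scrapers.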
class DatasetDownloader:
"""
    A class to download the Iqbal poetry dataset from the configured sources.
"""
def __init__(self, output_dir: str = "data", number_of_books: int = 11, max_workers: int = 5) -> None:
"""Initialize the dataset downloader with configuration parameters.
Args:
output_dir (str): Directory to store the downloaded files. Defaults to "data".
max_workers (int): Maximum number of concurrent workers. Defaults to 5.
"""
if max_workers < 1:
raise ValueError("max_workers must be at least 1")
self.output_dir = Path(output_dir)
self.max_workers = max_workers
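        # NOTE: max_workers is stored but not yet used; the downloads below run
        # sequentially. It is kept for a future concurrent implementation.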
self.number_of_books = number_of_books
# Constant variables
self.sources = SOURCES
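    def _fetch_text(self, url: str, retries: int = 3, backoff: float = 1.0):
        """Hypothetical helper, not yet wired into the methods below: GET a URL
        with a timeout and simple exponential backoff, returning the response
        text or None on failure. A sketch of how the repeated
        requests/sleep/except pattern in this class could be factored out.
        """
        for attempt in range(retries):
            try:
                response = requests.get(url, timeout=10)
                response.raise_for_status()
                return response.text
            except requests.RequestException as e:
                logger.warning(f"Attempt {attempt + 1}/{retries} failed for {url}: {e}")
                time.sleep(backoff * (2 ** attempt))
        return None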
    def download_from_github(self, source_name: str = "github_iqbal_demystified") -> None:
        """Download the dataset from GitHub."""
logger.info("Downloading dataset from GitHub")
# Check if the source name is valid
if source_name not in self.sources:
raise ValueError(f"Source name {source_name} not found in sources")
# Get the source name and base url
base_url = self.sources[source_name]
folders = ["lists", "poems"]
# Create the folders for the source
for folder in folders:
output_path = self.output_dir / source_name / folder
os.makedirs(output_path, exist_ok=True)
# Fetch the list metadata from the GitHub repository
book_ids = self._download_github_lists(source_name, base_url, folder="lists")
# Fetch the poems from the GitHub repository
poem_ids = self._download_github_poems(source_name, base_url, folder="poems", book_ids=book_ids)
logger.info(f"Completed fetching data from Iqbal Demystified GitHub repository. Total poems fetched: {len(poem_ids)}")
def _download_github_lists(self, source_name: str, base_url: str, folder: str) -> list:
"""Fetch the list metadata from the GitHub repository."""
logger.info(f"Fetching book metadata from {folder} folder")
book_ids = []
# Fetch the metadata for each book along with the poems
for index in tqdm(range(self.number_of_books), desc="Fetching book metadata"):
book_id = f"{index+1:03}"
# Create the output path for the book
output_path = self.output_dir / source_name / folder / f"List_{book_id}.yaml"
# Fetch the metadata for the book using requests
metadata_url = f"{base_url}/lists/List_{book_id}.yaml"
# Skip if already downloaded
if output_path.exists():
logger.debug(f"List_{book_id}.yaml already exists, skipping download")
book_ids.append(book_id)
continue
            try:
                response = requests.get(metadata_url, timeout=10)
                response.raise_for_status()
                with open(output_path, "w", encoding="utf-8") as f:
                    f.write(response.text)
                book_ids.append(book_id)
                logger.info(f"Successfully fetched List_{book_id}.yaml")
except Exception as e:
logger.error(f"Error fetching metadata for {book_id}: {e}")
# Respect rate limits
time.sleep(0.5)
logger.info(f"Fetched {len(book_ids)} book lists")
return book_ids
    def _download_github_poems(self, source_name: str, base_url: str, folder: str, book_ids: list) -> list:
        """Fetch the poems from the GitHub repository and return the IDs of all fetched poems."""
        # Accumulates the IDs of poems fetched across all books
        fetched_poems = []
        # For each book, read its list metadata, then fetch the poems it references
        for book_id in tqdm(book_ids, desc="Parsing book lists and fetching poems"):
            metadata_path = self.output_dir / source_name / "lists" / f"List_{book_id}.yaml"
            if not metadata_path.exists():
                logger.error(f"Metadata file for book {book_id} does not exist")
                continue
            # Create directory for this book's poems
            poems_path = self.output_dir / source_name / folder / book_id
os.makedirs(poems_path, exist_ok=True)
# Load and parse the list file
try:
with open(metadata_path, "r", encoding="utf-8") as f:
book_metadata = yaml.safe_load(f)
except Exception as e:
logger.error(f"Error parsing list file for book {id}: {str(e)}")
continue
# Extract all poem IDs from the list
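            # Assumed List_*.yaml shape, inferred from the parsing below:
            # sections:
            #   - poems:
            #       - id: "001"
            #       - id: "002"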
poem_ids = []
for section in book_metadata.get('sections', []):
if 'poems' in section:
for poem in section['poems']:
if 'id' in poem:
poem_ids.append(poem['id'])
            # Fetch each poem listed for this book
            for poem_id in tqdm(poem_ids, desc=f"Fetching poems for book {book_id}"):
                poem_url = f"{base_url}/poems/{book_id}/{poem_id}.yaml"
output_path = poems_path / f"{poem_id}.yaml"
# Skip if already downloaded
if output_path.exists():
logger.debug(f"Poem {poem_id} already exists, skipping download")
fetched_poems.append(poem_id)
continue
try:
response = requests.get(poem_url, timeout=10)
if response.status_code == 200:
with open(output_path, "w", encoding="utf-8") as f:
f.write(response.text)
fetched_poems.append(poem_id)
print(f"Successfully fetched poem {poem_id}")
else:
print(f"Failed to fetch poem {poem_id}: {response.status_code}")
# Respect rate limits
time.sleep(0.5)
except Exception as e:
logger.error(f"Error fetching poem {poem_id}: {str(e)}")
logger.info(f"Fetched {len(fetched_poems)} poems for book {id}")
return fetched_poems
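

if __name__ == "__main__":
    # Minimal usage sketch; the output directory and book count are
    # illustrative defaults, not values required by the dataset.
    downloader = DatasetDownloader(output_dir="data", number_of_books=11)
    downloader.download_from_github()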