# External imports
import os
import requests
import yaml
import time
import logging
from bs4 import BeautifulSoup
from pathlib import Path
from tqdm import tqdm

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("dataset_downloader.log"),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger("DatasetDownloader")

# Constants at module level
SOURCES = {
    "github_iqbal_demystified": "https://raw.githubusercontent.com/AzeemGhumman/iqbal-demystified-dataset/master/data",
    "iqbal_cyberlibrary": "https://iqbalcyberlibrary.net",
    "allama_iqbal_poetry": "https://blogs.library.mcgill.ca/islamicstudieslibrary/allama-iqbal-poetry-%DA%A9%D9%84%D8%A7%D9%85-%D8%B9%D9%84%D8%A7%D9%85%DB%81-%D9%85%D8%AD%D9%85%D8%AF-%D8%A7%D9%82%D8%A8%D8%A7%D9%84/",
    "iqbal_review": "https://www.allamaiqbal.com/publications/journals/review/",
    "rekhta": "https://www.rekhta.org/poets/allama-iqbal/ghazals"
}


class DatasetDownloader:
    """A class to download the dataset from various sources."""

    def __init__(self, output_dir: str = "data", number_of_books: int = 11, max_workers: int = 5) -> None:
        """Initialize the dataset downloader with configuration parameters.

        Args:
            output_dir (str): Directory to store the downloaded files. Defaults to "data".
            number_of_books (int): Number of books to fetch list metadata for. Defaults to 11.
            max_workers (int): Maximum number of concurrent workers. Defaults to 5.
        """
        if max_workers < 1:
            raise ValueError("max_workers must be at least 1")
        self.output_dir = Path(output_dir)
        self.max_workers = max_workers
        self.number_of_books = number_of_books
        # Constant variables
        self.sources = SOURCES

    def download_from_github(self, source_name: str = "github_iqbal_demystified"):
        """Download the dataset from GitHub."""
        logger.info("Downloading dataset from GitHub")

        # Check that the source name is valid
        if source_name not in self.sources:
            raise ValueError(f"Source name {source_name} not found in sources")

        # Get the base URL for the source
        base_url = self.sources[source_name]
        folders = ["lists", "poems"]

        # Create the output folders for the source
        for folder in folders:
            output_path = self.output_dir / source_name / folder
            os.makedirs(output_path, exist_ok=True)

        # Fetch the list metadata from the GitHub repository
        book_ids = self._download_github_lists(source_name, base_url, folder="lists")

        # Fetch the poems from the GitHub repository
        poem_ids = self._download_github_poems(source_name, base_url, folder="poems", book_ids=book_ids)

        logger.info(f"Completed fetching data from Iqbal Demystified GitHub repository. "
                    f"Total poems fetched: {len(poem_ids)}")

    def _download_github_lists(self, source_name: str, base_url: str, folder: str) -> list:
        """Fetch the list metadata from the GitHub repository."""
        logger.info(f"Fetching book metadata from {folder} folder")
        book_ids = []

        # Fetch the list metadata for each book
        for index in tqdm(range(self.number_of_books), desc="Fetching book metadata"):
            book_id = f"{index + 1:03}"

            # Build the output path and source URL for the book's list file
            output_path = self.output_dir / source_name / folder / f"List_{book_id}.yaml"
            metadata_url = f"{base_url}/lists/List_{book_id}.yaml"

            # Skip if already downloaded
            if output_path.exists():
                logger.debug(f"List_{book_id}.yaml already exists, skipping download")
                book_ids.append(book_id)
                continue

            try:
                response = requests.get(metadata_url, timeout=10)
                response.raise_for_status()
                with open(output_path, "w", encoding="utf-8") as f:
                    f.write(response.text)
                book_ids.append(book_id)
                logger.info(f"Successfully fetched List_{book_id}.yaml")
            except Exception as e:
                logger.error(f"Error fetching metadata for book {book_id}: {e}")

            # Respect rate limits
            time.sleep(0.5)

        logger.info(f"Fetched {len(book_ids)} book lists")
        return book_ids

    def _download_github_poems(self, source_name: str, base_url: str, folder: str, book_ids: list) -> list:
        """Fetch the poems from the GitHub repository."""
        # List to accumulate the IDs of all fetched poems across books
        fetched_poems = []

        # For each book, read its list metadata and then fetch its poems
        for book_id in tqdm(book_ids, desc="Fetching book metadata, poems and shers"):
            metadata_path = self.output_dir / source_name / "lists" / f"List_{book_id}.yaml"
            if not metadata_path.exists():
                logger.error(f"Metadata file for book {book_id} does not exist")
                continue

            # Create the directory for this book's poems
            poems_path = self.output_dir / source_name / folder / book_id
            os.makedirs(poems_path, exist_ok=True)

            # Load and parse the list file (guard against an empty file parsing to None)
            try:
                with open(metadata_path, "r", encoding="utf-8") as f:
                    book_metadata = yaml.safe_load(f) or {}
            except Exception as e:
                logger.error(f"Error parsing list file for book {book_id}: {e}")
                continue

            # Extract all poem IDs from the list
            poem_ids = []
            for section in book_metadata.get('sections', []):
                if 'poems' in section:
                    for poem in section['poems']:
                        if 'id' in poem:
                            poem_ids.append(poem['id'])

            # Fetch each poem for this book
            book_fetched = []
            for poem_id in tqdm(poem_ids, desc=f"Fetching poems for book {book_id}"):
                poem_url = f"{base_url}/poems/{book_id}/{poem_id}.yaml"
                output_path = poems_path / f"{poem_id}.yaml"

                # Skip if already downloaded
                if output_path.exists():
                    logger.debug(f"Poem {poem_id} already exists, skipping download")
                    book_fetched.append(poem_id)
                    continue

                try:
                    response = requests.get(poem_url, timeout=10)
                    if response.status_code == 200:
                        with open(output_path, "w", encoding="utf-8") as f:
                            f.write(response.text)
                        book_fetched.append(poem_id)
                        logger.info(f"Successfully fetched poem {poem_id}")
                    else:
                        logger.warning(f"Failed to fetch poem {poem_id}: {response.status_code}")
                    # Respect rate limits
                    time.sleep(0.5)
                except Exception as e:
                    logger.error(f"Error fetching poem {poem_id}: {e}")

            logger.info(f"Fetched {len(book_fetched)} poems for book {book_id}")
            # Accumulate across books instead of overwriting, so the
            # returned list covers every book rather than only the last one
            fetched_poems.extend(book_fetched)

        return fetched_poems
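
# Example usage (a minimal sketch: running the module directly is an
# assumption, as the original file does not include an entry point;
# the arguments shown simply restate the defaults from __init__):
if __name__ == "__main__":
    downloader = DatasetDownloader(output_dir="data", number_of_books=11)
    downloader.download_from_github("github_iqbal_demystified")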