# External imports
import os
import requests
import yaml
import time
import logging
from bs4 import BeautifulSoup
from pathlib import Path
from tqdm import tqdm

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("dataset_downloader.log"),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger("DatasetDownloader")

# Constants at module level
SOURCES = {
    "github_iqbal_demystified": "https://raw.githubusercontent.com/AzeemGhumman/iqbal-demystified-dataset/master/data",
    "iqbal_cyberlibrary": "https://iqbalcyberlibrary.net",
    "allama_iqbal_poetry": "https://blogs.library.mcgill.ca/islamicstudieslibrary/allama-iqbal-poetry-%DA%A9%D9%84%D8%A7%D9%85-%D8%B9%D9%84%D8%A7%D9%85%DB%81-%D9%85%D8%AD%D9%85%D8%AF-%D8%A7%D9%82%D8%A8%D8%A7%D9%84/",
    "iqbal_review": "https://www.allamaiqbal.com/publications/journals/review/",
    "rekhta": "https://www.rekhta.org/poets/allama-iqbal/ghazals"
}


class DatasetDownloader:
    """A class to download the dataset from various sources."""

    def __init__(self, output_dir: str = "data", number_of_books: int = 11, max_workers: int = 5) -> None:
        """Initialize the dataset downloader with configuration parameters.

        Args:
            output_dir (str): Directory to store the downloaded files. Defaults to "data".
            number_of_books (int): Number of books to fetch list metadata for. Defaults to 11.
            max_workers (int): Maximum number of concurrent workers. Defaults to 5.
        """
        if max_workers < 1:
            raise ValueError("max_workers must be at least 1")
        self.output_dir = Path(output_dir)
        self.max_workers = max_workers
        self.number_of_books = number_of_books
        # Constant variables
        self.sources = SOURCES

    def download_from_github(self, source_name: str = "github_iqbal_demystified"):
        """Download the dataset from GitHub."""
        logger.info("Downloading dataset from GitHub")

        # Check that the source name is valid
        if source_name not in self.sources:
            raise ValueError(f"Source name {source_name} not found in sources")

        # Get the base URL for the source
        base_url = self.sources[source_name]
        folders = ["lists", "poems"]

        # Create the output folders for the source
        for folder in folders:
            output_path = self.output_dir / source_name / folder
            os.makedirs(output_path, exist_ok=True)

        # Fetch the list metadata from the GitHub repository
        book_ids = self._download_github_lists(source_name, base_url, folder="lists")

        # Fetch the poems from the GitHub repository
        poem_ids = self._download_github_poems(source_name, base_url, folder="poems", book_ids=book_ids)

        logger.info(f"Completed fetching data from Iqbal Demystified GitHub repository. "
                    f"Total poems fetched: {len(poem_ids)}")

    def _download_github_lists(self, source_name: str, base_url: str, folder: str) -> list:
        """Fetch the list metadata from the GitHub repository."""
        logger.info(f"Fetching book metadata from {folder} folder")
        book_ids = []

        # Fetch the list metadata for each book
        for index in tqdm(range(self.number_of_books), desc="Fetching book metadata"):
            book_id = f"{index + 1:03}"

            # Build the output path and source URL for the book's list file
            output_path = self.output_dir / source_name / folder / f"List_{book_id}.yaml"
            metadata_url = f"{base_url}/lists/List_{book_id}.yaml"

            # Skip if already downloaded
            if output_path.exists():
                logger.debug(f"List_{book_id}.yaml already exists, skipping download")
                book_ids.append(book_id)
                continue

            try:
                response = requests.get(metadata_url, timeout=10)
                response.raise_for_status()
                with open(output_path, "w", encoding="utf-8") as f:
                    f.write(response.text)
                book_ids.append(book_id)
                logger.info(f"Successfully fetched List_{book_id}.yaml")
            except Exception as e:
                logger.error(f"Error fetching metadata for book {book_id}: {e}")

            # Respect rate limits
            time.sleep(0.5)

        logger.info(f"Fetched {len(book_ids)} book lists")
        return book_ids

    def _download_github_poems(self, source_name: str, base_url: str, folder: str, book_ids: list) -> list:
        """Fetch the poems from the GitHub repository."""
        # List to accumulate the IDs of all fetched poems across books
        fetched_poems = []

        # For each book, read its list metadata and then fetch its poems
        for book_id in tqdm(book_ids, desc="Fetching book metadata, poems and shers"):
            metadata_path = self.output_dir / source_name / "lists" / f"List_{book_id}.yaml"
            if not metadata_path.exists():
                logger.error(f"Metadata file for book {book_id} does not exist")
                continue

            # Create the directory for this book's poems
            poems_path = self.output_dir / source_name / folder / book_id
            os.makedirs(poems_path, exist_ok=True)

            # Load and parse the list file (guard against an empty file parsing to None)
            try:
                with open(metadata_path, "r", encoding="utf-8") as f:
                    book_metadata = yaml.safe_load(f) or {}
            except Exception as e:
                logger.error(f"Error parsing list file for book {book_id}: {e}")
                continue

            # Extract all poem IDs from the list
            poem_ids = []
            for section in book_metadata.get('sections', []):
                if 'poems' in section:
                    for poem in section['poems']:
                        if 'id' in poem:
                            poem_ids.append(poem['id'])

            # Fetch each poem for this book
            book_fetched = []
            for poem_id in tqdm(poem_ids, desc=f"Fetching poems for book {book_id}"):
                poem_url = f"{base_url}/poems/{book_id}/{poem_id}.yaml"
                output_path = poems_path / f"{poem_id}.yaml"

                # Skip if already downloaded
                if output_path.exists():
                    logger.debug(f"Poem {poem_id} already exists, skipping download")
                    book_fetched.append(poem_id)
                    continue

                try:
                    response = requests.get(poem_url, timeout=10)
                    if response.status_code == 200:
                        with open(output_path, "w", encoding="utf-8") as f:
                            f.write(response.text)
                        book_fetched.append(poem_id)
                        logger.info(f"Successfully fetched poem {poem_id}")
                    else:
                        logger.warning(f"Failed to fetch poem {poem_id}: {response.status_code}")
                    # Respect rate limits
                    time.sleep(0.5)
                except Exception as e:
                    logger.error(f"Error fetching poem {poem_id}: {e}")

            logger.info(f"Fetched {len(book_fetched)} poems for book {book_id}")
            # Accumulate across books instead of overwriting, so the
            # returned list covers every book rather than only the last one
            fetched_poems.extend(book_fetched)

        return fetched_poems
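
# Example usage (a minimal sketch: running the module directly is an
# assumption, as the original file does not include an entry point;
# the arguments shown simply restate the defaults from __init__):
if __name__ == "__main__":
    downloader = DatasetDownloader(output_dir="data", number_of_books=11)
    downloader.download_from_github("github_iqbal_demystified")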