import os

import PyPDF2

from src.logger import setup_logger

logger = setup_logger(__name__)


def _collect_page_texts(reader):
    """Extract text from each page of *reader*, skipping pages that fail.

    Logs the total page count first, then one line per page: an info line
    on success or an error line on failure. A page whose extraction raises
    is left out of the result rather than aborting the whole document.

    Returns:
        A list holding the value returned by ``extract_text()`` for every
        page that was extracted without raising, in page order.
    """
    collected = []
    page_count = len(reader.pages)
    logger.info(f"PDF loaded successfully. Total pages: {page_count}")

    # Page numbers in the log messages are 1-based.
    for number, pdf_page in enumerate(reader.pages, start=1):
        try:
            extracted = pdf_page.extract_text()
            collected.append(extracted)
            logger.info(f"Extracted text from page {number}/{page_count}")
        except Exception as exc:
            # Best effort: record the failure and carry on with the next page.
            logger.error(f"Error extracting text from page {number}: {str(exc)}")

    return collected


def dataloader(data_path):
    """Load a PDF from ``assets/data`` and return its text page by page.

    Args:
        data_path: Path of the PDF relative to the ``assets/data`` directory.

    Returns:
        A list with the extracted text of each page that could be read.
        An empty list is returned when the file does not exist or when any
        other error occurs while opening or reading the PDF; the error is
        logged rather than raised.
    """
    source = os.path.join("assets", "data", data_path)

    try:
        logger.info(f"Attempting to read PDF from: {source}")
        with open(source, "rb") as handle:
            page_texts = _collect_page_texts(PyPDF2.PdfReader(handle))
        logger.info("PDF text extraction completed")
        return page_texts
    except FileNotFoundError:
        logger.error(f"PDF file not found at {source}")
        return []
    except Exception as exc:
        logger.error(f"An error occurred while reading the PDF: {str(exc)}")
        return []