Spaces:
Sleeping
Sleeping
import PyPDF2 | |
import os | |
from src.logger import setup_logger | |
logger = setup_logger(__name__) | |
def dataloader(data_path):
    """Extract the text of a PDF stored under ``assets/data``.

    Args:
        data_path: Path of the PDF relative to the ``assets/data`` directory.

    Returns:
        A list holding the extracted text of each page, in page order.
        A page whose extraction raises is logged and left out, so list
        positions do not necessarily match page numbers. If the file is
        missing, or any other error occurs while opening or parsing it,
        the error is logged and an empty list is returned.
    """
    # Built outside the ``try`` so an invalid ``data_path`` still raises.
    source = os.path.join('assets', 'data', data_path)
    collected = []

    try:
        logger.info(f"Attempting to read PDF from: {source}")
        with open(source, 'rb') as handle:
            reader = PyPDF2.PdfReader(handle)
            page_count = len(reader.pages)
            logger.info(f"PDF loaded successfully. Total pages: {page_count}")

            for number, current in enumerate(reader.pages, start=1):
                # Best effort per page: one bad page must not abort the rest.
                try:
                    collected.append(current.extract_text())
                    logger.info(f"Extracted text from page {number}/{page_count}")
                except Exception as page_error:
                    reason = str(page_error)
                    logger.error(f"Error extracting text from page {number}: {reason}")

            logger.info("PDF text extraction completed")
    except FileNotFoundError:
        logger.error(f"PDF file not found at {source}")
    except Exception as read_error:
        reason = str(read_error)
        logger.error(f"An error occurred while reading the PDF: {reason}")
    else:
        return collected

    # Reached only after a handled failure; partial results are discarded.
    return []