LegalAlly / src /dataloader.py
Rohil Bansal
New structure
7a7b50b
raw
history blame
1.19 kB
import PyPDF2
import os
from src.logger import setup_logger
# Logger for this module, built by the project's setup_logger helper and
# named after the module (__name__).
logger = setup_logger(__name__)
def _collect_page_texts(stream):
    """Extract the text of each page of the PDF readable from *stream*.

    Pages are visited in document order and numbered from 1 in log
    messages. A page whose extraction raises is logged and left out of the
    result, so the returned list can be shorter than the page count.
    """
    reader = PyPDF2.PdfReader(stream)
    total_pages = len(reader.pages)
    logger.info(f"PDF loaded successfully. Total pages: {total_pages}")

    collected = []
    for i, page in enumerate(reader.pages, start=1):
        try:
            collected.append(page.extract_text())
            logger.info(f"Extracted text from page {i}/{total_pages}")
        except Exception as e:
            # Best effort: one unreadable page must not abort the document.
            logger.error(f"Error extracting text from page {i}: {str(e)}")
    return collected


def dataloader(data_path):
    """Load a PDF from ``assets/data`` and return its text, page by page.

    Args:
        data_path: File name (or relative path) joined onto ``assets/data``.

    Returns:
        A list holding the extracted text of each page that could be
        processed, in page order. An empty list is returned when the file
        does not exist or when any other error interrupts reading; in that
        case text already extracted from earlier pages is discarded.
    """
    pdf_path = os.path.join('assets', 'data', data_path)
    try:
        logger.info(f"Attempting to read PDF from: {pdf_path}")
        with open(pdf_path, 'rb') as stream:
            pages = _collect_page_texts(stream)
        logger.info("PDF text extraction completed")
    except FileNotFoundError:
        logger.error(f"PDF file not found at {pdf_path}")
        pages = []
    except Exception as e:
        # Any other failure (unreadable/corrupt PDF, I/O error) is reported
        # and turned into an empty result rather than propagated.
        logger.error(f"An error occurred while reading the PDF: {str(e)}")
        pages = []
    return pages