|
import boto3 |
|
import os |
|
import urllib.parse |
|
import logging |
|
from botocore.exceptions import NoCredentialsError, ClientError |
|
|
|
|
|
# AWS credentials, looked up once at import time from the process environment.
# Either value is None when its environment variable is not set.
aws_access_key_id = os.environ.get('AWS_ACCESS_KEY_ID')
aws_secret_access_key = os.environ.get('AWS_SECRET_ACCESS_KEY')
|
|
|
|
|
|
|
def get_s3_client(region_name='us-west-2'):
    """Create and return a boto3 S3 client.

    The client is built with the module-level ``aws_access_key_id`` and
    ``aws_secret_access_key`` values.

    Args:
        region_name: AWS region the client is bound to. Defaults to
            ``'us-west-2'``, the region this module previously hard-coded,
            so existing zero-argument callers are unaffected.

    Returns:
        The object returned by ``boto3.client('s3', ...)``.

    Raises:
        Exception: Any error raised while constructing the client is logged
            and re-raised unchanged.
    """
    try:
        s3_client = boto3.client(
            's3',
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key,
            region_name=region_name,
        )
    except Exception as e:
        # A single handler is enough: the former NoCredentialsError branch
        # logged the same message and re-raised in exactly the same way.
        logging.error(f"Failed to initialize S3 client: {str(e)}")
        raise

    logging.info("S3 client initialized successfully.")
    return s3_client
|
|
|
|
|
def _infer_file_format(content_type):
    """Map an S3 ``ContentType`` value to a short file-format label.

    Args:
        content_type: The ``ContentType`` string from a ``get_object``
            response, or a falsy value when the response has none.

    Returns:
        ``'docx'``, ``'pdf'`` or ``'txt'`` when the content type contains
        ``'word'``, ``'pdf'`` or ``'text'`` respectively (first match wins,
        checked in that order); ``'unknown'`` otherwise.

    Raises:
        ValueError: If ``content_type`` is missing or empty.
    """
    if not content_type:
        raise ValueError("File format could not be determined from metadata or Content-Type.")

    for marker, label in (('word', 'docx'), ('pdf', 'pdf'), ('text', 'txt')):
        if marker in content_type:
            return label
    return 'unknown'


def read_s3_file(bucket_name, key):
    """Read an object from S3 and determine its file format.

    The format is taken from the object's ``file_format`` metadata entry
    when present; otherwise it is inferred from the response's
    ``ContentType``.

    Args:
        bucket_name: Name of the S3 bucket.
        key: Key of the object inside the bucket.

    Returns:
        A ``(content, metadata, file_format)`` tuple: the fully read body,
        the response's ``Metadata`` mapping (``{}`` when absent) and the
        format label.

    Raises:
        FileNotFoundError: If S3 reports the key does not exist.
        PermissionError: If no AWS credentials are available.
        ValueError: If neither metadata nor Content-Type yields a format.
        botocore.exceptions.ClientError: For any other S3 client error.
        Exception: Any other failure is logged and re-raised unchanged.
    """
    try:
        logging.info(f"Reading file from S3: bucket={bucket_name}, key={key}")
        s3 = get_s3_client()
        response = s3.get_object(Bucket=bucket_name, Key=key)

        content = response['Body'].read()
        metadata = response.get('Metadata', {})

        file_format = metadata.get('file_format')
        if not file_format:
            file_format = _infer_file_format(response.get('ContentType'))

        logging.info(f"File read successfully from S3: bucket={bucket_name}, key={key}, format={file_format}")
        return content, metadata, file_format
    except NoCredentialsError as e:
        logging.error("AWS credentials not found.")
        raise PermissionError("AWS credentials not found.") from e
    except ClientError as e:
        # The missing-key case is recognised by its error code rather than
        # via ``s3.exceptions.NoSuchKey``: ``s3`` is unbound when
        # get_s3_client() itself fails, and referencing it in an except
        # clause would then raise UnboundLocalError and hide the real error.
        error_code = e.response.get('Error', {}).get('Code')
        if error_code == 'NoSuchKey':
            logging.error(f"File not found in S3: bucket={bucket_name}, key={key}")
            raise FileNotFoundError(f"File not found: bucket={bucket_name}, key={key}") from e
        logging.error(f"Error reading file from S3: {str(e)}")
        raise
    except Exception as e:
        logging.error(f"Error reading file from S3: {str(e)}")
        raise
|
|
|
|
|
|
|
# Set up root logging for this module's log calls: INFO level, with each
# record formatted as "<timestamp> - <level name> - <message>".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)