vishanth10
init commit from repo
0181645
raw
history blame
2.78 kB
import boto3
import os
import urllib.parse
import logging
from botocore.exceptions import NoCredentialsError, ClientError
#bucket_name = "document-ingestion-drive-dev"
# AWS credentials are looked up in the process environment once, when this
# module is imported. Each value is None when its variable is not set.
aws_access_key_id = os.environ.get('AWS_ACCESS_KEY_ID')
aws_secret_access_key = os.environ.get('AWS_SECRET_ACCESS_KEY')
def get_s3_client(region_name='us-west-2'):
    """Create a boto3 S3 client using the module-level AWS credentials.

    Args:
        region_name: AWS region passed to ``boto3.client``. Defaults to
            ``'us-west-2'``, the value this function used to hard-code, so
            existing zero-argument callers behave as before.

    Returns:
        The object returned by ``boto3.client('s3', ...)``.

    Raises:
        Exception: Whatever ``boto3.client`` raises is logged and re-raised
            unchanged.
    """
    try:
        s3_client = boto3.client(
            's3',
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key,
            region_name=region_name,
        )
    except Exception as e:
        # A single handler is enough: the former NoCredentialsError branch
        # logged the same message and re-raised in exactly the same way.
        logging.error(f"Failed to initialize S3 client: {str(e)}")
        raise
    logging.info("S3 client initialized successfully.")
    return s3_client
def _detect_file_format(metadata, content_type):
    """Return the format named in *metadata*, else one implied by *content_type*.

    Args:
        metadata: Mapping of the object's custom metadata; a truthy
            ``'file_format'`` entry wins over everything else.
        content_type: The object's Content-Type string, or a falsy value
            when none is available.

    Returns:
        The ``'file_format'`` metadata value when present; otherwise
        ``'docx'``, ``'pdf'`` or ``'txt'`` when the Content-Type contains
        ``word``, ``pdf`` or ``text`` (checked in that order), and
        ``'unknown'`` when it contains none of them.

    Raises:
        ValueError: Neither the metadata nor the Content-Type is available.
    """
    file_format = metadata.get('file_format')
    if file_format:
        return file_format
    if not content_type:
        raise ValueError("File format could not be determined from metadata or Content-Type.")
    # Media type names are case-insensitive, so compare in lower case.
    normalized = content_type.lower()
    for marker, detected in (('word', 'docx'), ('pdf', 'pdf'), ('text', 'txt')):
        if marker in normalized:
            return detected
    return 'unknown'


def read_s3_file(bucket_name, key):
    """Download an S3 object and report its content, metadata and format.

    Args:
        bucket_name: Name of the bucket holding the object.
        key: Key of the object inside the bucket.

    Returns:
        A ``(content, metadata, file_format)`` tuple: the value read from
        the response body, the response's ``'Metadata'`` mapping (empty
        dict when absent), and the detected format string.

    Raises:
        FileNotFoundError: S3 reported the ``NoSuchKey`` error code.
        PermissionError: botocore raised ``NoCredentialsError``.
        ValueError: The format could not be determined from the metadata
            or the Content-Type.
        ClientError: Any other S3 client error, re-raised after logging.
    """
    try:
        logging.info(f"Reading file from S3: bucket={bucket_name}, key={key}")
        s3 = get_s3_client()
        response = s3.get_object(Bucket=bucket_name, Key=key)
        body = response['Body']
        try:
            content = body.read()
        finally:
            # Release the underlying HTTP stream even if the read fails.
            body.close()
        metadata = response.get('Metadata', {})
        file_format = _detect_file_format(metadata, response.get('ContentType'))
        logging.info(f"File read successfully from S3: bucket={bucket_name}, key={key}, format={file_format}")
        return content, metadata, file_format
    except NoCredentialsError as e:
        logging.error("AWS credentials not found.")
        raise PermissionError("AWS credentials not found.") from e
    except ClientError as e:
        # Inspect the error code instead of using `s3.exceptions.NoSuchKey`:
        # that expression is evaluated for every exception, and when
        # get_s3_client() itself failed `s3` is unbound, so the handler
        # raised UnboundLocalError and hid the real error.
        if e.response.get('Error', {}).get('Code') == 'NoSuchKey':
            logging.error(f"File not found in S3: bucket={bucket_name}, key={key}")
            raise FileNotFoundError(f"File not found: bucket={bucket_name}, key={key}") from e
        logging.error(f"Error reading file from S3: {str(e)}")
        raise
    except Exception as e:
        logging.error(f"Error reading file from S3: {str(e)}")
        raise
# Configure the root logger: INFO level, with each record rendered as
# "<timestamp> - <level name> - <message>".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)