# NOTE: hosting-platform banner ("Spaces: Sleeping") captured by the export;
# kept as a comment so the file remains valid Python.
#%% | |
import sys | |
import os | |
from dotenv import load_dotenv | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from langchain_community.document_loaders import PyPDFLoader | |
from langchain_community.vectorstores import Chroma | |
from langchain_openai import AzureOpenAIEmbeddings, AzureChatOpenAI | |
# Load environment variables from a local .env file (no-op if the file is absent).
load_dotenv()
# Set up environment variables.
# NOTE: os.environ values must be strings — assigning None (a missing variable)
# raises a confusing "str expected, not NoneType" TypeError, so each required
# variable is validated explicitly and reported by name.
try:
    # Read but do not require the Tavily key here; downstream code may use it.
    tavily_api_key = os.getenv("TAVILY_API_KEY")

    # LangSmith tracing configuration.
    langchain_api_key = os.getenv("LANGCHAIN_API_KEY")
    if langchain_api_key is None:
        raise KeyError("LANGCHAIN_API_KEY is not set")
    os.environ["LANGCHAIN_TRACING_V2"] = "true"
    os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
    os.environ["LANGCHAIN_API_KEY"] = langchain_api_key
    os.environ["LANGCHAIN_PROJECT"] = "legalairag"

    # Azure OpenAI connection settings, consumed by the client setup below.
    azure_endpoint = os.getenv("API_BASE")
    api_key = os.getenv("API_KEY")
    api_version = os.getenv("API_VERSION")
    missing = [
        name
        for name, value in (
            ("API_BASE", azure_endpoint),
            ("API_KEY", api_key),
            ("API_VERSION", api_version),
        )
        if value is None
    ]
    if missing:
        # Fail fast with the exact variable names instead of a later,
        # harder-to-diagnose client-construction error.
        raise KeyError(f"missing required environment variables: {missing}")
    print("Environment variables loaded successfully.")
except Exception as e:
    print(f"Error loading environment variables: {e}")
    sys.exit(1)
# Set up Azure OpenAI embeddings and model.
try:
    # Both clients connect to the same Azure deployment, so the shared
    # connection settings are collected once and splatted into each.
    _azure_conn = dict(
        api_key=api_key,
        api_version=api_version,
        azure_endpoint=azure_endpoint,
    )
    embd = AzureOpenAIEmbeddings(**_azure_conn)
    llm = AzureChatOpenAI(**_azure_conn)
    print("Azure OpenAI embeddings and model set up successfully.")
except Exception as e:
    print(f"Error setting up Azure OpenAI: {e}")
    sys.exit(1)
# Normalise the working directory: when launched from a subdirectory of the
# "Ally" project root, step up one level so relative asset paths resolve,
# then make the root importable.
print("Starting Directory: ", os.getcwd())
if not os.getcwd().endswith("Ally"):
    os.chdir("..")
project_root = os.getcwd()
sys.path.append(project_root)
print("Current Directory: ", project_root)
# Function to check if vector store exists
def vector_store_exists(persist_directory: str) -> bool:
    """Return True if *persist_directory* is an existing, non-empty directory.

    Uses ``os.path.isdir`` rather than ``os.path.exists`` so that a stray
    regular file with the same name is treated as "no store" instead of
    letting ``os.listdir`` raise ``NotADirectoryError``.
    """
    return os.path.isdir(persist_directory) and bool(os.listdir(persist_directory))
# Load and process documents: read the source PDF, then split it into
# overlapping chunks sized for embedding.
try:
    print("Loading PDF document...")
    loader = PyPDFLoader("assets/data/Mandel-IntroEconTheory.pdf")
    docs = loader.load()
    print("PDF loaded successfully.")

    print("Splitting documents...")
    # Token-based sizing (tiktoken) keeps chunks within model limits;
    # the overlap preserves context across chunk boundaries.
    splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=500,
        chunk_overlap=100,
    )
    doc_splits = splitter.split_documents(docs)
    print(f"Documents split into {len(doc_splits)} chunks.")
except Exception as e:
    print(f"Error processing documents: {e}")
    sys.exit(1)
# Create or load vector store: reuse a previously persisted Chroma store when
# one is present, otherwise embed the fresh splits into a new one. Either way,
# expose the store through a retriever.
try:
    persist_directory = './vectordb'
    if vector_store_exists(persist_directory):
        print("Loading existing vector store...")
        # Loading an existing store takes the embedder via `embedding_function`.
        vectorstore = Chroma(
            persist_directory=persist_directory,
            embedding_function=embd,
            collection_name="rag-chroma",
        )
        print("Existing vector store loaded.")
    else:
        print("Creating new vector store...")
        vectorstore = Chroma.from_documents(
            documents=doc_splits,
            collection_name="rag-chroma",
            embedding=embd,
            persist_directory=persist_directory,
        )
        print("New vector store created and populated.")
    # Top-5 similarity search per query.
    retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
    print("Retriever set up successfully.")
except Exception as e:
    print(f"Error with vector store operations: {e}")
    sys.exit(1)
print("Index setup completed successfully.")