# NOTE: hosting-platform banner ("Spaces: Sleeping") captured by the export;
# kept as a comment so the file remains valid Python.
#%% | |
import sys | |
import os | |
from dotenv import load_dotenv | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from langchain_community.document_loaders import PyPDFLoader | |
from langchain_community.vectorstores import Chroma | |
from langchain_openai import AzureOpenAIEmbeddings, AzureChatOpenAI | |
# Load environment variables from a local .env file (no-op if the file is absent).
load_dotenv()
# Set up environment variables.
# NOTE: os.environ values must be strings — assigning None (a missing variable)
# raises a confusing "str expected, not NoneType" TypeError, so each required
# variable is validated explicitly and reported by name.
try:
    # Read but do not require the Tavily key here; downstream code may use it.
    tavily_api_key = os.getenv("TAVILY_API_KEY")

    # LangSmith tracing configuration.
    langchain_api_key = os.getenv("LANGCHAIN_API_KEY")
    if langchain_api_key is None:
        raise KeyError("LANGCHAIN_API_KEY is not set")
    os.environ["LANGCHAIN_TRACING_V2"] = "true"
    os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
    os.environ["LANGCHAIN_API_KEY"] = langchain_api_key
    os.environ["LANGCHAIN_PROJECT"] = "legalairag"

    # Azure OpenAI connection settings, consumed by the client setup below.
    azure_endpoint = os.getenv("API_BASE")
    api_key = os.getenv("API_KEY")
    api_version = os.getenv("API_VERSION")
    missing = [
        name
        for name, value in (
            ("API_BASE", azure_endpoint),
            ("API_KEY", api_key),
            ("API_VERSION", api_version),
        )
        if value is None
    ]
    if missing:
        # Fail fast with the exact variable names instead of a later,
        # harder-to-diagnose client-construction error.
        raise KeyError(f"missing required environment variables: {missing}")
    print("Environment variables loaded successfully.")
except Exception as e:
    print(f"Error loading environment variables: {e}")
    sys.exit(1)
# Set up Azure OpenAI embeddings and model.
try:
    # Both clients connect to the same Azure deployment, so the shared
    # connection settings are collected once and splatted into each.
    _azure_conn = dict(
        api_key=api_key,
        api_version=api_version,
        azure_endpoint=azure_endpoint,
    )
    embd = AzureOpenAIEmbeddings(**_azure_conn)
    llm = AzureChatOpenAI(**_azure_conn)
    print("Azure OpenAI embeddings and model set up successfully.")
except Exception as e:
    print(f"Error setting up Azure OpenAI: {e}")
    sys.exit(1)
# Normalise the working directory: when launched from a subdirectory of the
# "Ally" project root, step up one level so relative asset paths resolve,
# then make the root importable.
print("Starting Directory: ", os.getcwd())
if not os.getcwd().endswith("Ally"):
    os.chdir("..")
project_root = os.getcwd()
sys.path.append(project_root)
print("Current Directory: ", project_root)
# Function to check if vector store exists
def vector_store_exists(persist_directory: str) -> bool:
    """Return True if *persist_directory* is an existing, non-empty directory.

    Uses ``os.path.isdir`` rather than ``os.path.exists`` so that a stray
    regular file with the same name is treated as "no store" instead of
    letting ``os.listdir`` raise ``NotADirectoryError``.
    """
    return os.path.isdir(persist_directory) and bool(os.listdir(persist_directory))
# Load and process documents: read the source PDF, then split it into
# overlapping chunks sized for embedding.
try:
    print("Loading PDF document...")
    loader = PyPDFLoader("assets/data/Mandel-IntroEconTheory.pdf")
    docs = loader.load()
    print("PDF loaded successfully.")

    print("Splitting documents...")
    # Token-based sizing (tiktoken) keeps chunks within model limits;
    # the overlap preserves context across chunk boundaries.
    splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=500,
        chunk_overlap=100,
    )
    doc_splits = splitter.split_documents(docs)
    print(f"Documents split into {len(doc_splits)} chunks.")
except Exception as e:
    print(f"Error processing documents: {e}")
    sys.exit(1)
# Create or load vector store: reuse a previously persisted Chroma store when
# one is present, otherwise embed the fresh splits into a new one. Either way,
# expose the store through a retriever.
try:
    persist_directory = './vectordb'
    if vector_store_exists(persist_directory):
        print("Loading existing vector store...")
        # Loading an existing store takes the embedder via `embedding_function`.
        vectorstore = Chroma(
            persist_directory=persist_directory,
            embedding_function=embd,
            collection_name="rag-chroma",
        )
        print("Existing vector store loaded.")
    else:
        print("Creating new vector store...")
        vectorstore = Chroma.from_documents(
            documents=doc_splits,
            collection_name="rag-chroma",
            embedding=embd,
            persist_directory=persist_directory,
        )
        print("New vector store created and populated.")
    # Top-5 similarity search per query.
    retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
    print("Retriever set up successfully.")
except Exception as e:
    print(f"Error with vector store operations: {e}")
    sys.exit(1)
print("Index setup completed successfully.")