Spaces:

valleeneutral
/

article_research_bot

Running

App Files Community

article_research_bot / app.py

valleeneutral

Upload 4 files

d963628 verified 12 days ago

raw

history blame

5.43 kB

	import os
	import streamlit as st
	import pickle
	import faiss
	import time
	from langchain import OpenAI
	from langchain.chains import RetrievalQAWithSourcesChain
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain.document_loaders import UnstructuredURLLoader
	from langchain.embeddings import OpenAIEmbeddings
	from langchain.vectorstores import FAISS
	from langchain_community.docstore.in_memory import InMemoryDocstore
	from langchain.schema import Document
	from dotenv import load_dotenv

	# Read variables from a local .env file into the environment
	# (this is where the OpenAI API key is expected to come from).
	load_dotenv()

	# Page headings: one for the main area, one above the sidebar URL fields.
	st.title("Article/News Research Tool")
	st.sidebar.title("Article URLs...")

	# Create the question/answer log the first time it is needed; the
	# question-answering step appends to it and the history section reads it.
	if "qa_history" not in st.session_state:
	    st.session_state["qa_history"] = []

	# Let the user pick how many URL fields the sidebar shows (1 to 10, default 3).
	num_urls = st.sidebar.number_input(
	    "How many URLs do you want to process?",
	    min_value=1,
	    max_value=10,
	    value=3,
	)

	# Render one sidebar text box per requested URL and collect the values in
	# order; a field that is left blank contributes an empty string.
	urls = [st.sidebar.text_input(f"URL {slot + 1}") for slot in range(num_urls)]

	# Files written by the indexing step and read back by the question step.
	index_path, docs_path, index_to_docstore_id_path = (
	    "faiss_index.bin",
	    "docs.pkl",
	    "index_to_docstore_id.pkl",
	)

	# Sidebar button that triggers (re)indexing of the entered URLs.
	process_url_clicked = st.sidebar.button("Process Article URLs")

	# Single slot in the main area, used first for status text and later for
	# the question input.
	main_placeholder = st.empty()

	# Completion model used to answer questions over the retrieved chunks.
	llm = OpenAI(max_tokens=500, temperature=0.5)

	if process_url_clicked:
	    # Indexing pipeline: load the pages, split them into chunks, embed the
	    # chunks into a FAISS index, then persist the index, the chunks and the
	    # index -> docstore-id mapping to disk for the question step.

	    # Sidebar fields the user left empty arrive as "" (or stray whitespace);
	    # drop them so the loader is only handed real URLs.
	    valid_urls = [url.strip() for url in urls if url and url.strip()]

	    if not valid_urls:
	        # Reported in the sidebar because main_placeholder is reused for the
	        # question box further down, which would replace the message.
	        st.sidebar.warning("Please enter at least one URL before processing.")
	    else:
	        # load data
	        main_placeholder.text("Data Loading...Initiated...")
	        loader = UnstructuredURLLoader(urls=valid_urls)
	        data = loader.load()

	        # split data
	        main_placeholder.text("Text Splitter...Initiated...")
	        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000)
	        docs = text_splitter.split_documents(data)

	        if not docs:
	            # Nothing to embed; stop here instead of building and saving an
	            # empty index.
	            st.sidebar.error(
	                "No text could be extracted from the given URLs; nothing was indexed."
	            )
	        else:
	            # create embeddings and save them to a FAISS index
	            main_placeholder.text("Embedding Vector Building Initiated...")
	            embeddings = OpenAIEmbeddings()

	            # Build the store in one step so the index dimension comes from
	            # the embeddings actually returned rather than a hard-coded
	            # 1536. Explicit ids "0".."n-1" make the saved mapping exactly
	            # {i: str(i)}, which matches the docstore keys the question
	            # step rebuilds from docs.pkl. The previous approach pre-seeded
	            # the docstore/mapping and then called add_documents, which
	            # stored every chunk a second time under a generated id and
	            # added mapping entries with no vectors behind them.
	            doc_ids = [str(i) for i in range(len(docs))]
	            vector_store = FAISS.from_documents(docs, embeddings, ids=doc_ids)

	            # Save the FAISS index and the documents separately
	            faiss.write_index(vector_store.index, index_path)
	            with open(docs_path, "wb") as f:
	                pickle.dump(docs, f)

	            # Save the index_to_docstore_id mapping
	            with open(index_to_docstore_id_path, "wb") as f:
	                pickle.dump(vector_store.index_to_docstore_id, f)


	# Question step: reload the persisted index, answer the question with the
	# retrieval chain, record the exchange in the session history and show it.
	query = main_placeholder.text_input("Question: ")
	if query:
	    store_files = (index_path, docs_path, index_to_docstore_id_path)
	    if not all(os.path.exists(path) for path in store_files):
	        # Previously a question asked before indexing was silently ignored.
	        st.warning(
	            "No processed articles found. Enter URLs in the sidebar and click "
	            "'Process Article URLs' first."
	        )
	    else:
	        # Load the FAISS index and documents
	        index = faiss.read_index(index_path)
	        # NOTE(security): pickle.load can execute arbitrary code. These
	        # files are written by this app's indexing step; do not point the
	        # paths at files from an untrusted source.
	        with open(docs_path, "rb") as f:
	            docs = pickle.load(f)
	        with open(index_to_docstore_id_path, "rb") as f:
	            index_to_docstore_id = pickle.load(f)

	        # Rebuild the docstore with the same "0".."n-1" keys used at
	        # indexing time so the loaded mapping resolves to these documents.
	        docstore = InMemoryDocstore({str(i): doc for i, doc in enumerate(docs)})
	        embeddings = OpenAIEmbeddings()  # Recreate embeddings object
	        vector_store = FAISS(
	            embedding_function=embeddings,
	            index=index,
	            docstore=docstore,
	            index_to_docstore_id=index_to_docstore_id,
	        )

	        chain = RetrievalQAWithSourcesChain.from_llm(
	            llm=llm, retriever=vector_store.as_retriever()
	        )
	        # result will be a dictionary of this format --> {"answer": "", "sources": ""}
	        result = chain.invoke({"question": query}, return_only_outputs=True)

	        # Extract the result, with fallbacks if a key is missing
	        answer = result.get("answer", "No answer found.")
	        sources = result.get("sources", "No sources available.")

	        # Add to session state history. The script reruns on every widget
	        # interaction while the same question is still in the box, so a
	        # repeat of the latest question replaces that entry instead of
	        # appending a duplicate.
	        entry = {"question": query, "answer": answer, "sources": sources}
	        history = st.session_state.qa_history
	        if history and history[-1]["question"] == query:
	            history[-1] = entry
	        else:
	            history.append(entry)

	        st.subheader("Response:")
	        # Use the defaulted value so a missing "answer" key cannot raise.
	        st.write(answer)

	        # Display sources, if available (one per line)
	        shown_sources = result.get("sources", "")
	        if shown_sources:
	            st.subheader("Sources:")
	            for source in shown_sources.split("\n"):
	                st.write(source)

	# Show every question/answer pair recorded during this session, oldest first.
	past_exchanges = st.session_state["qa_history"]
	if past_exchanges:
	    st.write("---------------------------------------------------------------------")
	    st.subheader("History:")
	    # Label -> entry key, in the order the lines are printed for each exchange.
	    fields = (("Q", "question"), ("A", "answer"), ("Sources", "sources"))
	    for exchange in past_exchanges:
	        for label, key in fields:
	            st.write(f"{label}: {exchange[key]}")