valleeneutral committed on
Commit
d963628
·
verified ·
1 Parent(s): acd4f5e

Upload 4 files

Browse files
Files changed (4) hide show
  1. app.py +144 -0
  2. docs.pkl +3 -0
  3. faiss_index.bin +3 -0
  4. index_to_docstore_id.pkl +3 -0
app.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import os
import pickle
import time

import faiss
import streamlit as st
from dotenv import load_dotenv
from langchain import OpenAI
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.document_loaders import UnstructuredURLLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore

load_dotenv()  # take environment variables from .env (especially the OpenAI API key)

st.title("Article/News Research Tool")
st.sidebar.title("Article URLs...")

# Initialize session state for Q&A history
if "qa_history" not in st.session_state:
    st.session_state.qa_history = []

# Ask the user how many URLs they want to input
num_urls = st.sidebar.number_input(
    "How many URLs do you want to process?", min_value=1, max_value=10, value=3
)

urls = [st.sidebar.text_input(f"URL {i+1}") for i in range(num_urls)]

process_url_clicked = st.sidebar.button("Process Article URLs")

main_placeholder = st.empty()
llm = OpenAI(temperature=0.5, max_tokens=500)

# Paths of the persisted artifacts: the raw FAISS index, the pickled document
# chunks, and the index-position -> docstore-id mapping.
index_path = "faiss_index.bin"
docs_path = "docs.pkl"
index_to_docstore_id_path = "index_to_docstore_id.pkl"

# Dimension of OpenAIEmbeddings vectors (1536 per the original code — confirm
# against the embedding model actually configured).
EMBEDDING_DIMENSION = 1536


def _build_and_save_index(article_urls):
    """Fetch the articles, split them into chunks, embed them into a FAISS
    index, and persist index + chunks + id mapping to disk.

    article_urls: list of non-empty URL strings entered in the sidebar.
    """
    loader = UnstructuredURLLoader(urls=article_urls)
    main_placeholder.text("Data Loading...Initiated...")
    data = loader.load()

    # Split into ~1000-character chunks for embedding.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000)
    main_placeholder.text("Text Splitter...Initiated...")
    docs = text_splitter.split_documents(data)

    # Create embeddings and build the FAISS store. Start from an EMPTY
    # docstore and mapping and let `add_documents` populate both consistently.
    # (The previous version pre-filled the docstore with str(i) keys and then
    # added the same documents again, so every chunk was stored twice — once
    # under str(i) and once under a generated UUID — and the mapping gained a
    # second, misaligned batch of entries.)
    embeddings = OpenAIEmbeddings()
    index = faiss.IndexFlatL2(EMBEDDING_DIMENSION)
    vector_store = FAISS(
        embedding_function=embeddings,
        index=index,
        docstore=InMemoryDocstore({}),
        index_to_docstore_id={},
    )
    vector_store.add_documents(docs)
    main_placeholder.text("Embedding Vector Building Initiated...")

    # Persist the three artifacts needed to rebuild the store later.
    faiss.write_index(vector_store.index, index_path)
    with open(docs_path, "wb") as f:
        pickle.dump(docs, f)
    with open(index_to_docstore_id_path, "wb") as f:
        pickle.dump(vector_store.index_to_docstore_id, f)


def _load_vector_store():
    """Rebuild the FAISS vector store from the persisted artifacts.

    Returns the FAISS store, or None when any artifact file is missing.
    """
    if not (
        os.path.exists(index_path)
        and os.path.exists(docs_path)
        and os.path.exists(index_to_docstore_id_path)
    ):
        return None

    index = faiss.read_index(index_path)
    with open(docs_path, "rb") as f:
        docs = pickle.load(f)
    with open(index_to_docstore_id_path, "rb") as f:
        index_to_docstore_id = pickle.load(f)

    # `docs` is pickled in insertion order, so FAISS position i holds docs[i].
    # Key the docstore by the SAVED ids so lookups through the mapping resolve
    # (the previous version hard-coded str(i) keys, which only matched the
    # mapping by accident of how it had been pre-filled at build time).
    docstore = InMemoryDocstore(
        {index_to_docstore_id[i]: doc for i, doc in enumerate(docs)}
    )
    embeddings = OpenAIEmbeddings()  # Recreate embeddings object for queries
    return FAISS(
        embedding_function=embeddings,
        index=index,
        docstore=docstore,
        index_to_docstore_id=index_to_docstore_id,
    )


if process_url_clicked:
    # Ignore blank sidebar inputs so the loader only fetches real URLs.
    _build_and_save_index([u for u in urls if u.strip()])

query = main_placeholder.text_input("Question: ")
if query:
    vector_store = _load_vector_store()
    if vector_store is not None:
        chain = RetrievalQAWithSourcesChain.from_llm(
            llm=llm, retriever=vector_store.as_retriever()
        )
        # result is a dictionary of this format --> {"answer": "", "sources": ""}
        result = chain.invoke({"question": query}, return_only_outputs=True)

        answer = result.get("answer", "No answer found.")
        sources = result.get("sources", "No sources available.")

        # Add to session state history
        st.session_state.qa_history.append(
            {"question": query, "answer": answer, "sources": sources}
        )

        st.subheader("Response:")
        st.write(result["answer"])

        # Display sources, if available (the chain returns them as one
        # newline-separated string).
        raw_sources = result.get("sources", "")
        if raw_sources:
            st.subheader("Sources:")
            for source in raw_sources.split("\n"):
                st.write(source)

# Display all questions and answers from the session
if st.session_state.qa_history:
    st.write("---------------------------------------------------------------------")
    st.subheader("History:")
    for entry in st.session_state.qa_history:
        st.write(f"**Q:** {entry['question']}")
        st.write(f"**A:** {entry['answer']}")
        st.write(f"**Sources:** {entry['sources']}")
docs.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6903cc803413644c4fb54ef160afb7b9a9d243ddda6af09062f7637ebfd74315
3
+ size 9623
faiss_index.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9aa11ea1b8f510b6a5fa520c86ed7a7ffcece407a82f37efd44d38206cb1e42c
3
+ size 61485
index_to_docstore_id.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:227dda40bef66805a4fba7be994a3804fb7aea24144a8b8fb99f597ba4b3d355
3
+ size 486