Mehmet Emin Aydin committed on
Commit
5d16466
·
unverified ·
1 Parent(s): 97d68e6

Create app.py

Browse files

Main project architecture implemented

Files changed (1) hide show
  1. app.py +215 -0
app.py ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+
4
+
5
+ import streamlit as st
6
+ import requests
7
+ import subprocess
8
+ import atexit
9
+ import os
10
+ import signal
11
+ import os
12
+ import PyPDF2
13
+ from docx import Document
14
+ from fastapi import UploadFile, FastAPI, File, Form, UploadFile, HTTPException
15
+ from langchain.text_splitter import CharacterTextSplitter
16
+ from langchain_community.embeddings import HuggingFaceEmbeddings
17
+ from langchain_community.vectorstores import FAISS
18
+ from langchain_google_genai import ChatGoogleGenerativeAI
19
+ import pickle
20
+ from datetime import datetime
21
+ import io
22
+ from dotenv import load_dotenv
23
class User:
    """Settings bundle for one caller: their name and the models to use.

    Instance attributes:
        username: Identifier supplied by the caller.
        llm: Chat model name, always ``"gemini-pro"``.
        embedder: Sentence-embedding model name, always the multilingual
            MiniLM model below.
    """

    def __init__(self, username):
        # Model choices are fixed; only the username varies per instance.
        self.llm, self.embedder = (
            "gemini-pro",
            "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
        )
        self.username = username
28
+
29
async def upload_documents(user: User, files: list[UploadFile]) -> tuple[str, int]:
    """Extract, chunk, embed and persist the uploaded files for *user*.

    Args:
        user: Supplies the username (used for the pickle file name) and the
            embedding model name.
        files: Uploaded files to index.

    Returns:
        ``(message, status_code)``: ``("Document uploaded successfully.", 200)``
        on success, ``("Failed to upload document.", 500)`` otherwise.
    """
    text = await _extract_text_from_document(files)
    chunks = await _chunk_text(text)
    if not chunks:
        # No text was extracted from any of the files, so there is nothing
        # to embed; report failure instead of building an empty index.
        return "Failed to upload document.", 500

    # _create_embeddings_and_save returns the vector store itself, not a
    # (name, status) pair, so it must not be tuple-unpacked.
    vector_store = await _create_embeddings_and_save(user, chunks)
    if vector_store is None:
        return "Failed to upload document.", 500
    return "Document uploaded successfully.", 200
37
+
38
+ async def _extract_text_from_document(files: list[UploadFile]) -> str:
39
+ text = ""
40
+ for file in files:
41
+ byte_object = await file.read()
42
+ file_name = file.filename
43
+ file_extension = os.path.splitext(file_name)[1]
44
+ if file_extension == '.txt':
45
+ text += byte_object.decode('utf-8')
46
+ elif file_extension == '.pdf':
47
+ pdf_reader = PyPDF2.PdfReader(io.BytesIO(byte_object))
48
+ for page_number in range(len(pdf_reader.pages)):
49
+ page = pdf_reader.pages[page_number]
50
+ text += page.extract_text()
51
+ elif file_extension == '.docx':
52
+ doc = Document(io.BytesIO(byte_object))
53
+ for paragraph in doc.paragraphs:
54
+ text += paragraph.text + "\n"
55
+ return text
56
+
57
async def _chunk_text(text: str) -> list[str]:
    """Split *text* into pieces suitable for embedding.

    The text is split on newlines with a target chunk size of 512 characters
    (measured with ``len``) and an overlap of 10.
    """
    splitter_options = {
        "separator": "\n",
        "chunk_size": 512,
        "chunk_overlap": 10,
        "length_function": len,
    }
    return CharacterTextSplitter(**splitter_options).split_text(text)
67
+
68
+ async def _create_embeddings_and_save(user: User, chunks: any) -> FAISS:
69
+ embeddings = HuggingFaceEmbeddings(model_name=user.embedder)
70
+ pkl_name = os.path.join(user.username + ".pkl")
71
+ vector_store = FAISS.from_texts(chunks, embeddings, metadatas=[{"source": f"{pkl_name}:{i}"} for i in range(len(chunks))])
72
+ with open(pkl_name, "wb") as f:
73
+ pickle.dump(vector_store, f)
74
+ return vector_store
75
+
76
async def ask_question(user: User, question: str, api_key: str) -> tuple[str, int]:
    """Answer *question* using the chunks stored for *user*.

    Args:
        user: Supplies the username (locates the pickled vector store) and
            the chat model name.
        question: The question text; also used as the similarity query.
        api_key: Google API key. When empty or ``None`` the key is taken from
            the environment, after loading a ``.env`` file if one exists.

    Returns:
        ``(answer, 200)`` on success, otherwise ``(error_message, 400)``.
    """
    vector_store = await _get_vector_file(user.username)
    if vector_store is None:
        return "Document not found.", 400

    if api_key:
        os.environ["GOOGLE_API_KEY"] = api_key
    else:
        # load_dotenv()'s return value only reports whether the .env file set
        # something, so check the variable itself: a key that is already
        # exported in the environment is valid too.
        load_dotenv()
        if not os.environ.get("GOOGLE_API_KEY"):
            return "API key not found.", 400

    llm = ChatGoogleGenerativeAI(model=user.llm, temperature=0, max_output_tokens=256, top_k=40, top_p=0.8)
    docs = vector_store.similarity_search(question)
    # Use up to three matches; a small store may return fewer, so slice
    # instead of indexing docs[0..2] directly.
    retrieved_chunks = "".join(doc.page_content for doc in docs[:3])
    system_message = "Figure out the answer of the question by the given information pieces. ALWAYS answer with the language of the question."
    prompt = system_message + "Question: " + question + " Context: " + retrieved_chunks
    try:
        response = llm.invoke(prompt)
    except Exception:
        # NOTE(review): every failure of the model call is reported as a bad
        # key; kept as-is because callers rely on this message and status.
        return "Wrong API key.", 400
    answer = response.content + " **<Most Related Chunk>** " + retrieved_chunks
    await _log(user, question, system_message, retrieved_chunks, response.content)
    return answer, 200


# The Streamlit view defined later in this file is also named ``ask_question``
# and rebinds the module-level name. Keep a private handle to the coroutine
# above so the HTTP endpoint below still reaches it.
_answer_question = ask_question


async def _get_vector_file(username: str) -> "FAISS | None":
    """Load the pickled vector store for *username*.

    Returns:
        The unpickled object from ``<username>.pkl``, or ``None`` when no
        such file exists.
    """
    try:
        with open(username + ".pkl", "rb") as f:
            # SECURITY NOTE: pickle.load can execute arbitrary code contained
            # in the file; only files written by this application should ever
            # be loaded here.
            return pickle.load(f)
    except FileNotFoundError:
        return None


async def _log(user: User, question: str, system_message: str, retrieved_chunks: str, answer: str):
    """Append one record describing a question/answer exchange to ``log.txt``."""
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    log_message = (
        f"{timestamp}, Username: {user.username}, Question: {question}, "
        f"LLM: {user.llm}, Embedder: {user.embedder}, System Message: {system_message}, "
        f"Retrieved Texts: {retrieved_chunks}, Answer: {answer}\n"
    )
    with open("log.txt", "a", encoding="utf-8") as file:
        file.write(log_message)


app = FastAPI()


@app.post("/document-uploader")
async def document_uploader(username: str = Form(...), files: list[UploadFile] = File(...)):
    """HTTP endpoint: index the uploaded files for *username*.

    Raises:
        HTTPException: With the status and message reported by
            ``upload_documents`` when it does not return 200.
    """
    user = User(username=username)
    response, status_code = await upload_documents(user, files)
    if status_code == 200:
        # NOTE(review): ``{response}`` is a one-element set, not a dict;
        # left unchanged so the response body clients receive stays the same.
        return {response}
    else:
        raise HTTPException(status_code=status_code, detail=response)


@app.post("/question-answerer")
async def question_answerer(username: str = Form(...), question: str = Form(...), api_key = File(None)):
    """HTTP endpoint: answer *question* from the documents of *username*.

    Raises:
        HTTPException: With the status and message reported by the
            question-answering coroutine when it does not return 200.
    """
    user = User(username=username)
    # Call through the private alias: by the time a request arrives, the
    # module-level name ``ask_question`` refers to the Streamlit view.
    response, status_code = await _answer_question(user, question, api_key)
    if status_code == 200:
        # NOTE(review): one-element set, see document_uploader.
        return {response}
    else:
        raise HTTPException(status_code=status_code, detail=response)
136
+
137
+
138
def main():
    """Render the page: a title, a view selector, then the selected view."""
    st.title("Free Multilingual RAG")

    # Map each selector label to the function that renders that view.
    views = {
        "Upload Document": upload_document,
        "Ask Question": ask_question,
    }
    active_tab = st.radio("Upload documents first, ask questions later:", list(views))

    render = views.get(active_tab)
    if render is not None:
        render()
148
+
149
def upload_document():
    """Streamlit view: collect a username and files, then send them to the backend.

    The files are posted to the local ``/document-uploader`` endpoint and the
    outcome is shown as a success or error message.
    """
    st.write("Several files can be uploaded, each upload crushes the old one. Depending on the number and size of files, the upload process may take a long time.")

    username = st.text_input("Enter a username (just something that represents you):")
    uploaded_files = st.file_uploader("Upload your documents (for now it only works with files that has .txt, .pdf or .docx extension):", accept_multiple_files=True)

    if not uploaded_files:
        return

    st.write("Number of uploaded files:", len(uploaded_files))
    for uploaded_file in uploaded_files:
        file_details = {"FileName": uploaded_file.name, "FileType": uploaded_file.type, "FileSize": uploaded_file.size}
        st.write(file_details)

    if not username:
        # The backend names the stored index after the username, so do not
        # post until one has been entered (same check as the question view).
        st.warning("Please enter a username.")
        return

    files = [("files", (uploaded_file.name, uploaded_file, uploaded_file.type)) for uploaded_file in uploaded_files]
    payload = {'username': username}

    with st.spinner('Loading...'):
        try:
            response = requests.post("http://localhost:8000/document-uploader/", files=files, data=payload)
        except requests.exceptions.RequestException as exc:
            # Backend unreachable: show the error in the page instead of
            # failing the whole script run.
            st.error(f"Error: {exc}")
            return

    if response.status_code == 200:
        st.success(response.text)
    else:
        # st.error takes a single message body, so build one string.
        st.error("Error: " + response.text)
173
+
174
+
175
def ask_question():
    """Streamlit view: collect username, API key and question, then query the backend.

    On "Ask", the form values are posted to the local ``/question-answerer``
    endpoint and the response is shown as a success or error message.
    """
    username = st.text_input("Enter a username (just something that represents you):")
    api_key = st.text_input("Add your Google API key. It is free. Key acquisition video: [https://www.youtube.com/watch?v=brCkpzAD0gc]: (If you do not trust you can download and use the app in your local too)", type="password")
    question = st.text_area("Enter the question you want to ask in your document (the more detailed your question, the more accurate an answer you will get): ")

    if not st.button("Ask"):
        return
    if not question:
        st.warning("Please enter a question.")
        return
    if not username:
        st.warning("Please enter a username.")
        return

    payload = {'username': username, 'question': question, 'api_key': api_key}

    with st.spinner('Question is getting answered...'):
        try:
            response = requests.post("http://localhost:8000/question-answerer/", data=payload)
        except requests.exceptions.RequestException as exc:
            # Backend unreachable: show the error in the page instead of
            # failing the whole script run.
            st.error(f"Error: {exc}")
            return

    if response.status_code == 200:
        st.success("Answer: " + response.text)
    else:
        print(response)
        # st.error takes a single message body, so build one string.
        st.error("Error: " + response.text)
196
+
197
# Handle to the uvicorn child process; None until run_fastapi() has started it.
uvicorn_process = None


def run_fastapi():
    """Start the FastAPI backend under uvicorn unless a handle already exists.

    The server is launched as a child process serving ``app:app`` on
    127.0.0.1:8000, and its handle is stored in ``uvicorn_process``.
    """
    global uvicorn_process
    if uvicorn_process is not None:
        return
    server_command = ["uvicorn", "app:app", "--host", "127.0.0.1", "--port", "8000"]
    uvicorn_process = subprocess.Popen(server_command)
    print("FastAPI server has been started.")
204
+
205
def cleanup():
    """Stop the uvicorn child process, if one was started.

    Safe to call when no server was started, when the child has already
    exited, and more than once: the handle is cleared after the first
    successful call.
    """
    global uvicorn_process
    if uvicorn_process is None:
        return
    # Only signal a child that is still running; signalling a process that
    # has already exited would raise at interpreter shutdown.
    if uvicorn_process.poll() is None:
        uvicorn_process.terminate()
        uvicorn_process.wait()
    uvicorn_process = None
    print("FastAPI server has been closed.")
211
+
212
if __name__ == "__main__":
    # Start the backend before rendering the UI, since the views post to it.
    run_fastapi()
    # Make sure the child server is stopped when this interpreter exits.
    atexit.register(cleanup)
    main()