ashok2216 committed on
Commit
cf2d248
·
verified ·
1 Parent(s): 0c3d325

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +100 -1
app.py CHANGED
@@ -1,3 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import chromadb
2
  from chromadb.utils import embedding_functions
3
  from chromadb.config import Settings
@@ -7,7 +97,6 @@ import streamlit as st
7
  import fitz # PyMuPDF for PDF parsing
8
 
9
 
10
-
11
  # Configure ChromaDB with persistent SQLite database
12
  config = Settings(
13
  persist_directory="./chromadb_data",
@@ -25,6 +114,10 @@ def setup_chromadb():
25
  )
26
  return client, collection
27
 
 
 
 
 
28
  def extract_text_from_pdf(uploaded_file):
29
  with fitz.open(stream=uploaded_file.read(), filetype="pdf") as doc:
30
  text = ""
@@ -64,6 +157,11 @@ def main():
64
  uploaded_file = st.file_uploader("Upload your PDF file", type="pdf")
65
  if uploaded_file:
66
  try:
 
 
 
 
 
67
  pdf_text = extract_text_from_pdf(uploaded_file)
68
  st.success("Text extracted successfully!")
69
  st.text_area("Extracted Text:", pdf_text, height=300)
@@ -87,3 +185,4 @@ def main():
87
 
88
  if __name__ == "__main__":
89
  main()
 
 
1
+ # import chromadb
2
+ # from chromadb.utils import embedding_functions
3
+ # from chromadb.config import Settings
4
+ # from sentence_transformers import SentenceTransformer
5
+ # from transformers import pipeline
6
+ # import streamlit as st
7
+ # import fitz # PyMuPDF for PDF parsing
8
+
9
+
10
+
11
+ # # Configure ChromaDB with persistent SQLite database
12
+ # config = Settings(
13
+ # persist_directory="./chromadb_data",
14
+ # chroma_db_impl="sqlite",
15
+ # )
16
+
17
+ # # Initialize persistent client with SQLite
18
+ # def setup_chromadb():
19
+ # client = chromadb.PersistentClient(path="./chromadb_data")
20
+ # collection = client.get_or_create_collection(
21
+ # name="pdf_data",
22
+ # embedding_function=chromadb.utils.embedding_functions.SentenceTransformerEmbeddingFunction(
23
+ # model_name="sentence-transformers/all-MiniLM-L6-v2"
24
+ # ),
25
+ # )
26
+ # return client, collection
27
+
28
+ # def extract_text_from_pdf(uploaded_file):
29
+ # with fitz.open(stream=uploaded_file.read(), filetype="pdf") as doc:
30
+ # text = ""
31
+ # for page in doc:
32
+ # text += page.get_text()
33
+ # return text
34
+
35
+ # def add_pdf_text_to_db(collection, pdf_text):
36
+ # sentences = pdf_text.split("\n") # Split text into lines for granularity
37
+ # for idx, sentence in enumerate(sentences):
38
+ # if sentence.strip(): # Avoid empty lines
39
+ # collection.add(
40
+ # ids=[f"pdf_text_{idx}"],
41
+ # documents=[sentence],
42
+ # metadatas={"line_number": idx, "text": sentence}
43
+ # )
44
+
45
+ # def query_pdf_data(collection, query, retriever_model):
46
+ # results = collection.query(
47
+ # query_texts=[query],
48
+ # n_results=3
49
+ # )
50
+ # context = " ".join([doc for doc in results["documents"][0]])
51
+ # answer = retriever_model(f"Context: {context}\nQuestion: {query}")
52
+ # return answer, results["metadatas"]
53
+
54
+ # # Streamlit Interface
55
+ # def main():
56
+ # st.title("PDF Chatbot with Retrieval-Augmented Generation")
57
+ # st.write("Upload a PDF, and ask questions about its content!")
58
+
59
+ # # Initialize components
60
+ # client, collection = setup_chromadb()
61
+ # retriever_model = pipeline("text2text-generation", model="google/flan-t5-small") # Free LLM
62
+
63
+ # # File upload
64
+ # uploaded_file = st.file_uploader("Upload your PDF file", type="pdf")
65
+ # if uploaded_file:
66
+ # try:
67
+ # pdf_text = extract_text_from_pdf(uploaded_file)
68
+ # st.success("Text extracted successfully!")
69
+ # st.text_area("Extracted Text:", pdf_text, height=300)
70
+ # add_pdf_text_to_db(collection, pdf_text)
71
+ # st.success("PDF text has been added to the database. You can now query it!")
72
+ # except Exception as e:
73
+ # st.error(f"Error extracting text: {e}")
74
+
75
+ # query = st.text_input("Enter your query about the PDF:")
76
+ # if query:
77
+ # try:
78
+ # answer, metadata = query_pdf_data(collection, query, retriever_model)
79
+ # st.subheader("Answer:")
80
+ # st.write(answer[0]['generated_text'])
81
+ # st.subheader("Retrieved Context:")
82
+ # for meta in metadata[0]:
83
+ # st.write(meta)
84
+ # except Exception as e:
85
+ # st.error(f"An error occurred: {str(e)}")
86
+
87
+
88
+ # if __name__ == "__main__":
89
+ # main()
90
+
91
  import chromadb
92
  from chromadb.utils import embedding_functions
93
  from chromadb.config import Settings
 
97
  import fitz # PyMuPDF for PDF parsing
98
 
99
 
 
100
  # Configure ChromaDB with persistent SQLite database
101
  config = Settings(
102
  persist_directory="./chromadb_data",
 
114
  )
115
  return client, collection
116
 
117
+ # Clear the collection
118
def clear_collection(collection):
    """Remove every entry currently stored in *collection*.

    Passing an empty ``where={}`` filter to ``collection.delete`` is rejected
    by Chroma's filter validation instead of matching all records, so the
    stored ids are looked up first and deleted explicitly.

    Args:
        collection: A Chroma collection object exposing ``get`` and ``delete``.
    """
    # include=[] asks only for ids; documents/embeddings are not needed here.
    existing_ids = collection.get(include=[])["ids"]
    # Deleting with an empty id list is pointless (and may be rejected), so
    # skip the call when the collection is already empty.
    if existing_ids:
        collection.delete(ids=existing_ids)
120
+
121
  def extract_text_from_pdf(uploaded_file):
122
  with fitz.open(stream=uploaded_file.read(), filetype="pdf") as doc:
123
  text = ""
 
157
  uploaded_file = st.file_uploader("Upload your PDF file", type="pdf")
158
  if uploaded_file:
159
  try:
160
+ # Clear existing data
161
+ clear_collection(collection)
162
+ st.info("Existing data cleared from the database.")
163
+
164
+ # Extract and add new data
165
  pdf_text = extract_text_from_pdf(uploaded_file)
166
  st.success("Text extracted successfully!")
167
  st.text_area("Extracted Text:", pdf_text, height=300)
 
185
 
186
  if __name__ == "__main__":
187
  main()
188
+