Spaces:

ashok2216
/

pdf-chatbot

Sleeping

App Files Files Community

ashok2216 committed on Nov 19, 2024

Commit

cf2d248

verified ·

1 Parent(s): 0c3d325

Update app.py

Browse files

Files changed (1) hide show

app.py +100 -1

app.py CHANGED Viewed

@@ -1,3 +1,93 @@
 import chromadb
 from chromadb.utils import embedding_functions
 from chromadb.config import Settings
@@ -7,7 +97,6 @@ import streamlit as st
 import fitz  # PyMuPDF for PDF parsing
 # Configure ChromaDB with persistent SQLite database
 config = Settings(
     persist_directory="./chromadb_data",
@@ -25,6 +114,10 @@ def setup_chromadb():
     )
     return client, collection
 def extract_text_from_pdf(uploaded_file):
     with fitz.open(stream=uploaded_file.read(), filetype="pdf") as doc:
         text = ""
@@ -64,6 +157,11 @@ def main():
     uploaded_file = st.file_uploader("Upload your PDF file", type="pdf")
     if uploaded_file:
         try:
             pdf_text = extract_text_from_pdf(uploaded_file)
             st.success("Text extracted successfully!")
             st.text_area("Extracted Text:", pdf_text, height=300)
@@ -87,3 +185,4 @@ def main():
 if __name__ == "__main__":
     main()

+# import chromadb
+# from chromadb.utils import embedding_functions
+# from chromadb.config import Settings
+# from sentence_transformers import SentenceTransformer
+# from transformers import pipeline
+# import streamlit as st
+# import fitz  # PyMuPDF for PDF parsing
+# # Configure ChromaDB with persistent SQLite database
+# config = Settings(
+#     persist_directory="./chromadb_data",
+#     chroma_db_impl="sqlite",
+# )
+# # Initialize persistent client with SQLite
+# def setup_chromadb():
+#     client = chromadb.PersistentClient(path="./chromadb_data")
+#     collection = client.get_or_create_collection(
+#         name="pdf_data",
+#         embedding_function=chromadb.utils.embedding_functions.SentenceTransformerEmbeddingFunction(
+#             model_name="sentence-transformers/all-MiniLM-L6-v2"
+#         ),
+#     )
+#     return client, collection
+# def extract_text_from_pdf(uploaded_file):
+#     with fitz.open(stream=uploaded_file.read(), filetype="pdf") as doc:
+#         text = ""
+#         for page in doc:
+#             text += page.get_text()
+#         return text
+# def add_pdf_text_to_db(collection, pdf_text):
+#     sentences = pdf_text.split("\n")  # Split text into lines for granularity
+#     for idx, sentence in enumerate(sentences):
+#         if sentence.strip():  # Avoid empty lines
+#             collection.add(
+#                 ids=[f"pdf_text_{idx}"],
+#                 documents=[sentence],
+#                 metadatas={"line_number": idx, "text": sentence}
+#             )
+# def query_pdf_data(collection, query, retriever_model):
+#     results = collection.query(
+#         query_texts=[query],
+#         n_results=3
+#     )
+#     context = " ".join([doc for doc in results["documents"][0]])
+#     answer = retriever_model(f"Context: {context}\nQuestion: {query}")
+#     return answer, results["metadatas"]
+# # Streamlit Interface
+# def main():
+#     st.title("PDF Chatbot with Retrieval-Augmented Generation")
+#     st.write("Upload a PDF, and ask questions about its content!")
+#     # Initialize components
+#     client, collection = setup_chromadb()
+#     retriever_model = pipeline("text2text-generation", model="google/flan-t5-small")  # Free LLM
+#     # File upload
+#     uploaded_file = st.file_uploader("Upload your PDF file", type="pdf")
+#     if uploaded_file:
+#         try:
+#             pdf_text = extract_text_from_pdf(uploaded_file)
+#             st.success("Text extracted successfully!")
+#             st.text_area("Extracted Text:", pdf_text, height=300)
+#             add_pdf_text_to_db(collection, pdf_text)
+#             st.success("PDF text has been added to the database. You can now query it!")
+#         except Exception as e:
+#             st.error(f"Error extracting text: {e}")
+#         query = st.text_input("Enter your query about the PDF:")
+#         if query:
+#             try:
+#                 answer, metadata = query_pdf_data(collection, query, retriever_model)
+#                 st.subheader("Answer:")
+#                 st.write(answer[0]['generated_text'])
+#                 st.subheader("Retrieved Context:")
+#                 for meta in metadata[0]:
+#                     st.write(meta)
+#             except Exception as e:
+#                 st.error(f"An error occurred: {str(e)}")
+# if __name__ == "__main__":
+#     main()
 import chromadb
 from chromadb.utils import embedding_functions
 from chromadb.config import Settings
 import fitz  # PyMuPDF for PDF parsing
 # Configure ChromaDB with persistent SQLite database
 config = Settings(
     persist_directory="./chromadb_data",
     )
     return client, collection
+# Clear the collection
+def clear_collection(collection):
+    collection.delete(where={})  # Delete all entries in the collection
 def extract_text_from_pdf(uploaded_file):
     with fitz.open(stream=uploaded_file.read(), filetype="pdf") as doc:
         text = ""
     uploaded_file = st.file_uploader("Upload your PDF file", type="pdf")
     if uploaded_file:
         try:
+            # Clear existing data
+            clear_collection(collection)
+            st.info("Existing data cleared from the database.")
+            # Extract and add new data
             pdf_text = extract_text_from_pdf(uploaded_file)
             st.success("Text extracted successfully!")
             st.text_area("Extracted Text:", pdf_text, height=300)
 if __name__ == "__main__":
     main()