Update app.py
app.py CHANGED
@@ -1,9 +1,9 @@
-
-
-
-
-
-
+import chromadb
+from chromadb.utils import embedding_functions
+from sentence_transformers import SentenceTransformer
+from transformers import pipeline
+import streamlit as st
+import fitz # PyMuPDF for PDF parsing
 
 # # Step 1: Setup ChromaDB
 # def setup_chromadb():
@@ -19,92 +19,98 @@
 #     collection = client.create_collection("pdf_data", embedding_function=ef)
 #     return client, collection
 
-# # Step 2: Extract Text from PDF
-# def extract_text_from_pdf(pdf_path):
-#     pdf_text = ""
-#     with fitz.open(pdf_path) as doc:
-#         for page in doc:
-#             pdf_text += page.get_text()
-#     return pdf_text
-
-# # Step 3: Add Extracted Text to Vector Database
-# def add_pdf_text_to_db(collection, pdf_text):
-#     sentences = pdf_text.split("\n") # Split text into lines for granularity
-#     for idx, sentence in enumerate(sentences):
-#         if sentence.strip(): # Avoid empty lines
-#             collection.add(
-#                 ids=[f"pdf_text_{idx}"],
-#                 documents=[sentence],
-#                 metadatas={"line_number": idx, "text": sentence}
-#             )
-
-# # Step 4: Query Function
-# def query_pdf_data(collection, query, retriever_model):
-#     results = collection.query(
-#         query_texts=[query],
-#         n_results=3
-#     )
-#     context = " ".join([doc for doc in results["documents"][0]])
-#     answer = retriever_model(f"Context: {context}\nQuestion: {query}")
-#     return answer, results["metadatas"]
-
-# # Streamlit Interface
-# def main():
-#     st.title("PDF Chatbot with Retrieval-Augmented Generation")
-#     st.write("Upload a PDF, and ask questions about its content!")
-
-#     # Initialize components
-#     client, collection = setup_chromadb()
-#     retriever_model = pipeline("text2text-generation", model="google/flan-t5-small") # Free LLM
-
-#     # File upload
-#     uploaded_file = st.file_uploader("Upload your PDF file", type="pdf")
-#     if uploaded_file:
-#         st.write("Extracting text and populating the database...")
-#         pdf_text = extract_text_from_pdf(uploaded_file)
-#         add_pdf_text_to_db(collection, pdf_text)
-#         st.success("PDF text has been added to the database. You can now query it!")
-
-#     # Query Input
-#     query = st.text_input("Enter your query about the PDF:")
-#     if query:
-#         try:
-#             answer, metadata = query_pdf_data(collection, query, retriever_model)
-#             st.subheader("Answer:")
-#             st.write(answer[0]['generated_text'])
-#             st.subheader("Retrieved Context:")
-#             for meta in metadata[0]:
-#                 st.write(meta)
-#         except Exception as e:
-#             st.error(f"An error occurred: {str(e)}")
-
-# if __name__ == "__main__":
-#     main()
-
-
-
-import streamlit as st
-from streamlit_chromadb_connection.chromadb_connection import ChromadbConnection
-
-configuration = {
-    "client": "HttpClient",
-    "host": "localhost",
-    "port": 8000,
-}
-
-conn = st.connection(name="http_connection",
-                     type=ChromadbConnection,
-                     **configuration)
-
-collection_name = "documents_collection"
-
-embedding_function_name = "DefaultEmbedding"
-conn.create_collection(collection_name=collection_name,
-                       embedding_function_name=embedding_function_name)
-
-collection_name = "documents_collection"
-conn.get_collection_data(collection_name=collection_name)
+# import chromadb
+from chromadb.config import Settings
+
+# Configure ChromaDB with persistent SQLite database
+config = Settings(
+    persist_directory="./chromadb_data",
+    chroma_db_impl="sqlite",
+)
+
+# Initialize ChromaDB client
+def setup_chromadb():
+    try:
+        client = chromadb.Client(config)
+        collections = client.list_collections()
+        print(f"Existing collections: {collections}")
+        if "pdf_data" in [c.name for c in collections]:
+            client.delete_collection("pdf_data")
+            print("Existing collection 'pdf_data' deleted.")
+        collection = client.create_collection(
+            "pdf_data",
+            embedding_function=chromadb.utils.embedding_functions.SentenceTransformerEmbeddingFunction(
+                model_name="sentence-transformers/all-MiniLM-L6-v2"
+            ),
+        )
+        return client, collection
+    except Exception as e:
+        print("Error setting up ChromaDB:", e)
+        raise e
+
+
+# Step 2: Extract Text from PDF
+def extract_text_from_pdf(pdf_path):
+    pdf_text = ""
+    with fitz.open(pdf_path) as doc:
+        for page in doc:
+            pdf_text += page.get_text()
+    return pdf_text
+
+# Step 3: Add Extracted Text to Vector Database
+def add_pdf_text_to_db(collection, pdf_text):
+    sentences = pdf_text.split("\n") # Split text into lines for granularity
+    for idx, sentence in enumerate(sentences):
+        if sentence.strip(): # Avoid empty lines
+            collection.add(
+                ids=[f"pdf_text_{idx}"],
+                documents=[sentence],
+                metadatas={"line_number": idx, "text": sentence}
+            )
+
+# Step 4: Query Function
+def query_pdf_data(collection, query, retriever_model):
+    results = collection.query(
+        query_texts=[query],
+        n_results=3
+    )
+    context = " ".join([doc for doc in results["documents"][0]])
+    answer = retriever_model(f"Context: {context}\nQuestion: {query}")
+    return answer, results["metadatas"]
+
+# Streamlit Interface
+def main():
+    st.title("PDF Chatbot with Retrieval-Augmented Generation")
+    st.write("Upload a PDF, and ask questions about its content!")
+
+    # Initialize components
+    client, collection = setup_chromadb()
+    retriever_model = pipeline("text2text-generation", model="google/flan-t5-small") # Free LLM
+
+    # File upload
+    uploaded_file = st.file_uploader("Upload your PDF file", type="pdf")
+    if uploaded_file:
+        st.write("Extracting text and populating the database...")
+        pdf_text = extract_text_from_pdf(uploaded_file)
+        add_pdf_text_to_db(collection, pdf_text)
+        st.success("PDF text has been added to the database. You can now query it!")
+
+    # Query Input
+    query = st.text_input("Enter your query about the PDF:")
+    if query:
+        try:
+            answer, metadata = query_pdf_data(collection, query, retriever_model)
+            st.subheader("Answer:")
+            st.write(answer[0]['generated_text'])
+            st.subheader("Retrieved Context:")
+            for meta in metadata[0]:
+                st.write(meta)
+        except Exception as e:
+            st.error(f"An error occurred: {str(e)}")
+
+if __name__ == "__main__":
+    main()
 
 
 # import tempfile
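A note on the ChromaDB setup added above: Settings(persist_directory=..., chroma_db_impl="sqlite") uses the legacy (pre-0.4) configuration keys, which newer chromadb releases reject as deprecated, and in some recent versions (0.6 and later) list_collections() returns collection names rather than collection objects, so the [c.name for c in collections] check can break depending on the installed version. The following is a minimal sketch of an equivalent setup, assuming chromadb 0.4 or newer; it is not the committed code, but it keeps the same storage path, collection name, and embedding model.

import chromadb
from chromadb.utils import embedding_functions

def setup_chromadb(path="./chromadb_data"):
    # PersistentClient keeps the index on disk (SQLite-backed) at the given path.
    client = chromadb.PersistentClient(path=path)
    ef = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    # get_or_create_collection is idempotent, so Streamlit reruns reuse the
    # collection instead of listing, deleting, and recreating it.
    collection = client.get_or_create_collection("pdf_data", embedding_function=ef)
    return client, collection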
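Two further hedged notes on the uncommented code. First, fitz.open(pdf_path) expects a filesystem path or an in-memory stream, while st.file_uploader returns an UploadedFile buffer; the trailing "# import tempfile" comment points at one workaround (write the upload to a temporary file), and passing the bytes through the stream parameter is another. Second, add_pdf_text_to_db calls collection.add once per line; batching the non-empty lines into a single call with parallel id, document, and metadata lists is usually faster. A minimal sketch of both, using hypothetical helper names rather than the committed ones:

import fitz # PyMuPDF

def extract_text_from_upload(uploaded_file):
    # Open the uploaded PDF from bytes instead of a filesystem path.
    with fitz.open(stream=uploaded_file.read(), filetype="pdf") as doc:
        return "".join(page.get_text() for page in doc)

def add_lines_to_db(collection, pdf_text):
    # Keep the original line numbers, skip empty lines, and add everything
    # in one batched call with parallel lists.
    items = [(i, line) for i, line in enumerate(pdf_text.split("\n")) if line.strip()]
    if items:
        collection.add(
            ids=[f"pdf_text_{i}" for i, _ in items],
            documents=[line for _, line in items],
            metadatas=[{"line_number": i} for i, _ in items],
        )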