Spaces:

ashok2216
/

pdf-chatbot

Sleeping

App Files Community

ashok2216 committed on Nov 19, 2024

Commit

e4a6244

verified ·

1 Parent(s): 8d71f5d

Update app.py

Browse files

Files changed (1) hide show

app.py +1 -90

app.py CHANGED Viewed

@@ -1,93 +1,3 @@
-# import chromadb
-# from chromadb.utils import embedding_functions
-# from chromadb.config import Settings
-# from sentence_transformers import SentenceTransformer
-# from transformers import pipeline
-# import streamlit as st
-# import fitz  # PyMuPDF for PDF parsing
-# # Configure ChromaDB with persistent SQLite database
-# config = Settings(
-#     persist_directory="./chromadb_data",
-#     chroma_db_impl="sqlite",
-# )
-# # Initialize persistent client with SQLite
-# def setup_chromadb():
-#     client = chromadb.PersistentClient(path="./chromadb_data")
-#     collection = client.get_or_create_collection(
-#         name="pdf_data",
-#         embedding_function=chromadb.utils.embedding_functions.SentenceTransformerEmbeddingFunction(
-#             model_name="sentence-transformers/all-MiniLM-L6-v2"
-#         ),
-#     )
-#     return client, collection
-# def extract_text_from_pdf(uploaded_file):
-#     with fitz.open(stream=uploaded_file.read(), filetype="pdf") as doc:
-#         text = ""
-#         for page in doc:
-#             text += page.get_text()
-#         return text
-# def add_pdf_text_to_db(collection, pdf_text):
-#     sentences = pdf_text.split("\n")  # Split text into lines for granularity
-#     for idx, sentence in enumerate(sentences):
-#         if sentence.strip():  # Avoid empty lines
-#             collection.add(
-#                 ids=[f"pdf_text_{idx}"],
-#                 documents=[sentence],
-#                 metadatas={"line_number": idx, "text": sentence}
-#             )
-# def query_pdf_data(collection, query, retriever_model):
-#     results = collection.query(
-#         query_texts=[query],
-#         n_results=3
-#     )
-#     context = " ".join([doc for doc in results["documents"][0]])
-#     answer = retriever_model(f"Context: {context}\nQuestion: {query}")
-#     return answer, results["metadatas"]
-# # Streamlit Interface
-# def main():
-#     st.title("PDF Chatbot with Retrieval-Augmented Generation")
-#     st.write("Upload a PDF, and ask questions about its content!")
-#     # Initialize components
-#     client, collection = setup_chromadb()
-#     retriever_model = pipeline("text2text-generation", model="google/flan-t5-small")  # Free LLM
-#     # File upload
-#     uploaded_file = st.file_uploader("Upload your PDF file", type="pdf")
-#     if uploaded_file:
-#         try:
-#             pdf_text = extract_text_from_pdf(uploaded_file)
-#             st.success("Text extracted successfully!")
-#             st.text_area("Extracted Text:", pdf_text, height=300)
-#             add_pdf_text_to_db(collection, pdf_text)
-#             st.success("PDF text has been added to the database. You can now query it!")
-#         except Exception as e:
-#             st.error(f"Error extracting text: {e}")
-#         query = st.text_input("Enter your query about the PDF:")
-#         if query:
-#             try:
-#                 answer, metadata = query_pdf_data(collection, query, retriever_model)
-#                 st.subheader("Answer:")
-#                 st.write(answer[0]['generated_text'])
-#                 st.subheader("Retrieved Context:")
-#                 for meta in metadata[0]:
-#                     st.write(meta)
-#             except Exception as e:
-#                 st.error(f"An error occurred: {str(e)}")
-# if __name__ == "__main__":
-#     main()
 import chromadb
 from chromadb.utils import embedding_functions
 from chromadb.config import Settings
@@ -173,6 +83,7 @@ def main():
             st.text_area("Extracted Text:", pdf_text, height=300)
             add_pdf_text_to_db(collection, pdf_text)
             st.success("PDF text has been added to the database. You can now query it!")
         except Exception as e:
             st.error(f"Error extracting text: {e}")

 import chromadb
 from chromadb.utils import embedding_functions
 from chromadb.config import Settings
             st.text_area("Extracted Text:", pdf_text, height=300)
             add_pdf_text_to_db(collection, pdf_text)
             st.success("PDF text has been added to the database. You can now query it!")
         except Exception as e:
             st.error(f"Error extracting text: {e}")