ashok2216 committed on
Commit
7e364b6
·
verified ·
1 Parent(s): c7fe08b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +97 -91
app.py CHANGED
@@ -1,9 +1,9 @@
1
- # import chromadb
2
- # from chromadb.utils import embedding_functions
3
- # from sentence_transformers import SentenceTransformer
4
- # from transformers import pipeline
5
- # import streamlit as st
6
- # import fitz # PyMuPDF for PDF parsing
7
 
8
  # # Step 1: Setup ChromaDB
9
  # def setup_chromadb():
@@ -19,92 +19,98 @@
19
  # collection = client.create_collection("pdf_data", embedding_function=ef)
20
  # return client, collection
21
 
22
- # # Step 2: Extract Text from PDF
23
- # def extract_text_from_pdf(pdf_path):
24
- # pdf_text = ""
25
- # with fitz.open(pdf_path) as doc:
26
- # for page in doc:
27
- # pdf_text += page.get_text()
28
- # return pdf_text
29
-
30
- # # Step 3: Add Extracted Text to Vector Database
31
- # def add_pdf_text_to_db(collection, pdf_text):
32
- # sentences = pdf_text.split("\n") # Split text into lines for granularity
33
- # for idx, sentence in enumerate(sentences):
34
- # if sentence.strip(): # Avoid empty lines
35
- # collection.add(
36
- # ids=[f"pdf_text_{idx}"],
37
- # documents=[sentence],
38
- # metadatas={"line_number": idx, "text": sentence}
39
- # )
40
-
41
- # # Step 4: Query Function
42
- # def query_pdf_data(collection, query, retriever_model):
43
- # results = collection.query(
44
- # query_texts=[query],
45
- # n_results=3
46
- # )
47
- # context = " ".join([doc for doc in results["documents"][0]])
48
- # answer = retriever_model(f"Context: {context}\nQuestion: {query}")
49
- # return answer, results["metadatas"]
50
-
51
- # # Streamlit Interface
52
- # def main():
53
- # st.title("PDF Chatbot with Retrieval-Augmented Generation")
54
- # st.write("Upload a PDF, and ask questions about its content!")
55
-
56
- # # Initialize components
57
- # client, collection = setup_chromadb()
58
- # retriever_model = pipeline("text2text-generation", model="google/flan-t5-small") # Free LLM
59
-
60
- # # File upload
61
- # uploaded_file = st.file_uploader("Upload your PDF file", type="pdf")
62
- # if uploaded_file:
63
- # st.write("Extracting text and populating the database...")
64
- # pdf_text = extract_text_from_pdf(uploaded_file)
65
- # add_pdf_text_to_db(collection, pdf_text)
66
- # st.success("PDF text has been added to the database. You can now query it!")
67
-
68
- # # Query Input
69
- # query = st.text_input("Enter your query about the PDF:")
70
- # if query:
71
- # try:
72
- # answer, metadata = query_pdf_data(collection, query, retriever_model)
73
- # st.subheader("Answer:")
74
- # st.write(answer[0]['generated_text'])
75
- # st.subheader("Retrieved Context:")
76
- # for meta in metadata[0]:
77
- # st.write(meta)
78
- # except Exception as e:
79
- # st.error(f"An error occurred: {str(e)}")
80
-
81
- # if __name__ == "__main__":
82
- # main()
83
-
84
-
85
-
86
- import streamlit as st
87
- from streamlit_chromadb_connection.chromadb_connection import ChromadbConnection
88
-
89
- configuration = {
90
- "client": "HttpClient",
91
- "host": "localhost",
92
- "port": 8000,
93
- }
94
-
95
- conn = st.connection(name="http_connection",
96
- type=ChromadbConnection,
97
- **configuration)
98
-
99
- collection_name = "documents_collection"
100
-
101
- embedding_function_name = "DefaultEmbedding"
102
- conn.create_collection(collection_name=collection_name,
103
- embedding_function_name=embedding_function_name)
104
-
105
- collection_name = "documents_collection"
106
- conn.get_collection_data(collection_name=collection_name)
107
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
 
109
 
110
  # import tempfile
 
1
+ import chromadb
2
+ from chromadb.utils import embedding_functions
3
+ from sentence_transformers import SentenceTransformer
4
+ from transformers import pipeline
5
+ import streamlit as st
6
+ import fitz # PyMuPDF for PDF parsing
7
 
8
  # # Step 1: Setup ChromaDB
9
  # def setup_chromadb():
 
19
  # collection = client.create_collection("pdf_data", embedding_function=ef)
20
  # return client, collection
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
+ # import chromadb
24
+ from chromadb.config import Settings
25
+
26
+ # Configure ChromaDB with persistent SQLite database
27
# Settings for an on-disk ChromaDB store under ./chromadb_data.
# The previous ``chroma_db_impl="sqlite"`` value was not a valid backend
# selector; per Chroma's migration notes the ``chroma_db_impl`` key is a legacy
# option that current releases reject. Persistence is requested with
# ``is_persistent=True`` plus ``persist_directory`` instead.
# NOTE(review): confirm against the chromadb version pinned for this app.
config = Settings(
    persist_directory="./chromadb_data",
    is_persistent=True,
)
31
+
32
+ # Initialize ChromaDB client
33
def setup_chromadb():
    """Create a fresh ``pdf_data`` collection and return it with its client.

    A client is built from the module-level ``config``. If a collection named
    ``pdf_data`` already exists it is deleted first, so every call hands back
    an empty collection that embeds documents with the
    ``sentence-transformers/all-MiniLM-L6-v2`` model.

    Returns:
        A ``(client, collection)`` tuple.

    Raises:
        Exception: any error raised while talking to ChromaDB; it is printed
            and then re-raised unchanged.
    """
    try:
        client = chromadb.Client(config)
        collections = client.list_collections()
        print(f"Existing collections: {collections}")
        # list_collections() yields Collection objects in some Chroma releases
        # and plain collection names in others; accept both shapes.
        existing_names = {
            entry if isinstance(entry, str) else entry.name
            for entry in collections
        }
        if "pdf_data" in existing_names:
            client.delete_collection("pdf_data")
            print("Existing collection 'pdf_data' deleted.")
        collection = client.create_collection(
            "pdf_data",
            embedding_function=chromadb.utils.embedding_functions.SentenceTransformerEmbeddingFunction(
                model_name="sentence-transformers/all-MiniLM-L6-v2"
            ),
        )
        return client, collection
    except Exception as e:
        print("Error setting up ChromaDB:", e)
        # Bare raise keeps the original traceback intact.
        raise
51
+
52
+
53
+ # Step 2: Extract Text from PDF
54
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_path: Either a filesystem path to a PDF, or a file-like object
            holding PDF bytes (for example the object returned by
            ``st.file_uploader``).

    Returns:
        The text of all pages joined in page order, with no separator added
        between pages.
    """
    if hasattr(pdf_path, "getvalue") or hasattr(pdf_path, "read"):
        # An in-memory upload has no file on disk, so hand PyMuPDF the raw
        # bytes. getvalue() is preferred because it ignores the current read
        # position, which may already be at EOF after an earlier read.
        if hasattr(pdf_path, "getvalue"):
            pdf_bytes = pdf_path.getvalue()
        else:
            pdf_bytes = pdf_path.read()
        document = fitz.open(stream=pdf_bytes, filetype="pdf")
    else:
        document = fitz.open(pdf_path)

    with document as doc:
        return "".join(page.get_text() for page in doc)
60
+
61
+ # Step 3: Add Extracted Text to Vector Database
62
def add_pdf_text_to_db(collection, pdf_text, batch_size=500):
    """Store each non-blank line of ``pdf_text`` as a document in ``collection``.

    The text is split on newlines. Every line that is not empty or
    whitespace-only is added with id ``pdf_text_<line index>`` and metadata
    carrying the line index and the line itself. Lines are sent in batches so
    the collection is called once per batch rather than once per line.

    Args:
        collection: Object exposing ``add(ids=..., documents=..., metadatas=...)``.
        pdf_text: Full text to index.
        batch_size: Maximum number of lines per ``add`` call; must be >= 1.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Keep the original line index so ids stay stable even though blank
    # lines are skipped.
    entries = [
        (idx, line)
        for idx, line in enumerate(pdf_text.split("\n"))
        if line.strip()
    ]

    for start in range(0, len(entries), batch_size):
        batch = entries[start:start + batch_size]
        collection.add(
            ids=[f"pdf_text_{idx}" for idx, _ in batch],
            documents=[line for _, line in batch],
            metadatas=[{"line_number": idx, "text": line} for idx, line in batch],
        )
71
+
72
+ # Step 4: Query Function
73
def query_pdf_data(collection, query, retriever_model, n_results=3):
    """Answer ``query`` using the closest stored lines as context.

    Args:
        collection: Object exposing ``query(query_texts=..., n_results=...)``
            that returns a mapping with ``"documents"`` and ``"metadatas"``.
        query: The user's question.
        retriever_model: Callable invoked with the assembled prompt string.
        n_results: How many matches to request from the collection.

    Returns:
        A tuple ``(answer, metadatas)`` where ``answer`` is whatever
        ``retriever_model`` returned and ``metadatas`` is the
        ``"metadatas"`` entry of the query result.
    """
    results = collection.query(
        query_texts=[query],
        n_results=n_results,
    )
    # Only one query text is sent, so its matches are the first entry.
    context = " ".join(results["documents"][0])
    answer = retriever_model(f"Context: {context}\nQuestion: {query}")
    return answer, results["metadatas"]
81
+
82
+ # Streamlit Interface
83
@st.cache_resource
def _load_answer_model():
    """Build the flan-t5 text2text pipeline; cached so reruns reuse it."""
    return pipeline("text2text-generation", model="google/flan-t5-small")  # Free LLM


def main():
    """Render the Streamlit page: upload a PDF, index it, and answer queries.

    Streamlit re-executes this function on every widget interaction, so the
    language model is obtained through a cached loader rather than rebuilt
    each time. Failures while indexing the upload or answering a query are
    shown in the page with ``st.error``.
    """
    st.title("PDF Chatbot with Retrieval-Augmented Generation")
    st.write("Upload a PDF, and ask questions about its content!")

    # Initialize components
    client, collection = setup_chromadb()
    retriever_model = _load_answer_model()

    # File upload
    uploaded_file = st.file_uploader("Upload your PDF file", type="pdf")
    if uploaded_file:
        st.write("Extracting text and populating the database...")
        try:
            pdf_text = extract_text_from_pdf(uploaded_file)
            add_pdf_text_to_db(collection, pdf_text)
        except Exception as e:
            # Report ingestion problems the same way query problems are reported.
            st.error(f"An error occurred: {str(e)}")
        else:
            st.success("PDF text has been added to the database. You can now query it!")

    # Query Input
    query = st.text_input("Enter your query about the PDF:")
    if query:
        try:
            answer, metadata = query_pdf_data(collection, query, retriever_model)
            st.subheader("Answer:")
            st.write(answer[0]['generated_text'])
            st.subheader("Retrieved Context:")
            for meta in metadata[0]:
                st.write(meta)
        except Exception as e:
            st.error(f"An error occurred: {str(e)}")


if __name__ == "__main__":
    main()
114
 
115
 
116
  # import tempfile