Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -66,12 +66,19 @@ def setup_chromadb():
|
|
66 |
|
67 |
|
68 |
# Step 2: Extract Text from PDF
|
69 |
-
def extract_text_from_pdf(pdf_path):
|
70 |
-
|
71 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
72 |
for page in doc:
|
73 |
-
|
74 |
-
|
75 |
|
76 |
# Step 3: Add Extracted Text to Vector Database
|
77 |
def add_pdf_text_to_db(collection, pdf_text):
|
@@ -106,23 +113,30 @@ def main():
|
|
106 |
# File upload
|
107 |
uploaded_file = st.file_uploader("Upload your PDF file", type="pdf")
|
108 |
if uploaded_file:
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
126 |
|
127 |
if __name__ == "__main__":
|
128 |
main()
|
|
|
66 |
|
67 |
|
68 |
# Step 2: Extract Text from PDF
|
69 |
+
# def extract_text_from_pdf(pdf_path):
|
70 |
+
# pdf_text = ""
|
71 |
+
# with fitz.open(pdf_path) as doc:
|
72 |
+
# for page in doc:
|
73 |
+
# pdf_text += page.get_text()
|
74 |
+
# return pdf_text
|
75 |
+
|
76 |
+
def extract_text_from_pdf(uploaded_file):
|
77 |
+
with fitz.open(stream=uploaded_file.read(), filetype="pdf") as doc:
|
78 |
+
text = ""
|
79 |
for page in doc:
|
80 |
+
text += page.get_text()
|
81 |
+
return text
|
82 |
|
83 |
# Step 3: Add Extracted Text to Vector Database
|
84 |
def add_pdf_text_to_db(collection, pdf_text):
|
|
|
113 |
# File upload
|
114 |
uploaded_file = st.file_uploader("Upload your PDF file", type="pdf")
|
115 |
if uploaded_file:
|
116 |
+
try:
|
117 |
+
pdf_text = extract_text_from_pdf(uploaded_file)
|
118 |
+
st.success("Text extracted successfully!")
|
119 |
+
st.text_area("Extracted Text:", pdf_text, height=300)
|
120 |
+
except Exception as e:
|
121 |
+
st.error(f"Error extracting text: {e}")
|
122 |
+
# if uploaded_file:
|
123 |
+
# st.write("Extracting text and populating the database...")
|
124 |
+
# pdf_text = extract_text_from_pdf(uploaded_file)
|
125 |
+
# add_pdf_text_to_db(collection, pdf_text)
|
126 |
+
# st.success("PDF text has been added to the database. You can now query it!")
|
127 |
+
|
128 |
+
# # Query Input
|
129 |
+
# query = st.text_input("Enter your query about the PDF:")
|
130 |
+
# if query:
|
131 |
+
# try:
|
132 |
+
# answer, metadata = query_pdf_data(collection, query, retriever_model)
|
133 |
+
# st.subheader("Answer:")
|
134 |
+
# st.write(answer[0]['generated_text'])
|
135 |
+
# st.subheader("Retrieved Context:")
|
136 |
+
# for meta in metadata[0]:
|
137 |
+
# st.write(meta)
|
138 |
+
# except Exception as e:
|
139 |
+
# st.error(f"An error occurred: {str(e)}")
|
140 |
|
141 |
if __name__ == "__main__":
|
142 |
main()
|