Spaces:
Sleeping
Sleeping
DEBUG: pdfminer
Browse files
- app.py +9 -7
- requirements.txt +3 -1
app.py
CHANGED
@@ -1,13 +1,13 @@
|
|
1 |
import io
|
2 |
import os
|
3 |
-
|
4 |
from functions import *
|
5 |
-
from
|
6 |
import pandas as pd
|
7 |
from fastapi import FastAPI, File, UploadFile
|
8 |
from pydantic import BaseModel
|
9 |
from fastapi.middleware.cors import CORSMiddleware
|
10 |
-
from langchain_community.document_loaders import
|
11 |
from src.api.speech_api import speech_translator_router
|
12 |
from functions import client as supabase
|
13 |
from urllib.parse import urlparse
|
@@ -153,10 +153,12 @@ async def newChatbot(chatbotName: str, username: str):
|
|
153 |
async def addPDFData(vectorstore: str, pdf: UploadFile = File(...)):
|
154 |
source = pdf.filename
|
155 |
pdf = await pdf.read()
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
|
|
|
|
160 |
username, chatbotname = vectorstore.split("$")[1], vectorstore.split("$")[2]
|
161 |
df = pd.DataFrame(client.table("ConversAI_ChatbotInfo").select("*").execute().data)
|
162 |
currentCount = df[(df["user_id"] == username) & (df["chatbotname"] == chatbotname)]["charactercount"].iloc[0]
|
|
|
1 |
import io
|
2 |
import os
|
3 |
+
import tempfile
|
4 |
from functions import *
|
5 |
+
from langchain_community.document_loaders import PDFMinerLoader
|
6 |
import pandas as pd
|
7 |
from fastapi import FastAPI, File, UploadFile
|
8 |
from pydantic import BaseModel
|
9 |
from fastapi.middleware.cors import CORSMiddleware
|
10 |
+
from langchain_community.document_loaders import WebBaseLoader
|
11 |
from src.api.speech_api import speech_translator_router
|
12 |
from functions import client as supabase
|
13 |
from urllib.parse import urlparse
|
|
|
153 |
async def addPDFData(vectorstore: str, pdf: UploadFile = File(...)):
|
154 |
source = pdf.filename
|
155 |
pdf = await pdf.read()
|
156 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
|
157 |
+
temp_file.write(pdf)
|
158 |
+
temp_file_path = temp_file.name
|
159 |
+
loader = PDFMinerLoader(file_path = temp_file_path, concatenate_pages = True)
|
160 |
+
text = loader.load()[0].page_content
|
161 |
+
os.remove(temp_file_path)
|
162 |
username, chatbotname = vectorstore.split("$")[1], vectorstore.split("$")[2]
|
163 |
df = pd.DataFrame(client.table("ConversAI_ChatbotInfo").select("*").execute().data)
|
164 |
currentCount = df[(df["user_id"] == username) & (df["chatbotname"] == chatbotname)]["charactercount"].iloc[0]
|
requirements.txt
CHANGED
@@ -70,6 +70,8 @@ websockets==12.0
|
|
70 |
bs4
|
71 |
huggingface-hub
|
72 |
fastembed-gpu
|
|
|
|
|
73 |
flashrank
|
74 |
langchain
|
75 |
langchain-community
|
@@ -78,7 +80,7 @@ langchain-huggingface
|
|
78 |
langchain-qdrant
|
79 |
langchain-groq
|
80 |
lxml
|
81 |
-
|
82 |
python-dotenv
|
83 |
pillow
|
84 |
pandas
|
|
|
70 |
bs4
|
71 |
huggingface-hub
|
72 |
fastembed-gpu
|
73 |
+
nest_asyncio
|
74 |
+
beautifulsoup4
|
75 |
flashrank
|
76 |
langchain
|
77 |
langchain-community
|
|
|
80 |
langchain-qdrant
|
81 |
langchain-groq
|
82 |
lxml
|
83 |
+
pdfminer.six
|
84 |
python-dotenv
|
85 |
pillow
|
86 |
pandas
|