Spaces:
Running
Running
luanpoppe
committed on
Commit
·
39fc36b
1
Parent(s):
78209bc
feat: começando a adicionar testes com pytest
Browse files
- _utils/handle_files.py +5 -6
- requirements.txt +0 -0
- tests/bubble_integrations/test_obter_arquivo.py +28 -0
- tests/conftest.py +9 -0
- tests/fixtures/_pdf-uma-pagina.pdf +0 -0
- tests/test_handle_files.py +28 -0
- tests/test_splitters.py +82 -0
_utils/handle_files.py
CHANGED
@@ -4,6 +4,11 @@ from langchain_core.documents import Document as LangchainDocument
|
|
4 |
from llama_index import Document
|
5 |
from llama_parse import LlamaParse, ResultType
|
6 |
|
|
|
|
|
|
|
|
|
|
|
7 |
|
8 |
def handle_pdf_files_from_serializer(files):
|
9 |
listaPDFs = []
|
@@ -28,11 +33,6 @@ def remove_pdf_temp_files(listaPDFs):
|
|
28 |
|
29 |
|
30 |
async def return_document_list_with_llama_parser(file: str):
|
31 |
-
llama_parser_keys = [
|
32 |
-
os.getenv("LLAMA_CLOUD_API_KEY_POPS"),
|
33 |
-
os.getenv("LLAMA_CLOUD_API_KEY_PEIXE"),
|
34 |
-
]
|
35 |
-
|
36 |
for key in llama_parser_keys:
|
37 |
documents: List[LangchainDocument] = []
|
38 |
if key:
|
@@ -48,7 +48,6 @@ async def return_document_list_with_llama_parser(file: str):
|
|
48 |
except:
|
49 |
print(f"Error with llama parser key ending with {key[-4:]}")
|
50 |
continue # Faz com que comece o próximo loop
|
51 |
-
print("parsed_document: ", parsed_document)
|
52 |
if len(parsed_document) == 0:
|
53 |
continue
|
54 |
|
|
|
4 |
from llama_index import Document
|
5 |
from llama_parse import LlamaParse, ResultType
|
6 |
|
7 |
+
llama_parser_keys = [
|
8 |
+
os.getenv("LLAMA_CLOUD_API_KEY_POPS"),
|
9 |
+
os.getenv("LLAMA_CLOUD_API_KEY_PEIXE"),
|
10 |
+
]
|
11 |
+
|
12 |
|
13 |
def handle_pdf_files_from_serializer(files):
|
14 |
listaPDFs = []
|
|
|
33 |
|
34 |
|
35 |
async def return_document_list_with_llama_parser(file: str):
|
|
|
|
|
|
|
|
|
|
|
36 |
for key in llama_parser_keys:
|
37 |
documents: List[LangchainDocument] = []
|
38 |
if key:
|
|
|
48 |
except:
|
49 |
print(f"Error with llama parser key ending with {key[-4:]}")
|
50 |
continue # Faz com que comece o próximo loop
|
|
|
51 |
if len(parsed_document) == 0:
|
52 |
continue
|
53 |
|
requirements.txt
CHANGED
Binary files a/requirements.txt and b/requirements.txt differ
|
|
tests/bubble_integrations/test_obter_arquivo.py
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pytest
|
2 |
+
from _utils.bubble_integrations.obter_arquivo import get_pdf_from_bubble
|
3 |
+
from langchain_core.documents import Document
|
4 |
+
|
5 |
+
pdf_file_url = "https://vella.app.br/version-test/fileupload/f1736298232170x993758712541722200/0002269-86.2009.8.05.0032%20processo%20teste.pdf"
|
6 |
+
|
7 |
+
|
8 |
+
class TestObterArquivo:
|
9 |
+
@pytest.mark.asyncio
|
10 |
+
async def test_get_pdf_from_bubble_No_llama_parse(self):
|
11 |
+
should_use_llama_parse = False
|
12 |
+
result = await get_pdf_from_bubble(pdf_file_url, should_use_llama_parse)
|
13 |
+
|
14 |
+
assert isinstance(result, list)
|
15 |
+
assert len(result) > 0
|
16 |
+
print("\n\nresult", result)
|
17 |
+
assert all(isinstance(item, Document) for item in result)
|
18 |
+
|
19 |
+
# Teste abaixo não funciona com arquivos grandes -> O Llama Parse dá erro de timeout
|
20 |
+
# @pytest.mark.asyncio
|
21 |
+
# async def test_get_pdf_from_bubble_With_llama_parse(self):
|
22 |
+
# should_use_llama_parse = True
|
23 |
+
# result = await get_pdf_from_bubble(pdf_file_url, should_use_llama_parse)
|
24 |
+
|
25 |
+
# assert isinstance(result, list)
|
26 |
+
# assert len(result) > 0
|
27 |
+
# print("\n\nresult", result)
|
28 |
+
# assert all(isinstance(item, Document) for item in result)
|
tests/conftest.py
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import sys
|
2 |
+
import os
|
3 |
+
import django
|
4 |
+
|
5 |
+
# Configura o Django
|
6 |
+
# Adiciona o diretório raiz do projeto ao sys.path
|
7 |
+
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
|
8 |
+
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "setup.settings")
|
9 |
+
django.setup()
|
tests/fixtures/_pdf-uma-pagina.pdf
ADDED
Binary file (26.6 kB). View file
|
|
tests/test_handle_files.py
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pytest
|
2 |
+
import os
|
3 |
+
from langchain_core.documents import Document
|
4 |
+
|
5 |
+
from _utils.handle_files import return_document_list_with_llama_parser
|
6 |
+
|
7 |
+
cwd = os.getcwd()
|
8 |
+
pdf_file_url = os.path.join(cwd, "tests", "fixtures", "_pdf-uma-pagina.pdf")
|
9 |
+
|
10 |
+
|
11 |
+
class TestHandleFiles:
|
12 |
+
@pytest.mark.asyncio
|
13 |
+
async def test_return_document_list_with_llama_parser_With_wrong_keys(
|
14 |
+
self, monkeypatch
|
15 |
+
):
|
16 |
+
|
17 |
+
monkeypatch.setattr(
|
18 |
+
"_utils.handle_files.llama_parser_keys",
|
19 |
+
["abc", os.getenv("LLAMA_CLOUD_API_KEY_PEIXE")],
|
20 |
+
)
|
21 |
+
|
22 |
+
result = await return_document_list_with_llama_parser(pdf_file_url)
|
23 |
+
|
24 |
+
assert isinstance(result, list)
|
25 |
+
assert len(result) > 0
|
26 |
+
assert all(isinstance(item, Document) for item in result)
|
27 |
+
assert all(len(item.page_content) > 0 for item in result)
|
28 |
+
assert all(int(item.metadata.get("page", 0)) > 0 for item in result)
|
tests/test_splitters.py
ADDED
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pytest
|
2 |
+
import os
|
3 |
+
from _utils.splitters.Splitter_class import Splitter
|
4 |
+
from _utils.models.gerar_relatorio import (
|
5 |
+
DocumentChunk,
|
6 |
+
)
|
7 |
+
|
8 |
+
base_dir = os.path.dirname(os.path.abspath(__file__))
|
9 |
+
chunk_size = 1000
|
10 |
+
chunk_overlap = 200
|
11 |
+
cwd = os.getcwd()
|
12 |
+
pdf_file = os.path.join(cwd, "tests", "fixtures", "_pdf-uma-pagina.pdf")
|
13 |
+
|
14 |
+
|
15 |
+
class TestSplitters:
|
16 |
+
splitter = Splitter(chunk_size, chunk_overlap)
|
17 |
+
|
18 |
+
@pytest.mark.asyncio
|
19 |
+
async def test_load_and_split_document_No_llama_parse_No_Bubble(self, monkeypatch):
|
20 |
+
should_use_llama_parse = False
|
21 |
+
isBubble = False
|
22 |
+
|
23 |
+
result_chunks, result_strings = await self.splitter.load_and_split_document(
|
24 |
+
pdf_file, should_use_llama_parse, isBubble
|
25 |
+
)
|
26 |
+
|
27 |
+
assert isinstance(result_chunks, list)
|
28 |
+
assert isinstance(result_strings, list)
|
29 |
+
assert len(result_chunks) > 0
|
30 |
+
assert len(result_strings) > 0
|
31 |
+
assert all(isinstance(item, str) for item in result_strings)
|
32 |
+
assert all(isinstance(item, DocumentChunk) for item in result_chunks)
|
33 |
+
assert all(
|
34 |
+
(chunk_size - 100) < len(item.content) < (chunk_size + 100)
|
35 |
+
for item in result_chunks
|
36 |
+
)
|
37 |
+
|
38 |
+
@pytest.mark.asyncio
|
39 |
+
async def test_load_and_split_document_No_llama_parse_No_Bubble_with_bigger_chunk(
|
40 |
+
self, monkeypatch
|
41 |
+
):
|
42 |
+
should_use_llama_parse = False
|
43 |
+
isBubble = False
|
44 |
+
chunk_size = 3500
|
45 |
+
splitter_temp = Splitter(chunk_size, chunk_overlap)
|
46 |
+
|
47 |
+
result_chunks, result_strings = await splitter_temp.load_and_split_document(
|
48 |
+
pdf_file, should_use_llama_parse, isBubble
|
49 |
+
)
|
50 |
+
|
51 |
+
assert isinstance(result_chunks, list)
|
52 |
+
assert isinstance(result_strings, list)
|
53 |
+
assert len(result_chunks) > 0
|
54 |
+
assert len(result_strings) > 0
|
55 |
+
assert all(isinstance(item, str) for item in result_strings)
|
56 |
+
assert all(isinstance(item, DocumentChunk) for item in result_chunks)
|
57 |
+
assert all(
|
58 |
+
(chunk_size - 200) < len(item.content) < (chunk_size + 200)
|
59 |
+
for item in result_chunks
|
60 |
+
)
|
61 |
+
|
62 |
+
@pytest.mark.asyncio
|
63 |
+
async def test_load_and_split_document_With_llama_parse_No_Bubble(
|
64 |
+
self, monkeypatch
|
65 |
+
):
|
66 |
+
should_use_llama_parse = True
|
67 |
+
isBubble = False
|
68 |
+
result_chunks, result_strings = await self.splitter.load_and_split_document(
|
69 |
+
pdf_file, should_use_llama_parse, isBubble
|
70 |
+
)
|
71 |
+
|
72 |
+
assert isinstance(result_chunks, list)
|
73 |
+
assert isinstance(result_strings, list)
|
74 |
+
assert len(result_chunks) > 0
|
75 |
+
assert len(result_strings) > 0
|
76 |
+
assert all(isinstance(item, str) for item in result_strings)
|
77 |
+
assert all(isinstance(item, DocumentChunk) for item in result_chunks)
|
78 |
+
# Teste abaixo não passa ainda --> Será consertado no futuro
|
79 |
+
# assert all(
|
80 |
+
# (chunk_size - 100) < len(item.content) < (chunk_size + 100)
|
81 |
+
# for item in result_chunks
|
82 |
+
# )
|