import os
import tempfile
from typing import List

from langchain_core.documents import Document as LangchainDocument
from llama_parse import LlamaParse, ResultType

# LlamaParse API keys, tried in order; unset environment variables yield None and are skipped.
llama_parser_keys = [
    os.getenv("LLAMA_CLOUD_API_KEY_POPS"),
    os.getenv("LLAMA_CLOUD_API_KEY_PEIXE"),
]


def handle_pdf_files_from_serializer(files):
    """Save each uploaded PDF to a temporary file and return the list of paths."""
    listaPDFs = []
    for file in files:
        file.seek(0)
        # Create a temporary file to hold the uploaded PDF
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
            # Write the uploaded file content to the temporary file
            for chunk in file.chunks():
                temp_file.write(chunk)
            temp_file_path = temp_file.name  # Path of the temporary file
            listaPDFs.append(temp_file_path)
    print("\n\nlistaPDFs: ", listaPDFs)
    return listaPDFs


def remove_pdf_temp_files(listaPDFs):
    """Delete the temporary PDF files created by handle_pdf_files_from_serializer."""
    for file in listaPDFs:
        os.remove(file)


async def return_document_list_with_llama_parser(file: str):
    """Parse a PDF with LlamaParse and return its pages as LangChain documents.

    Each configured API key is tried in order until one succeeds.
    """
    for key in llama_parser_keys:
        documents: List[LangchainDocument] = []
        if key:
            parser = LlamaParse(
                api_key=key,
                result_type=ResultType.JSON,  # Options: 'text', 'markdown', 'json', 'structured'
                language="pt",
                verbose=True,
            )

            try:
                parsed_document = await parser.aget_json(file)
            except Exception:
                print(f"Error with llama parser key ending with {key[-4:]}")
                continue  # Move on to the next key
            if len(parsed_document) == 0:
                continue

            for doc in parsed_document[0].get("pages"):  # type: ignore
                # documents.append(doc.to_langchain_format())

                langchain_document = LangchainDocument(
                    page_content=doc.get("md"),  # type: ignore
                    metadata={
                        "page": doc.get("page"),  # type: ignore
                        # **doc.get("metadata", {}),  # type: ignore
                    },  # Include page number in metadata
                )

                documents.append(langchain_document)

            return documents

    # Only reached if the loop above finishes without ever returning a value
    raise ValueError("Something went wrong in the LlamaParse parser: no key produced a result.")
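

# --- Usage sketch (assumption, not part of the original flow) ---
# A minimal illustration of how the helpers above might be chained: write the
# uploaded PDFs to temp files, parse each one with LlamaParse, and always
# remove the temp files afterwards. The function name `process_uploaded_pdfs`
# and the shape of `files` (Django upload objects exposing `.seek()` and
# `.chunks()`, e.g. from a DRF serializer) are assumptions for this sketch.
async def process_uploaded_pdfs(files) -> List[LangchainDocument]:
    listaPDFs = handle_pdf_files_from_serializer(files)
    try:
        all_documents: List[LangchainDocument] = []
        for pdf_path in listaPDFs:
            # Each call tries the configured LlamaParse keys in order.
            all_documents.extend(await return_document_list_with_llama_parser(pdf_path))
        return all_documents
    finally:
        # Temp files are deleted even if parsing raises.
        remove_pdf_temp_files(listaPDFs)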