import os
import tempfile
from typing import List

from langchain_core.documents import Document as LangchainDocument
from llama_parse import LlamaParse, ResultType

# LlamaParse API keys, tried in order; unset environment variables yield None and are skipped.
llama_parser_keys = [
    os.getenv("LLAMA_CLOUD_API_KEY_POPS"),
    os.getenv("LLAMA_CLOUD_API_KEY_PEIXE"),
]


def handle_pdf_files_from_serializer(files):
    """Save each uploaded PDF to a temporary file and return the list of paths."""
    listaPDFs = []
    for file in files:
        file.seek(0)
        # Create a temporary file to hold the uploaded PDF
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
            # Write the uploaded file content to the temporary file
            for chunk in file.chunks():
                temp_file.write(chunk)
            temp_file_path = temp_file.name  # Path of the temporary file
            listaPDFs.append(temp_file_path)
    print("\n\nlistaPDFs: ", listaPDFs)
    return listaPDFs


def remove_pdf_temp_files(listaPDFs):
    """Delete the temporary PDF files created by handle_pdf_files_from_serializer."""
    for file in listaPDFs:
        os.remove(file)


async def return_document_list_with_llama_parser(file: str):
    """Parse a PDF with LlamaParse and return its pages as LangChain documents.

    Each configured API key is tried in order until one succeeds.
    """
    for key in llama_parser_keys:
        documents: List[LangchainDocument] = []
        if key:
            parser = LlamaParse(
                api_key=key,
                result_type=ResultType.JSON,  # Options: 'text', 'markdown', 'json', 'structured'
                language="pt",
                verbose=True,
            )

            try:
                parsed_document = await parser.aget_json(file)
            except Exception:
                print(f"Error with llama parser key ending with {key[-4:]}")
                continue  # Move on to the next key
            if len(parsed_document) == 0:
                continue

            for doc in parsed_document[0].get("pages"):  # type: ignore
                # documents.append(doc.to_langchain_format())

                langchain_document = LangchainDocument(
                    page_content=doc.get("md"),  # type: ignore
                    metadata={
                        "page": doc.get("page"),  # type: ignore
                        # **doc.get("metadata", {}),  # type: ignore
                    },  # Include page number in metadata
                )

                documents.append(langchain_document)

            return documents

    # Only reached if the loop above finishes without ever returning a value
    raise ValueError("Something went wrong in the LlamaParse parser: no key produced a result.")
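

# --- Usage sketch (assumption, not part of the original flow) ---
# A minimal illustration of how the helpers above might be chained: write the
# uploaded PDFs to temp files, parse each one with LlamaParse, and always
# remove the temp files afterwards. The function name `process_uploaded_pdfs`
# and the shape of `files` (Django upload objects exposing `.seek()` and
# `.chunks()`, e.g. from a DRF serializer) are assumptions for this sketch.
async def process_uploaded_pdfs(files) -> List[LangchainDocument]:
    listaPDFs = handle_pdf_files_from_serializer(files)
    try:
        all_documents: List[LangchainDocument] = []
        for pdf_path in listaPDFs:
            # Each call tries the configured LlamaParse keys in order.
            all_documents.extend(await return_document_list_with_llama_parser(pdf_path))
        return all_documents
    finally:
        # Temp files are deleted even if parsing raises.
        remove_pdf_temp_files(listaPDFs)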