import os

from langchain_community.document_loaders import PyPDFLoader

from _utils.gerar_relatorio_modelo_usuario.EnhancedDocumentSummarizer import (
    EnhancedDocumentSummarizer,
)
from _utils.models.gerar_relatorio import RetrievalConfig


def reciprocal_rank_fusion(result_lists, weights=None):
    """Fuse multiple ranked lists of (doc_id, score) pairs into one ranking.

    Each list contributes its already-computed score, optionally weighted.
    For true reciprocal rank fusion the caller should pass reciprocal-rank
    scores such as 1 / (rank + k) rather than raw similarity scores.
    """
    fused_scores = {}
    num_lists = len(result_lists)
    if weights is None:
        weights = [1.0] * num_lists

    # Accumulate the weighted score each list assigns to a document
    for i in range(num_lists):
        for doc_id, score in result_lists[i]:
            if doc_id not in fused_scores:
                fused_scores[doc_id] = 0
            fused_scores[doc_id] += weights[i] * score

    # Sort by fused score in descending order
    return sorted(fused_scores.items(), key=lambda x: x[1], reverse=True)


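# Illustrative example (made-up doc ids and scores): fusing an embedding
# ranking with a BM25 ranking, weighting the embedding list higher. Result
# scores are shown rounded.
#
#     embedding_ranked = [("doc_a", 0.9), ("doc_b", 0.7), ("doc_c", 0.4)]
#     bm25_ranked = [("doc_a", 0.6), ("doc_b", 0.8), ("doc_d", 0.3)]
#     reciprocal_rank_fusion([embedding_ranked, bm25_ranked], weights=[0.6, 0.4])
#     # -> [("doc_a", 0.78), ("doc_b", 0.74), ("doc_c", 0.24), ("doc_d", 0.12)]
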
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ.get("LANGCHAIN_API_KEY")
os.environ["LANGCHAIN_PROJECT"] = "VELLA"


async def get_llm_summary_answer_by_cursor_complete(
    serializer, listaPDFs=None, contexto=None
):
    """The "contexto" parameter should only be passed when running the RAGAS
    test, i.e. when you do not want to pass PDFs."""
    allPdfsChunks = []

    # Retrieval configuration
    config = RetrievalConfig(
        num_chunks=serializer["num_chunks_retrieval"],
        embedding_weight=serializer["embedding_weight"],
        bm25_weight=serializer["bm25_weight"],
        context_window=serializer["context_window"],
        chunk_overlap=serializer["chunk_overlap"],
    )

    # Initialize enhanced summarizer
    summarizer = EnhancedDocumentSummarizer(
        openai_api_key=os.environ.get("OPENAI_API_KEY"),
        claude_api_key=os.environ.get("CLAUDE_API_KEY"),
        config=config,
        embedding_model=serializer["hf_embedding"],
        chunk_overlap=serializer["chunk_overlap"],
        chunk_size=serializer["chunk_size"],
        num_k_rerank=serializer["num_k_rerank"],
        model_cohere_rerank=serializer["model_cohere_rerank"],
        claude_context_model=serializer["claude_context_model"],
        prompt_relatorio=serializer["prompt_relatorio"],
        gpt_model=serializer["model"],
        gpt_temperature=serializer["gpt_temperature"],
        id_modelo_do_usuario=serializer["id_modelo_do_usuario"],
        prompt_modelo=serializer["prompt_modelo"],
        reciprocal_rank_fusion=reciprocal_rank_fusion,
    )

    full_text = ""
    if contexto:
        full_text = contexto
        chunks = summarizer.load_and_split_text(full_text)
        allPdfsChunks = chunks
    else:
        # # Load and process document
        # pdf_path = "./Im_a_storyteller.pdf"
        # chunks = summarizer.load_and_split_document(pdf_path)

        # Load and process document
        for pdf in listaPDFs:
            pdf_path = pdf
            chunks = summarizer.load_and_split_document(pdf_path)
            allPdfsChunks = allPdfsChunks + chunks

        # Get full text for contextualization
        loader = PyPDFLoader(pdf_path)
        pages = loader.load()
        full_text = " ".join([page.page_content for page in pages])
    # Contextualize chunks
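    # (each chunk gets LLM-generated, document-level context prepended before
    # indexing, in the spirit of contextual retrieval; the heavy lifting
    # happens inside summarizer.contextual_retriever)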
    if serializer["should_have_contextual_chunks"]:
        contextualized_chunks = (
            await summarizer.contextual_retriever.contextualize_all_chunks(
                pages, allPdfsChunks
            )
        )
        chunks_passados = contextualized_chunks
        is_contextualized_chunk = True
    else:
        chunks_passados = allPdfsChunks
        is_contextualized_chunk = False

    # Create enhanced vector store and BM25 index
    vector_store, bm25, chunk_ids = summarizer.create_enhanced_vector_store(
        chunks_passados, is_contextualized_chunk
    )

    prompt_resumo_sem_context = """You are a language model specialized in producing concise and well-structured legal case summaries in Portuguese. You will receive a variable `context`, which contains information about a legal case. Your task is to read the `context` carefully and produce a summary report in Portuguese, following the specific format provided below. Do not include any additional comments or reasoning steps in your final answer.
**Instructions**:
1. **Chain of Thought**: Before producing your final answer, you must think through and plan your summary silently, without showing this reasoning in the final output. The final answer must only contain the required formatted report and nothing else.
2. **Reading the Context**: Extract the following information from `context`:
- The name of the defendant (réu).
- The crime they have been accused of (nome_do_crime).
- The applicable article and subsection of the Penal Code (artigo_e_inciso_do_crime).
- The date the accusation was accepted (data_do_recebimento).
- The ID of the decision document (id_do_documento).
3. **Prescriptive Details**: If no other interruptive or suspensive causes of prescription are mentioned, confirm that there are none.
4. **Formatting**: Your final answer must strictly follow the format below, in Portuguese, and replace the placeholders with the appropriate information:
```
<formato>
Trata-se de Ação Penal em que o Ministério Público denunciou [nome_do_reu], pela prática do [nome_do_crime] [artigo_e_inciso_do_crime], do Código Penal.
A denúncia foi recebida em [data_do_recebimento], conforme Decisão [id_do_documento].
Não há outras causas interruptivas ou suspensivas da prescrição.
</formato>
```
5. **Completeness**: If any piece of required information is missing in the `context`, note that explicitly in the final answer within the format.
**Reminder**:
- Do not include your chain of thought in the final output.
- Do not add extra information or commentary beyond the specified format.
- The final answer must be in Portuguese.
**Example output**:
```
<formato>
Trata-se de Ação Penal em que o Ministério Público denunciou João da Silva, pela prática do furto qualificado (art. 155, §4º, inciso II do Código Penal).
A denúncia foi recebida em 12/03/2021, conforme Decisão 20210312-01.
Não há outras causas interruptivas ou suspensivas da prescrição.
</formato>
"""
    # Generate the enhanced summary
    structured_summaries = await summarizer.generate_enhanced_summary(
        vector_store,
        bm25,
        chunk_ids,
        prompt_resumo_sem_context,  # serializer["user_message"] could be passed here instead
    )

    if not isinstance(structured_summaries, list):
        # The summarizer returned an error message instead of a list of summaries
        from rest_framework.response import Response

        return Response({"erro": structured_summaries})

    # Assemble the full text: the generated summary followed by each
    # structured section
    texto_completo = summarizer.resumo_gerado + "\n\n"
    for x in structured_summaries:
        texto_completo += x["content"] + "\n"
    return {
        "resultado": structured_summaries,
        "texto_completo": texto_completo,
        "parametros-utilizados": {
            "num_chunks_retrieval": serializer["num_chunks_retrieval"],
            "embedding_weight": serializer["embedding_weight"],
            "bm25_weight": serializer["bm25_weight"],
            "context_window": serializer["context_window"],
            "chunk_overlap": serializer["chunk_overlap"],
            "num_k_rerank": serializer["num_k_rerank"],
            "model_cohere_rerank": serializer["model_cohere_rerank"],
            "more_initial_chunks_for_reranking": serializer[
                "more_initial_chunks_for_reranking"
            ],
            "claude_context_model": serializer["claude_context_model"],
            "gpt_temperature": serializer["gpt_temperature"],
            "user_message": serializer["user_message"],
            "model": serializer["model"],
            "hf_embedding": serializer["hf_embedding"],
            "chunk_size": serializer["chunk_size"],
            "chunk_overlap": serializer["chunk_overlap"],
            "prompt_relatorio": serializer["prompt_relatorio"],
            "prompt_modelo": serializer["prompt_modelo"],
        },
    }
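

# Minimal usage sketch, assuming the module is run outside Django with
# OPENAI_API_KEY, CLAUDE_API_KEY and the other provider keys set in the
# environment. "serializer_exemplo" is a plain dict carrying the keys this
# function reads; every value (model names, weights, paths, prompts) is an
# illustrative placeholder rather than the project's real configuration.
if __name__ == "__main__":
    import asyncio

    serializer_exemplo = {
        "num_chunks_retrieval": 20,
        "embedding_weight": 0.5,
        "bm25_weight": 0.5,
        "context_window": 3,
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "num_k_rerank": 5,
        "model_cohere_rerank": "rerank-multilingual-v3.0",  # placeholder
        "more_initial_chunks_for_reranking": 20,
        "claude_context_model": "claude-3-haiku-20240307",  # placeholder
        "gpt_temperature": 0.0,
        "user_message": "",
        "model": "gpt-4o-mini",  # placeholder
        "hf_embedding": "sentence-transformers/all-MiniLM-L6-v2",  # placeholder
        "prompt_relatorio": "...",  # the real report prompt goes here
        "prompt_modelo": "...",  # the real model prompt goes here
        "id_modelo_do_usuario": 1,  # placeholder id
        "should_have_contextual_chunks": False,
    }

    resultado = asyncio.run(
        get_llm_summary_answer_by_cursor_complete(
            serializer_exemplo, listaPDFs=["./processo_exemplo.pdf"]  # placeholder
        )
    )
    print(resultado["texto_completo"])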