import os

from langchain_community.document_loaders import PyPDFLoader

from _utils.gerar_relatorio_modelo_usuario.EnhancedDocumentSummarizer import (
    EnhancedDocumentSummarizer,
)
from _utils.models.gerar_relatorio import RetrievalConfig


def reciprocal_rank_fusion(result_lists, weights=None):
    """Fuse multiple ranked lists of (doc_id, score) pairs into one ranking.

    Each list contributes its already-computed score, optionally weighted.
    For true reciprocal rank fusion the caller should pass reciprocal-rank
    scores such as 1 / (rank + k) rather than raw similarity scores.
    """
    fused_scores = {}
    num_lists = len(result_lists)
    if weights is None:
        weights = [1.0] * num_lists

    # Accumulate the weighted score each list assigns to a document
    for i in range(num_lists):
        for doc_id, score in result_lists[i]:
            if doc_id not in fused_scores:
                fused_scores[doc_id] = 0
            fused_scores[doc_id] += weights[i] * score

    # Sort by fused score in descending order
    return sorted(fused_scores.items(), key=lambda x: x[1], reverse=True)


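# Illustrative example (made-up doc ids and scores): fusing an embedding
# ranking with a BM25 ranking, weighting the embedding list higher. Result
# scores are shown rounded.
#
#     embedding_ranked = [("doc_a", 0.9), ("doc_b", 0.7), ("doc_c", 0.4)]
#     bm25_ranked = [("doc_a", 0.6), ("doc_b", 0.8), ("doc_d", 0.3)]
#     reciprocal_rank_fusion([embedding_ranked, bm25_ranked], weights=[0.6, 0.4])
#     # -> [("doc_a", 0.78), ("doc_b", 0.74), ("doc_c", 0.24), ("doc_d", 0.12)]
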
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ.get("LANGCHAIN_API_KEY")
os.environ["LANGCHAIN_PROJECT"] = "VELLA"


async def get_llm_summary_answer_by_cursor_complete(
    serializer, listaPDFs=None, contexto=None
):
    """The "contexto" parameter should only be passed when running the RAGAS
    test, i.e. when you do not want to pass PDFs."""
    allPdfsChunks = []

    # Retrieval configuration
    config = RetrievalConfig(
        num_chunks=serializer["num_chunks_retrieval"],
        embedding_weight=serializer["embedding_weight"],
        bm25_weight=serializer["bm25_weight"],
        context_window=serializer["context_window"],
        chunk_overlap=serializer["chunk_overlap"],
    )

    # Initialize enhanced summarizer
    summarizer = EnhancedDocumentSummarizer(
        openai_api_key=os.environ.get("OPENAI_API_KEY"),
        claude_api_key=os.environ.get("CLAUDE_API_KEY"),
        config=config,
        embedding_model=serializer["hf_embedding"],
        chunk_overlap=serializer["chunk_overlap"],
        chunk_size=serializer["chunk_size"],
        num_k_rerank=serializer["num_k_rerank"],
        model_cohere_rerank=serializer["model_cohere_rerank"],
        claude_context_model=serializer["claude_context_model"],
        prompt_relatorio=serializer["prompt_relatorio"],
        gpt_model=serializer["model"],
        gpt_temperature=serializer["gpt_temperature"],
        id_modelo_do_usuario=serializer["id_modelo_do_usuario"],
        prompt_modelo=serializer["prompt_modelo"],
        reciprocal_rank_fusion=reciprocal_rank_fusion,
    )

    full_text = ""
    if contexto:
        full_text = contexto
        chunks = summarizer.load_and_split_text(full_text)
        allPdfsChunks = chunks
    else:
        # # Load and process document
        # pdf_path = "./Im_a_storyteller.pdf"
        # chunks = summarizer.load_and_split_document(pdf_path)

        # Load and process document
        for pdf in listaPDFs:
            pdf_path = pdf
            chunks = summarizer.load_and_split_document(pdf_path)
            allPdfsChunks = allPdfsChunks + chunks

        # Get full text for contextualization
        loader = PyPDFLoader(pdf_path)
        pages = loader.load()
        full_text = " ".join([page.page_content for page in pages])
    # Contextualize chunks
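    # (each chunk gets LLM-generated, document-level context prepended before
    # indexing, in the spirit of contextual retrieval; the heavy lifting
    # happens inside summarizer.contextual_retriever)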
    if serializer["should_have_contextual_chunks"]:
        contextualized_chunks = (
            await summarizer.contextual_retriever.contextualize_all_chunks(
                pages, allPdfsChunks
            )
        )
        chunks_passados = contextualized_chunks
        is_contextualized_chunk = True
    else:
        chunks_passados = allPdfsChunks
        is_contextualized_chunk = False

    # Create enhanced vector store and BM25 index
    vector_store, bm25, chunk_ids = summarizer.create_enhanced_vector_store(
        chunks_passados, is_contextualized_chunk
    )

    prompt_resumo_sem_context = """You are a language model specialized in producing concise and well-structured legal case summaries in Portuguese. You will receive a variable `context`, which contains information about a legal case. Your task is to read the `context` carefully and produce a summary report in Portuguese, following the specific format provided below. Do not include any additional comments or reasoning steps in your final answer.
**Instructions**:
1. **Chain of Thought**: Before producing your final answer, you must think through and plan your summary silently, without showing this reasoning in the final output. The final answer must only contain the required formatted report and nothing else.
2. **Reading the Context**: Extract the following information from `context`:
- The name of the defendant (réu).
- The crime they have been accused of (nome_do_crime).
- The applicable article and subsection of the Penal Code (artigo_e_inciso_do_crime).
- The date the accusation was accepted (data_do_recebimento).
- The ID of the decision document (id_do_documento).
3. **Prescriptive Details**: If no other interruptive or suspensive causes of prescription are mentioned, confirm that there are none.
4. **Formatting**: Your final answer must strictly follow the format below, in Portuguese, and replace the placeholders with the appropriate information:
```
<formato>
Trata-se de Ação Penal em que o Ministério Público denunciou [nome_do_reu], pela prática do [nome_do_crime] [artigo_e_inciso_do_crime], do Código Penal.
A denúncia foi recebida em [data_do_recebimento], conforme Decisão [id_do_documento].
Não há outras causas interruptivas ou suspensivas da prescrição.
</formato>
```
5. **Completeness**: If any piece of required information is missing in the `context`, note that explicitly in the final answer within the format.
**Reminder**:
- Do not include your chain of thought in the final output.
- Do not add extra information or commentary beyond the specified format.
- The final answer must be in Portuguese.
**Example output**:
```
<formato>
Trata-se de Ação Penal em que o Ministério Público denunciou João da Silva, pela prática do furto qualificado (art. 155, §4º, inciso II do Código Penal).
A denúncia foi recebida em 12/03/2021, conforme Decisão 20210312-01.
Não há outras causas interruptivas ou suspensivas da prescrição.
</formato>
"""
    # Generate the enhanced summary
    structured_summaries = await summarizer.generate_enhanced_summary(
        vector_store,
        bm25,
        chunk_ids,
        prompt_resumo_sem_context,  # serializer["user_message"] could be passed here instead
    )

    if not isinstance(structured_summaries, list):
        # The summarizer returned an error message instead of a list of summaries
        from rest_framework.response import Response

        return Response({"erro": structured_summaries})

    # Assemble the full text: the generated summary followed by each
    # structured section
    texto_completo = summarizer.resumo_gerado + "\n\n"
    for x in structured_summaries:
        texto_completo += x["content"] + "\n"
    return {
        "resultado": structured_summaries,
        "texto_completo": texto_completo,
        "parametros-utilizados": {
            "num_chunks_retrieval": serializer["num_chunks_retrieval"],
            "embedding_weight": serializer["embedding_weight"],
            "bm25_weight": serializer["bm25_weight"],
            "context_window": serializer["context_window"],
            "chunk_overlap": serializer["chunk_overlap"],
            "num_k_rerank": serializer["num_k_rerank"],
            "model_cohere_rerank": serializer["model_cohere_rerank"],
            "more_initial_chunks_for_reranking": serializer[
                "more_initial_chunks_for_reranking"
            ],
            "claude_context_model": serializer["claude_context_model"],
            "gpt_temperature": serializer["gpt_temperature"],
            "user_message": serializer["user_message"],
            "model": serializer["model"],
            "hf_embedding": serializer["hf_embedding"],
            "chunk_size": serializer["chunk_size"],
            "chunk_overlap": serializer["chunk_overlap"],
            "prompt_relatorio": serializer["prompt_relatorio"],
            "prompt_modelo": serializer["prompt_modelo"],
        },
    }
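

# Minimal usage sketch, assuming the module is run outside Django with
# OPENAI_API_KEY, CLAUDE_API_KEY and the other provider keys set in the
# environment. "serializer_exemplo" is a plain dict carrying the keys this
# function reads; every value (model names, weights, paths, prompts) is an
# illustrative placeholder rather than the project's real configuration.
if __name__ == "__main__":
    import asyncio

    serializer_exemplo = {
        "num_chunks_retrieval": 20,
        "embedding_weight": 0.5,
        "bm25_weight": 0.5,
        "context_window": 3,
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "num_k_rerank": 5,
        "model_cohere_rerank": "rerank-multilingual-v3.0",  # placeholder
        "more_initial_chunks_for_reranking": 20,
        "claude_context_model": "claude-3-haiku-20240307",  # placeholder
        "gpt_temperature": 0.0,
        "user_message": "",
        "model": "gpt-4o-mini",  # placeholder
        "hf_embedding": "sentence-transformers/all-MiniLM-L6-v2",  # placeholder
        "prompt_relatorio": "...",  # the real report prompt goes here
        "prompt_modelo": "...",  # the real model prompt goes here
        "id_modelo_do_usuario": 1,  # placeholder id
        "should_have_contextual_chunks": False,
    }

    resultado = asyncio.run(
        get_llm_summary_answer_by_cursor_complete(
            serializer_exemplo, listaPDFs=["./processo_exemplo.pdf"]  # placeholder
        )
    )
    print(resultado["texto_completo"])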