import os

from langchain_community.document_loaders import PyPDFLoader
from rest_framework.response import Response

from _utils.gerar_relatorio_modelo_usuario.EnhancedDocumentSummarizer import (
    EnhancedDocumentSummarizer,
)
from _utils.models.gerar_relatorio import (
    RetrievalConfig,
)


def reciprocal_rank_fusion(result_lists, weights=None):
    """Combine multiple ranked lists of (doc_id, score) pairs into one ranking.

    Note: despite the name, this fuses the raw scores by weighted sum rather
    than by the classic RRF formula 1 / (k + rank).
    """
    fused_scores = {}
    num_lists = len(result_lists)
    if weights is None:
        weights = [1.0] * num_lists

    for i in range(num_lists):
        for doc_id, score in result_lists[i]:
            if doc_id not in fused_scores:
                fused_scores[doc_id] = 0
            fused_scores[doc_id] += weights[i] * score

    # Sort by fused score in descending order
    return sorted(fused_scores.items(), key=lambda x: x[1], reverse=True)
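
# Illustrative call (hypothetical doc IDs and scores): fuse an embedding
# ranking with a BM25 ranking, weighting the embedding list slightly higher.
#
#   reciprocal_rank_fusion(
#       [[("doc1", 0.9), ("doc2", 0.5)], [("doc2", 1.2), ("doc3", 0.7)]],
#       weights=[0.6, 0.4],
#   )
#   # -> [("doc2", 0.78), ("doc1", 0.54), ("doc3", 0.28)]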
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ.get("LANGCHAIN_API_KEY")
os.environ["LANGCHAIN_PROJECT"] = "VELLA"


async def get_llm_summary_answer_by_cursor_complete(
    serializer, listaPDFs=None, contexto=None
):
    """The `contexto` parameter should only be passed when running the ragas
    test, i.e., when you do not want to pass PDFs."""
    allPdfsChunks = []

    # Retrieval configuration
    config = RetrievalConfig(
        num_chunks=serializer["num_chunks_retrieval"],
        embedding_weight=serializer["embedding_weight"],
        bm25_weight=serializer["bm25_weight"],
        context_window=serializer["context_window"],
        chunk_overlap=serializer["chunk_overlap"],
    )

    # Initialize the enhanced summarizer
    summarizer = EnhancedDocumentSummarizer(
        openai_api_key=os.environ.get("OPENAI_API_KEY"),
        claude_api_key=os.environ.get("CLAUDE_API_KEY"),
        config=config,
        embedding_model=serializer["hf_embedding"],
        chunk_overlap=serializer["chunk_overlap"],
        chunk_size=serializer["chunk_size"],
        num_k_rerank=serializer["num_k_rerank"],
        model_cohere_rerank=serializer["model_cohere_rerank"],
        claude_context_model=serializer["claude_context_model"],
        prompt_relatorio=serializer["prompt_relatorio"],
        gpt_model=serializer["model"],
        gpt_temperature=serializer["gpt_temperature"],
        id_modelo_do_usuario=serializer["id_modelo_do_usuario"],
        prompt_modelo=serializer["prompt_modelo"],
        reciprocal_rank_fusion=reciprocal_rank_fusion,
    )
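    # Judging by its parameters, this single object wires together the whole
    # pipeline: chunking, HuggingFace embeddings, Cohere reranking,
    # Claude-based chunk contextualization, and a GPT model for the report.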
full_text = ""
if contexto:
full_text = contexto
chunks = summarizer.load_and_split_text(full_text)
allPdfsChunks = chunks
else:
# # Load and process document
# pdf_path = "./Im_a_storyteller.pdf"
# chunks = summarizer.load_and_split_document(pdf_path)
# Load and process document
for pdf in listaPDFs:
pdf_path = pdf
chunks = summarizer.load_and_split_document(pdf_path)
allPdfsChunks = allPdfsChunks + chunks
# Get full text for contextualization
loader = PyPDFLoader(pdf_path)
pages = loader.load()
full_text = " ".join([page.page_content for page in pages])

    # Contextualize chunks
    if serializer["should_have_contextual_chunks"]:
        contextualized_chunks = (
            await summarizer.contextual_retriever.contextualize_all_chunks(
                pages, allPdfsChunks
            )
        )
        chunks_passados = contextualized_chunks
        is_contextualized_chunk = True
    else:
        chunks_passados = allPdfsChunks
        is_contextualized_chunk = False

    # Create the enhanced vector store and the BM25 index
    vector_store, bm25, chunk_ids = summarizer.create_enhanced_vector_store(
        chunks_passados, is_contextualized_chunk
    )
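    # These two indexes back the hybrid retrieval step: presumably the
    # summarizer queries the vector store and the BM25 index separately and
    # merges the two rankings via `reciprocal_rank_fusion`, weighted by the
    # `embedding_weight` / `bm25_weight` values from the RetrievalConfig.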
prompt_resumo_sem_context = """You are a language model specialized in producing concise and well-structured legal case summaries in Portuguese. You will receive a variable `context`, which contains information about a legal case. Your task is to read the `context` carefully and produce a summary report in Portuguese, following the specific format provided below. Do not include any additional comments or reasoning steps in your final answer.
**Instructions**:
1. **Chain of Thought**: Before producing your final answer, you must think through and plan your summary silently, without showing this reasoning in the final output. The final answer must only contain the required formatted report and nothing else.
2. **Reading the Context**: Extract the following information from `context`:
- The name of the defendant (réu).
- The crime they have been accused of (nome_do_crime).
- The applicable article and subsection of the Penal Code (artigo_e_inciso_do_crime).
- The date the accusation was accepted (data_do_recebimento).
- The ID of the decision document (id_do_documento).
3. **Prescriptive Details**: If no other interruptive or suspensive causes of prescription are mentioned, confirm that there are none.
4. **Formatting**: Your final answer must strictly follow the format below, in Portuguese, and replace the placeholders with the appropriate information:
```
<formato>
Trata-se de Ação Penal em que o Ministério Público denunciou [nome_do_reu], pela prática do [nome_do_crime] [artigo_e_inciso_do_crime], do Código Penal.
A denúncia foi recebida em [data_do_recebimento], conforme Decisão [id_do_documento].
Não há outras causas interruptivas ou suspensivas da prescrição.
</formato>
```
5. **Completeness**: If any piece of required information is missing in the `context`, note that explicitly in the final answer within the format.
**Reminder**:
- Do not include your chain of thought in the final output.
- Do not add extra information or commentary beyond the specified format.
- The final answer must be in Portuguese.
```
<formato>
Trata-se de Ação Penal em que o Ministério Público denunciou João da Silva, pela prática do furto qualificado (art. 155, §4º, inciso II do Código Penal).
A denúncia foi recebida em 12/03/2021, conforme Decisão 20210312-01.
Não há outras causas interruptivas ou suspensivas da prescrição.
</formato>
"""

    # Generate the enhanced summary
    structured_summaries = await summarizer.generate_enhanced_summary(
        vector_store,
        bm25,
        chunk_ids,
        prompt_resumo_sem_context,  # serializer["user_message"] could be passed here instead
    )

    # The summarizer signals an error by returning something other than a list
    if not isinstance(structured_summaries, list):
        return Response({"erro": structured_summaries})
texto_completo = ""
print("\n\n\n")
print("summarizer.resumo_gerado: ", summarizer.resumo_gerado)
texto_completo += summarizer.resumo_gerado
texto_completo += "\n\n"
print("\n\n\n")
print("structured_summaries: ", structured_summaries)
for x in structured_summaries:
texto_completo = texto_completo + x["content"] + "\n"

    return {
        "resultado": structured_summaries,
        "texto_completo": texto_completo,
        "parametros-utilizados": {
            "num_chunks_retrieval": serializer["num_chunks_retrieval"],
            "embedding_weight": serializer["embedding_weight"],
            "bm25_weight": serializer["bm25_weight"],
            "context_window": serializer["context_window"],
            "chunk_overlap": serializer["chunk_overlap"],
            "num_k_rerank": serializer["num_k_rerank"],
            "model_cohere_rerank": serializer["model_cohere_rerank"],
            "more_initial_chunks_for_reranking": serializer[
                "more_initial_chunks_for_reranking"
            ],
            "claude_context_model": serializer["claude_context_model"],
            "gpt_temperature": serializer["gpt_temperature"],
            "user_message": serializer["user_message"],
            "model": serializer["model"],
            "hf_embedding": serializer["hf_embedding"],
            "chunk_size": serializer["chunk_size"],
            "prompt_relatorio": serializer["prompt_relatorio"],
            "prompt_modelo": serializer["prompt_modelo"],
        },
    }
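

# Minimal usage sketch (illustrative only; assumes `serializer` is a dict-like
# object carrying every key read above, and that the PDF path exists):
#
#   import asyncio
#   resultado = asyncio.run(
#       get_llm_summary_answer_by_cursor_complete(
#           serializer, listaPDFs=["/caminho/para/processo.pdf"]
#       )
#   )
#   print(resultado["texto_completo"])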