import os
import asyncio
import logging
from typing import List

from anthropic import Anthropic, AsyncAnthropic
from langchain.schema import Document
from langchain_openai import ChatOpenAI

# from _utils.gerar_relatorio_modelo_usuario.prompts import (
#     prompt_auxiliar_do_contextual_prompt,
# )
from _utils.chains.Chain_class import Chain
from _utils.prompts.Prompt_class import Prompt
from _utils.splitters.Splitter_class import Splitter
from setup.easy_imports import PyPDFLoader

from _utils.gerar_relatorio_modelo_usuario.llm_calls import aclaude_answer, agpt_answer
from _utils.gerar_relatorio_modelo_usuario.prompts import contextual_prompt
from _utils.models.gerar_relatorio import (
    ContextualizedChunk,
    DocumentChunk,
    RetrievalConfig,
)

# Module-level progress counter: create_contextualized_chunk appends one
# element per chunk, so its print shows how many requests have started.
lista_contador = []


class ContextualRetriever:
    def __init__(
        self, config: RetrievalConfig, claude_api_key: str, claude_context_model: str
    ):
        self.config = config
        # self.claude_client = Anthropic(api_key=claude_api_key)
        self.claude_client = AsyncAnthropic(api_key=claude_api_key)
        self.logger = logging.getLogger(__name__)
        self.bm25 = None
        self.claude_context_model = claude_context_model

    async def llm_generate_context(self, full_text: str, chunk: DocumentChunk) -> str:
        """Generate a contextual description for a chunk via the GPT helper
        (agpt_answer); the equivalent Claude call is kept commented out below
        as an alternative."""
        try:
            print("REQUEST STARTED")
            prompt = contextual_prompt(full_text, chunk.content)
            # response = await aclaude_answer(
            #     self.claude_client, self.claude_context_model, prompt
            # )

            response = await agpt_answer(prompt)
            return response
        except Exception as e:
            self.logger.error(
                f"Context generation failed for chunk {chunk.chunk_id}: {str(e)}"
            )
            # Return an empty context so a single failed chunk does not abort
            # the whole batch.
            return ""

    # def gerar_resumo_auxiliar_do_contextual_embedding(self):
    #     prompt = Prompt().create_prompt_template(
    #         "", prompt_auxiliar_do_contextual_prompt
    #     )
    #     Chain(prompt, ChatOpenAI())
    #     return

    async def create_contextualized_chunk(
        self, chunk: DocumentChunk, full_text: List[Document]
    ) -> ContextualizedChunk:
        lista_contador.append(0)
        print("counter: ", len(lista_contador))
        # The commented code below would read the pages surrounding the chunk's
        # current page instead of only the page itself:
        # page_content = ""
        # for i in range(
        #     max(0, chunk.page_number - 1),
        #     min(len(full_text), chunk.page_number + 2),
        # ):
        #     page_content += full_text[i].page_content if full_text[i] else ""
        page_number = chunk.page_number - 1  # 1-based page number -> 0-based index
        page_content = full_text[page_number].page_content
        context = await self.llm_generate_context(page_content, chunk)
        return ContextualizedChunk(
            content=chunk.content,
            page_number=chunk.page_number,
            chunk_id=chunk.chunk_id,
            start_char=chunk.start_char,
            end_char=chunk.end_char,
            context=context,
        )

    async def contextualize_all_chunks(
        self, full_text: List[Document], chunks: List[DocumentChunk]
    ) -> List[ContextualizedChunk]:
        """Add context to all chunks concurrently."""
        # Reset the module-level progress counter; a bare `lista_contador = []`
        # here would only bind an unused local and leave the global untouched.
        lista_contador.clear()

        # asyncio.TaskGroup requires Python 3.11+; on older versions,
        # `await asyncio.gather(*coroutines)` is the usual substitute.
        async with asyncio.TaskGroup() as tg:
            tasks = [
                tg.create_task(self.create_contextualized_chunk(chunk, full_text))
                for chunk in chunks
            ]

        return [task.result() for task in tasks]
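
# Minimal usage sketch for ContextualRetriever (hedged: RetrievalConfig() with
# defaults, the CLAUDE_API_KEY environment variable, the model name, and
# "example.pdf" are illustrative assumptions, not values from this codebase):
#
#     retriever = ContextualRetriever(
#         RetrievalConfig(), os.environ["CLAUDE_API_KEY"], "claude-3-5-sonnet-latest"
#     )
#     pages = PyPDFLoader("example.pdf").load()
#     chunks = Splitter().load_and_split_document("example.pdf")
#     contextualized = asyncio.run(retriever.contextualize_all_chunks(pages, chunks))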


def get_full_text_and_all_PDFs_chunks(contexto, listaPDFs, splitterObject: Splitter):
    all_PDFs_chunks: List[DocumentChunk] = []
    full_text = ""
    pages: List[Document] = []
    if contexto:
        # An explicit context overrides the PDFs entirely; note that `pages`
        # stays empty in this mode.
        full_text = contexto
        all_PDFs_chunks = splitterObject.load_and_split_text(full_text)
    else:
        # Load and split each document, accumulating its chunks and pages
        for pdf_path in listaPDFs:
            all_PDFs_chunks += splitterObject.load_and_split_document(pdf_path)
            pages += PyPDFLoader(pdf_path).load()
        # Get full text for contextualization
        full_text = " ".join(page.page_content for page in pages)

    return full_text, all_PDFs_chunks, pages
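
# Hedged example of the two input modes (file paths and the bare Splitter()
# constructor are illustrative assumptions):
#
#     splitter = Splitter()
#     # From raw text (no PDFs; `pages` comes back empty):
#     text, chunks, pages = get_full_text_and_all_PDFs_chunks("some text", [], splitter)
#     # From PDFs on disk:
#     text, chunks, pages = get_full_text_and_all_PDFs_chunks(None, ["a.pdf", "b.pdf"], splitter)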


async def contextualize_chunk_based_on_serializer(
    serializer, contextual_retriever: ContextualRetriever, pages, all_PDFs_chunks
):
    """Contextualize the chunks only when the serializer asks for it;
    otherwise pass the raw chunks through unchanged."""
    if serializer["should_have_contextual_chunks"]:
        contextualized_chunks = await contextual_retriever.contextualize_all_chunks(
            pages, all_PDFs_chunks
        )
        chunks_passados = contextualized_chunks
        is_contextualized_chunk = True
    else:
        chunks_passados = all_PDFs_chunks
        is_contextualized_chunk = False

    return chunks_passados, is_contextualized_chunk
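
# End-to-end wiring sketch (hedged: the serializer dict, model name, and paths
# are illustrative assumptions):
#
#     async def pipeline(listaPDFs):
#         retriever = ContextualRetriever(
#             RetrievalConfig(), os.environ["CLAUDE_API_KEY"], "claude-3-5-sonnet-latest"
#         )
#         full_text, chunks, pages = get_full_text_and_all_PDFs_chunks(
#             None, listaPDFs, Splitter()
#         )
#         serializer = {"should_have_contextual_chunks": True}
#         return await contextualize_chunk_based_on_serializer(
#             serializer, retriever, pages, chunks
#         )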