import os
from typing import List, Dict, Tuple, Optional
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
import uuid
import logging
from cohere import Client
from _utils.models.gerar_relatorio import (
    DocumentChunk,
)
from langchain.schema import Document
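
# Overview (added for readability): DocumentSummarizer implements a small
# retrieval-augmented summarization pipeline. A PDF (or raw text) is loaded and
# split into chunks that keep positional metadata, the chunks are embedded into a
# Chroma vector store, retrieved candidates are reranked with Cohere, and an
# OpenAI chat model produces a summary whose key points are tied back to their
# source chunks (page, excerpt and relevance score).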


class DocumentSummarizer:
    def __init__(
        self,
        openai_api_key: str,
        cohere_api_key: str,
        embedding_model,
        chunk_size,
        chunk_overlap,
        num_k_rerank,
        model_cohere_rerank,
    ):
        self.openai_api_key = openai_api_key
        self.cohere_client = Client(cohere_api_key)
        self.embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )
        self.chunk_metadata = {}  # Store chunk metadata for tracing
        self.num_k_rerank = num_k_rerank
        self.model_cohere_rerank = model_cohere_rerank

    def load_and_split_document(self, pdf_path: str) -> List[DocumentChunk]:
        """Load PDF and split into chunks with metadata"""
        loader = PyPDFLoader(pdf_path)
        pages = (
            loader.load()
        )  # Returns a list of Document objects, one item per full page of the PDF.
        chunks = []
        char_count = 0

        for page in pages:
            text = page.page_content
            page_chunks = self.text_splitter.split_text(
                text
            )  # Splits the full-page Document into a list of chunks, each smaller than a page.

            for chunk in page_chunks:
                chunk_id = str(uuid.uuid4())
                start_char = text.find(
                    chunk
                )  # Position where the chunk starts within the full page text
                end_char = start_char + len(chunk)

                doc_chunk = DocumentChunk(  # Build the chunk object with extra metadata such as its position and id
                    content=chunk,
                    page_number=page.metadata.get("page") + 1,  # 1-based page numbering
                    chunk_id=chunk_id,
                    start_char=char_count + start_char,
                    end_char=char_count + end_char,
                )
                chunks.append(doc_chunk)

                # Store metadata for later retrieval
                self.chunk_metadata[chunk_id] = {
                    "page": doc_chunk.page_number,
                    "start_char": doc_chunk.start_char,
                    "end_char": doc_chunk.end_char,
                }

            char_count += len(text)

        return chunks

    def load_and_split_text(self, text: str) -> List[DocumentChunk]:
        """Load Text and split into chunks with metadata - Criei essa função apenas para o ragas"""
        page = Document(page_content=text, metadata={"page": 1})
        chunks = []
        char_count = 0

        text = page.page_content
        page_chunks = self.text_splitter.split_text(
            text
        )  # Splits the single-page Document into a list of chunks, each smaller than a page.

        for chunk in page_chunks:
            chunk_id = str(uuid.uuid4())
            start_char = text.find(
                chunk
            )  # Position where the chunk starts within the full page text
            end_char = start_char + len(chunk)

            doc_chunk = DocumentChunk(  # Build the chunk object with extra metadata such as its position and id
                content=chunk,
                page_number=page.metadata.get("page"),  # the metadata above already stores the 1-based page number
                chunk_id=chunk_id,
                start_char=char_count + start_char,
                end_char=char_count + end_char,
            )
            chunks.append(doc_chunk)

            # Store metadata for later retrieval
            self.chunk_metadata[chunk_id] = {
                "page": doc_chunk.page_number,
                "start_char": doc_chunk.start_char,
                "end_char": doc_chunk.end_char,
            }

        char_count += len(text)

        return chunks

    def create_vector_store(
        self, chunks: List[DocumentChunk]
    ) -> Chroma:  # This function is currently unused
        """Create vector store with metadata"""
        texts = [chunk.content for chunk in chunks]
        metadatas = [
            {
                "chunk_id": chunk.chunk_id,
                "page": chunk.page_number,
                "start_char": chunk.start_char,
                "end_char": chunk.end_char,
            }
            for chunk in chunks
        ]

        vector_store = Chroma.from_texts(
            texts=texts, metadatas=metadatas, embedding=self.embeddings
        )
        return vector_store

    def rerank_chunks(  # This function is currently unused
        self, chunks: List[Dict], query: str, k: int = 5
    ) -> List[Dict]:
        """
        Rerank chunks using Cohere's reranking model.

        Args:
            chunks: List of dictionaries containing chunks and their metadata
            query: Original search query
            k: Number of top chunks to return

        Returns:
            List of reranked chunks with updated relevance scores
        """
        try:
            # Prepare documents for reranking
            documents = [chunk["content"] for chunk in chunks]

            # Get reranking scores from Cohere
            results = self.cohere_client.rerank(
                query=query,
                documents=documents,
                top_n=k,
                model=self.model_cohere_rerank,
            )

            # Create reranked results with original metadata
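            # Note: this loop iterates the rerank response object directly; depending on
            # the installed Cohere SDK version the hits may instead live under `results.results`.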
            reranked_chunks = []
            for hit in results:
                original_chunk = chunks[hit.index]
                reranked_chunks.append(
                    {**original_chunk, "relevance_score": hit.relevance_score}
                )

            return reranked_chunks

        except Exception as e:
            logging.error(f"Reranking failed: {str(e)}")
            return chunks[:k]  # Fallback to original ordering

    def generate_summary_with_sources(  # This function is currently unused
        self,
        vector_store: Chroma,
        query: str = "Summarize the main points of this document",
    ) -> List[Dict]:
        """Generate summary with source citations using reranking"""
        # Retrieve more initial chunks for reranking
        relevant_docs = vector_store.similarity_search_with_score(query, k=20)

        # Prepare chunks for reranking
        chunks = []
        for doc, score in relevant_docs:
            chunks.append(
                {
                    "content": doc.page_content,
                    "page": doc.metadata["page"],
                    "chunk_id": doc.metadata["chunk_id"],
                    "relevance_score": score,
                }
            )

        # Rerank chunks
        reranked_chunks = self.rerank_chunks(chunks, query, k=self.num_k_rerank)

        # Prepare context and sources from reranked chunks
        contexts = []
        sources = []

        for chunk in reranked_chunks:
            contexts.append(chunk["content"])
            sources.append(
                {
                    "content": chunk["content"],
                    "page": chunk["page"],
                    "chunk_id": chunk["chunk_id"],
                    "relevance_score": chunk["relevance_score"],
                }
            )

        prompt_template = """
        Based on the following context, provide multiple key points from the document.
        For each point, create a new paragraph.
        Each paragraph should be a complete, self-contained insight.
        
        Context: {context}
        
        Key points:
        """

        prompt = PromptTemplate(template=prompt_template, input_variables=["context"])

        llm = ChatOpenAI(
            temperature=0, model_name="gpt-4o-mini", api_key=self.openai_api_key
        )

        response = llm.predict(prompt.format(context="\n\n".join(contexts)))

        # Split the response into paragraphs
        summaries = [p.strip() for p in response.split("\n\n") if p.strip()]

        # Create structured output
        structured_output = []
        for idx, summary in enumerate(summaries):
            # Associate each summary with the most relevant source
            structured_output.append(
                {
                    "content": summary,
                    "source": {
                        "page": sources[min(idx, len(sources) - 1)]["page"],
                        "text": sources[min(idx, len(sources) - 1)]["content"][:200]
                        + "...",
                        "relevance_score": sources[min(idx, len(sources) - 1)][
                            "relevance_score"
                        ],
                    },
                }
            )

        return structured_output

    def get_source_context(
        self, chunk_id: str, window: int = 100
    ) -> Optional[Dict]:  # This function is currently unused
        """Get extended context around a specific chunk"""
        metadata = self.chunk_metadata.get(chunk_id)
        if not metadata:
            return None

        return {
            "page": metadata["page"],
            "start_char": metadata["start_char"],
            "end_char": metadata["end_char"],
        }
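

# --- Minimal usage sketch, not part of the original module ---
# Shows how the class above could be wired together end to end: load a PDF,
# split it into chunks, build the Chroma store, and print a summary with source
# pages. The embedding model name, rerank model name, chunk sizes and the PDF
# path are illustrative assumptions; API keys are read from the environment.
if __name__ == "__main__":
    summarizer = DocumentSummarizer(
        openai_api_key=os.environ["OPENAI_API_KEY"],
        cohere_api_key=os.environ["COHERE_API_KEY"],
        embedding_model="sentence-transformers/all-MiniLM-L6-v2",  # assumed embedding model
        chunk_size=1000,
        chunk_overlap=200,
        num_k_rerank=5,
        model_cohere_rerank="rerank-multilingual-v3.0",  # assumed rerank model
    )

    chunks = summarizer.load_and_split_document("example.pdf")  # hypothetical path
    vector_store = summarizer.create_vector_store(chunks)
    for item in summarizer.generate_summary_with_sources(vector_store):
        print(f'[p. {item["source"]["page"]}] {item["content"]}')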