Spaces:
Running
Running
File size: 5,165 Bytes
1286e81 12d3e1a 1286e81 20e3edd 1286e81 20e3edd 1286e81 12d3e1a 1286e81 20e3edd 1286e81 20e3edd 1286e81 baeaaa5 1286e81 baeaaa5 12d3e1a 20e3edd 12d3e1a 1286e81 12d3e1a baeaaa5 1286e81 20e3edd 1286e81 baeaaa5 1286e81 baeaaa5 1286e81 20e3edd 1286e81 baeaaa5 1286e81 baeaaa5 1286e81 12d3e1a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 |
import os
# from _utils.gerar_relatorio_modelo_usuario.prompts import (
# prompt_auxiliar_do_contextual_prompt,
# )
from _utils.chains.Chain_class import Chain
from _utils.prompts.Prompt_class import Prompt
from _utils.splitters.Splitter_class import Splitter
from setup.easy_imports import PyPDFLoader
from langchain_openai import ChatOpenAI
from typing import List, Dict, Tuple, Optional
from anthropic import Anthropic, AsyncAnthropic
import logging
from langchain.schema import Document
import asyncio
from langchain.prompts import PromptTemplate
from typing import List
from multiprocessing import Process, Barrier, Queue
from dataclasses import dataclass
from langchain_core.messages import HumanMessage
from asgiref.sync import sync_to_async
from _utils.gerar_relatorio_modelo_usuario.llm_calls import aclaude_answer, agpt_answer
from _utils.gerar_relatorio_modelo_usuario.prompts import contextual_prompt
from _utils.models.gerar_relatorio import (
ContextualizedChunk,
DocumentChunk,
RetrievalConfig,
)
# Module-level counter of chunk-contextualization requests; one element is
# appended per chunk processed so `len(lista_contador)` is the running count.
lista_contador = []


class ContextualRetriever:
    """Generate a short LLM-written context for each document chunk.

    For every ``DocumentChunk`` the text of its source page is sent to an LLM,
    and the returned description is attached to the chunk as a
    ``ContextualizedChunk`` for later retrieval.
    """

    def __init__(
        self, config: RetrievalConfig, claude_api_key: str, claude_context_model: str
    ):
        self.config = config
        # Async Claude client is constructed but the active code path below
        # uses the GPT helper (`agpt_answer`); kept for easy switching back.
        self.claude_client = AsyncAnthropic(api_key=claude_api_key)
        self.logger = logging.getLogger(__name__)
        self.bm25 = None
        self.claude_context_model = claude_context_model

    async def llm_generate_context(self, full_text: str, chunk: DocumentChunk) -> str:
        """Generate a contextual description for *chunk* from *full_text*.

        Returns an empty string on failure so that one bad chunk does not
        abort the whole contextualization batch.
        """
        try:
            print("COMEÇOU A REQUISIÇÃO")
            prompt = contextual_prompt(full_text, chunk.content)
            # Claude alternative:
            #   await aclaude_answer(self.claude_client, self.claude_context_model, prompt)
            response = await agpt_answer(prompt)
            return response
        except Exception as e:
            self.logger.error(
                f"Context generation failed for chunk {chunk.chunk_id}: {str(e)}"
            )
            return ""

    async def create_contextualized_chunk(
        self, chunk: DocumentChunk, full_text: List[Document]
    ) -> ContextualizedChunk:
        """Build a ``ContextualizedChunk`` for *chunk* from its page in *full_text*."""
        lista_contador.append(0)
        print("contador: ", len(lista_contador))
        # chunk.page_number is 1-based; full_text is a 0-based list of pages.
        page_number = chunk.page_number - 1
        page_content = full_text[page_number].page_content
        context = await self.llm_generate_context(page_content, chunk)
        return ContextualizedChunk(
            content=chunk.content,
            page_number=chunk.page_number,
            chunk_id=chunk.chunk_id,
            start_char=chunk.start_char,
            end_char=chunk.end_char,
            context=context,
        )

    async def contextualize_all_chunks(
        self, full_text: List[Document], chunks: List[DocumentChunk]
    ) -> List[ContextualizedChunk]:
        """Add context to all chunks concurrently and return them in order."""
        # Reset the module-level request counter for this batch.
        # BUGFIX: the original did `lista_contador = []`, which bound a NEW
        # local list and shadowed the global — the counter was never reset.
        lista_contador.clear()
        async with asyncio.TaskGroup() as tg:
            tasks = [
                tg.create_task(self.create_contextualized_chunk(chunk, full_text))
                for chunk in chunks
            ]
        # TaskGroup guarantees every task is done once the `async with` exits.
        return [task.result() for task in tasks]
def get_full_text_and_all_PDFs_chunks(contexto, listaPDFs, splitterObject: Splitter):
    """Collect text chunks either from a pre-supplied context or from PDFs.

    When *contexto* is truthy it is used as the full text and split directly;
    otherwise each path in *listaPDFs* is loaded, split, and its pages read.

    Returns a ``(full_text, all_PDFs_chunks, pages)`` tuple, where *pages* is
    the list of loaded page Documents ([] when *contexto* was used).
    """
    all_PDFs_chunks = []
    full_text = ""
    # BUGFIX: `pages` was only assigned in the else-branch, so the return
    # statement raised UnboundLocalError whenever `contexto` was provided.
    pages = []
    if contexto:
        full_text = contexto
        all_PDFs_chunks = splitterObject.load_and_split_text(full_text)
    else:
        # Load and process each document.
        for pdf_path in listaPDFs:
            all_PDFs_chunks = all_PDFs_chunks + splitterObject.load_and_split_document(
                pdf_path
            )
            # Get the full text for contextualization.
            # NOTE(review): `pages` and `full_text` are overwritten on every
            # iteration, so only the LAST PDF's text/pages survive when more
            # than one PDF is passed — looks like a latent bug; behavior is
            # preserved here pending confirmation with the callers.
            loader = PyPDFLoader(pdf_path)
            pages = loader.load()
            full_text = " ".join(page.page_content for page in pages)
    return full_text, all_PDFs_chunks, pages
async def contextualize_chunk_based_on_serializer(
    serializer, contextual_retriever: ContextualRetriever, pages, all_PDFs_chunks
):
    """Select contextualized or plain chunks based on the serializer flag.

    Returns a ``(chunks, is_contextualized)`` pair: when the serializer asks
    for contextual chunks they are generated through *contextual_retriever*;
    otherwise *all_PDFs_chunks* is passed through untouched.
    """
    wants_context = serializer["should_have_contextual_chunks"]
    if not wants_context:
        return all_PDFs_chunks, False
    enriched_chunks = await contextual_retriever.contextualize_all_chunks(
        pages, all_PDFs_chunks
    )
    return enriched_chunks, True
|