Update app.py
app.py
CHANGED
@@ -359,42 +359,96 @@ def is_content_unique(new_content, existing_contents, similarity_threshold=0.8):
     return True
 
 def assess_relevance_and_summarize(llm_client, query, document, temperature=0.2):
-    …
+    """
+    Enhanced function to assess document relevance with entity detection and URL analysis.
+
+    Args:
+        llm_client: The LLM client instance
+        query: User's search query
+        document: Dictionary containing document info (url, content, etc.)
+        temperature: Temperature parameter for LLM
+
+    Returns:
+        String containing relevance assessment and summary
+    """
+    # First, detect entities in the query using LLM
+    entity_detection_prompt = """Analyze the following query and identify any specific named entities (companies, people, organizations, products, etc.). Return ONLY the entities, separated by commas. If no entities are found, return 'None'.
 
-    user_prompt = f"""
 Query: {query}
 
+Entities:"""
+
+    entity_messages = [
+        {"role": "system", "content": "You are an expert at identifying named entities in text."},
+        {"role": "user", "content": entity_detection_prompt.format(query=query)}
+    ]
+
+    try:
+        entity_response = llm_client.chat_completion(
+            messages=entity_messages,
+            max_tokens=100,
+            temperature=0.1  # Lower temperature for more consistent entity detection
+        )
+        entities = entity_response.choices[0].message.content.strip()
+
+        # Calculate URL relevance score based on entities
+        url_relevance_score = 0
+        if entities.lower() != 'none':
+            url = document['url'].lower()
+            for entity in entities.split(','):
+                entity = entity.strip().lower()
+                if entity in url:
+                    url_relevance_score += 1
+
+        # Prepare the main assessment prompt with entity and URL information
+        system_prompt = """You are a world class AI assistant specializing in document relevance assessment and summarization. Your task is to:
+1. Consider any detected entities and URL relevance
+2. Assess if the document content is relevant to the user's query
+3. Provide a relevance score and summary if relevant
+
+Use the following scoring criteria:
+- URL contains query entities: +1 point per entity
+- Content directly addresses the query topic: +2 points
+- Content contains relevant but indirect information: +1 point
+- Content is recent and up-to-date (if time-sensitive): +1 point
+- Content provides unique insights: +1 point"""
+
+        user_prompt = f"""
+Query: {query}
+Detected Entities: {entities}
+URL Relevance Score: {url_relevance_score}
+
 Document Content:
 {document['content']}
 
-…
-1. Assess if the document is relevant to the QUERY made by the user.
-2. If relevant, summarize the main points in 1-2 sentences.
-3. If not relevant, simply state "Not relevant".
-
-Your response should be in the following format:
+Please provide your assessment in the following format:
 Relevant: [Yes/No]
+Relevance Score: [Score out of 5]
+URL Priority: [High if URL contains entities, Low if not]
 Summary: [Your 1-2 sentence summary if relevant, or "Not relevant" if not]
+Entities Mentioned: [List entities from the query that appear in the content]"""
 
-…
-    """
-
-    messages = [
-        {"role": "system", "content": system_prompt},
-        {"role": "user", "content": user_prompt}
-    ]
+        messages = [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": user_prompt}
+        ]
 
-    try:
+        # Get the final assessment
         response = llm_client.chat_completion(
             messages=messages,
-            max_tokens=…
-            temperature=temperature,
-            top_p=0.9
+            max_tokens=250,
+            temperature=temperature
         )
+
         return response.choices[0].message.content.strip()
+
     except Exception as e:
-        logger.error(f"Error…
-        return "…
+        logger.error(f"Error in enhanced relevance assessment: {e}")
+        return f"""Relevant: No
+Relevance Score: 0
+URL Priority: Low
+Summary: Error during assessment - {str(e)}
+Entities Mentioned: None"""
 
 def scrape_full_content(url, scraper="bs4", max_chars=3000, timeout=5):
     try:
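Note: the entity-in-URL scoring added above can be exercised without an LLM call. A minimal sketch, assuming the entity string looks like the comma-separated output the entity-detection prompt asks for (score_url is a hypothetical helper, not part of this commit):

# Illustrative only: replicates the url_relevance_score loop from the hunk above.
# `entities` stands in for the LLM's comma-separated output ('None' when empty).
def score_url(entities: str, url: str) -> int:
    score = 0
    if entities.lower() != 'none':
        url = url.lower()
        for entity in entities.split(','):
            entity = entity.strip().lower()
            if entity in url:
                score += 1
    return score

print(score_url("OpenAI, Anthropic", "https://openai.com/blog"))  # 1: only 'openai' appears in the URL
print(score_url("None", "https://example.com"))                   # 0: no entities detected

Since the match is a plain substring test, multi-word entities such as "New York Times" will rarely appear verbatim in a URL and score 0 even on relevant pages.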
@@ -601,28 +655,63 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="bs4", max_cha
     # Step 3: Assess relevance, summarize, and check for uniqueness
     relevant_documents = []
     unique_summaries = []
+
+    # Sort scraped_content based on initial URL analysis (if entities are in URL)
     for doc in scraped_content:
         assessment = assess_relevance_and_summarize(client, rephrased_query, doc, temperature=llm_temperature)
-        relevance, summary = assessment.split('\n', 1)
 
-        …
+        # Parse the structured assessment response
+        assessment_parts = {}
+        for line in assessment.split('\n'):
+            if ':' in line:
+                key, value = line.split(':', 1)
+                assessment_parts[key.strip()] = value.strip()
+
+        # Extract relevant information
+        is_relevant = assessment_parts.get('Relevant', 'No').lower() == 'yes'
+        relevance_score = float(assessment_parts.get('Relevance Score', '0').split('/')[0])
+        url_priority = assessment_parts.get('URL Priority', 'Low')
+        summary_text = assessment_parts.get('Summary', 'Not relevant')
+        entities_mentioned = assessment_parts.get('Entities Mentioned', 'None')
+
+        # Define relevance threshold
+        RELEVANCE_THRESHOLD = 2.5  # Documents must score above 2.5 out of 5 to be considered
+
+        if is_relevant and relevance_score >= RELEVANCE_THRESHOLD:
+            # Check for content uniqueness
             if is_content_unique(summary_text, unique_summaries):
-                relevant_documents.append({
+                # Create enhanced document record
+                doc_record = {
                     "title": doc['title'],
                     "url": doc['url'],
                     "summary": summary_text,
-                    "scraper": doc['scraper']
-                })
+                    "scraper": doc['scraper'],
+                    "relevance_score": relevance_score,
+                    "url_priority": url_priority,
+                    "entities_mentioned": entities_mentioned,
+                    "original_content": doc.get('content', '')  # Keep original content if needed
+                }
+
+                relevant_documents.append(doc_record)
                 unique_summaries.append(summary_text)
+                logger.info(f"Added relevant document: {doc['title']} (Score: {relevance_score}, Priority: {url_priority})")
             else:
                 logger.info(f"Skipping similar content: {doc['title']}")
-
+        else:
+            logger.info(f"Skipping irrelevant or low-scoring document: {doc['title']} (Score: {relevance_score})")
+
+    # Sort relevant documents by relevance score and URL priority
+    relevant_documents.sort(key=lambda x: (
+        x['url_priority'] == 'High',  # True sorts before False
+        x['relevance_score']
+    ), reverse=True)
+
     if not relevant_documents:
         logger.warning("No relevant and unique documents found.")
-        return "No relevant and unique …
-
+        return "No relevant and unique documents found for the given query."
+
+    logger.info(f"Found {len(relevant_documents)} relevant and unique documents")
+    logger.debug(f"Top document scores: {[(doc['title'], doc['relevance_score']) for doc in relevant_documents[:3]]}")
 
     # Step 4: Rerank documents based on similarity to query
     reranked_docs = rerank_documents(rephrased_query, relevant_documents)
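Note: the parsing loop above depends on the model following the response template exactly, so it helps to see it run on a well-formed reply. A minimal sketch using the same split logic as the hunk (the sample reply is invented):

# Illustrative only: runs a fabricated model reply through the parsing loop from the diff.
sample = """Relevant: Yes
Relevance Score: 4/5
URL Priority: High
Summary: The page covers the product launch in detail.
Entities Mentioned: Acme Corp"""

assessment_parts = {}
for line in sample.split('\n'):
    if ':' in line:
        key, value = line.split(':', 1)
        assessment_parts[key.strip()] = value.strip()

is_relevant = assessment_parts.get('Relevant', 'No').lower() == 'yes'
relevance_score = float(assessment_parts.get('Relevance Score', '0').split('/')[0])
print(is_relevant, relevance_score)  # True 4.0

The split('/')[0] handles scores written as "4/5", but the float(...) call is unguarded: a reply like "Relevance Score: four out of five" would raise ValueError inside the loop. The except branch in assess_relevance_and_summarize at least always returns a parseable "Relevance Score: 0".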