Shreyas094 committed
Commit bce6fcd · verified · 1 Parent(s): a38a0b8

Update app.py

Files changed (1)
  1. app.py +120 -31
app.py CHANGED
@@ -359,42 +359,96 @@ def is_content_unique(new_content, existing_contents, similarity_threshold=0.8):
     return True
 
 def assess_relevance_and_summarize(llm_client, query, document, temperature=0.2):
-    system_prompt = """You are a world class AI assistant. Your task is to assess whether the given text is relevant to the user's query and provide a brief summary if it is relevant."""
+    """
+    Enhanced function to assess document relevance with entity detection and URL analysis.
+
+    Args:
+        llm_client: The LLM client instance
+        query: User's search query
+        document: Dictionary containing document info (url, content, etc.)
+        temperature: Temperature parameter for LLM
+
+    Returns:
+        String containing relevance assessment and summary
+    """
+    # First, detect entities in the query using LLM
+    entity_detection_prompt = """Analyze the following query and identify any specific named entities (companies, people, organizations, products, etc.). Return ONLY the entities, separated by commas. If no entities are found, return 'None'.
 
-    user_prompt = f"""
     Query: {query}
 
+    Entities:"""
+
+    entity_messages = [
+        {"role": "system", "content": "You are an expert at identifying named entities in text."},
+        {"role": "user", "content": entity_detection_prompt.format(query=query)}
+    ]
+
+    try:
+        entity_response = llm_client.chat_completion(
+            messages=entity_messages,
+            max_tokens=100,
+            temperature=0.1  # Lower temperature for more consistent entity detection
+        )
+        entities = entity_response.choices[0].message.content.strip()
+
+        # Calculate URL relevance score based on entities
+        url_relevance_score = 0
+        if entities.lower() != 'none':
+            url = document['url'].lower()
+            for entity in entities.split(','):
+                entity = entity.strip().lower()
+                if entity in url:
+                    url_relevance_score += 1
+
+        # Prepare the main assessment prompt with entity and URL information
+        system_prompt = """You are a world class AI assistant specializing in document relevance assessment and summarization. Your task is to:
+        1. Consider any detected entities and URL relevance
+        2. Assess if the document content is relevant to the user's query
+        3. Provide a relevance score and summary if relevant
+
+        Use the following scoring criteria:
+        - URL contains query entities: +1 point per entity
+        - Content directly addresses the query topic: +2 points
+        - Content contains relevant but indirect information: +1 point
+        - Content is recent and up-to-date (if time-sensitive): +1 point
+        - Content provides unique insights: +1 point"""
+
+        user_prompt = f"""
+        Query: {query}
+        Detected Entities: {entities}
+        URL Relevance Score: {url_relevance_score}
+
     Document Content:
     {document['content']}
 
-    Instructions:
-    1. Assess if the document is relevant to the QUERY made by the user.
-    2. If relevant, summarize the main points in 1-2 sentences.
-    3. If not relevant, simply state "Not relevant".
-
-    Your response should be in the following format:
+        Please provide your assessment in the following format:
     Relevant: [Yes/No]
+        Relevance Score: [Score out of 5]
+        URL Priority: [High if URL contains entities, Low if not]
     Summary: [Your 1-2 sentence summary if relevant, or "Not relevant" if not]
+        Entities Mentioned: [List entities from the query that appear in the content]"""
 
-    Remember to focus on financial aspects and implications in your assessment and summary.
-    """
-
-    messages = [
-        {"role": "system", "content": system_prompt},
-        {"role": "user", "content": user_prompt}
-    ]
+        messages = [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": user_prompt}
+        ]
 
-    try:
+        # Get the final assessment
         response = llm_client.chat_completion(
             messages=messages,
-            max_tokens=150,
-            temperature=temperature,
-            top_p=0.9
+            max_tokens=250,
+            temperature=temperature
         )
+
         return response.choices[0].message.content.strip()
+
     except Exception as e:
-        logger.error(f"Error assessing relevance and summarizing with LLM: {e}")
-        return "Error: Unable to assess relevance and summarize"
+        logger.error(f"Error in enhanced relevance assessment: {e}")
+        return f"""Relevant: No
+        Relevance Score: 0
+        URL Priority: Low
+        Summary: Error during assessment - {str(e)}
+        Entities Mentioned: None"""
 
 def scrape_full_content(url, scraper="bs4", max_chars=3000, timeout=5):
     try:
@@ -601,28 +655,63 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="bs4", max_cha
     # Step 3: Assess relevance, summarize, and check for uniqueness
     relevant_documents = []
     unique_summaries = []
+
+    # Sort scraped_content based on initial URL analysis (if entities are in URL)
     for doc in scraped_content:
         assessment = assess_relevance_and_summarize(client, rephrased_query, doc, temperature=llm_temperature)
-        relevance, summary = assessment.split('\n', 1)
 
-        if relevance.strip().lower() == "relevant: yes":
-            summary_text = summary.replace("Summary: ", "").strip()
-
+        # Parse the structured assessment response
+        assessment_parts = {}
+        for line in assessment.split('\n'):
+            if ':' in line:
+                key, value = line.split(':', 1)
+                assessment_parts[key.strip()] = value.strip()
+
+        # Extract relevant information
+        is_relevant = assessment_parts.get('Relevant', 'No').lower() == 'yes'
+        relevance_score = float(assessment_parts.get('Relevance Score', '0').split('/')[0])
+        url_priority = assessment_parts.get('URL Priority', 'Low')
+        summary_text = assessment_parts.get('Summary', 'Not relevant')
+        entities_mentioned = assessment_parts.get('Entities Mentioned', 'None')
+
+        # Define relevance threshold
+        RELEVANCE_THRESHOLD = 2.5  # Documents must score above 2.5 out of 5 to be considered
+
+        if is_relevant and relevance_score >= RELEVANCE_THRESHOLD:
+            # Check for content uniqueness
             if is_content_unique(summary_text, unique_summaries):
-                relevant_documents.append({
+                # Create enhanced document record
+                doc_record = {
                     "title": doc['title'],
                     "url": doc['url'],
                     "summary": summary_text,
-                    "scraper": doc['scraper']
-                })
+                    "scraper": doc['scraper'],
+                    "relevance_score": relevance_score,
+                    "url_priority": url_priority,
+                    "entities_mentioned": entities_mentioned,
+                    "original_content": doc.get('content', '')  # Keep original content if needed
+                }
+
+                relevant_documents.append(doc_record)
                 unique_summaries.append(summary_text)
+                logger.info(f"Added relevant document: {doc['title']} (Score: {relevance_score}, Priority: {url_priority})")
             else:
                 logger.info(f"Skipping similar content: {doc['title']}")
-
+        else:
+            logger.info(f"Skipping irrelevant or low-scoring document: {doc['title']} (Score: {relevance_score})")
+
+    # Sort relevant documents by relevance score and URL priority
+    relevant_documents.sort(key=lambda x: (
+        x['url_priority'] == 'High',  # True sorts before False
+        x['relevance_score']
+    ), reverse=True)
+
     if not relevant_documents:
         logger.warning("No relevant and unique documents found.")
-        return "No relevant and unique financial news found for the given query."
-    logger.debug(f"Assessment result: {assessment}")
+        return "No relevant and unique documents found for the given query."
+
+    logger.info(f"Found {len(relevant_documents)} relevant and unique documents")
+    logger.debug(f"Top document scores: {[(doc['title'], doc['relevance_score']) for doc in relevant_documents[:3]]}")
 
     # Step 4: Rerank documents based on similarity to query
     reranked_docs = rerank_documents(rephrased_query, relevant_documents)
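The rewritten assess_relevance_and_summarize returns a structured, multi-line reply instead of the old two-line "Relevant/Summary" format. Below is a minimal standalone sketch of the parsing step this commit adds to search_and_scrape; sample_assessment is a hypothetical model reply, not captured output.

# Hypothetical reply following the format requested by the new user_prompt.
sample_assessment = """Relevant: Yes
Relevance Score: 4/5
URL Priority: High
Summary: Tesla reported record quarterly deliveries, beating analyst estimates.
Entities Mentioned: Tesla"""

# Key/value parsing, as added in Step 3 of search_and_scrape.
assessment_parts = {}
for line in sample_assessment.split('\n'):
    if ':' in line:
        key, value = line.split(':', 1)
        assessment_parts[key.strip()] = value.strip()

is_relevant = assessment_parts.get('Relevant', 'No').lower() == 'yes'
# split('/')[0] accepts both "4/5" and a bare "4"; a non-numeric
# score would raise ValueError at float().
relevance_score = float(assessment_parts.get('Relevance Score', '0').split('/')[0])

assert is_relevant and relevance_score == 4.0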
 
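One consequence of the tuple sort key added in Step 3: URL priority dominates relevance score, so with reverse=True every High-priority document outranks every Low-priority one regardless of score. A quick standalone check of that ordering:

# Sort key mirrors the commit: (url_priority == 'High', relevance_score), reverse=True.
docs = [
    {"title": "a", "url_priority": "Low",  "relevance_score": 5.0},
    {"title": "b", "url_priority": "High", "relevance_score": 3.0},
    {"title": "c", "url_priority": "High", "relevance_score": 4.5},
]
docs.sort(key=lambda x: (x['url_priority'] == 'High', x['relevance_score']), reverse=True)
print([d['title'] for d in docs])  # -> ['c', 'b', 'a']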