Update app.py
app.py CHANGED
@@ -1,6 +1,5 @@
 import requests
 import gradio as gr
-from bs4 import BeautifulSoup
 import logging
 from urllib.parse import urlparse
 from requests.adapters import HTTPAdapter
@@ -38,10 +37,11 @@ from collections import Counter
 import numpy as np
 from typing import List, Dict, Tuple
 import datetime
-
+from abc import ABC, abstractmethod
+from typing import List, Dict, Any
 
 # Automatically get the current year
-
+CURRENT_YEAR = datetime.datetime.now().year
 
 # Load environment variables from a .env file
 load_dotenv()
@@ -51,8 +51,42 @@ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(
 logger = logging.getLogger(__name__)
 
 # SearXNG instance details
-SEARXNG_URL =
-SEARXNG_KEY =
+SEARXNG_URL = os.getenv("SEARXNG_URL")
+SEARXNG_KEY = os.getenv("SEARXNG_KEY")
+
+
+logger.info(f"SearXNG URL: {SEARXNG_URL}")
+logger.info(f"SearXNG Key: {SEARXNG_KEY}")
+
+
+# ... other environment variables ...
+CUSTOM_LLM = os.getenv("CUSTOM_LLM")
+CUSTOM_LLM_DEFAULT_MODEL = os.getenv("CUSTOM_LLM_DEFAULT_MODEL")
+
+logger.info(f"CUSTOM_LLM: {CUSTOM_LLM}")
+logger.info(f"CUSTOM_LLM_DEFAULT_MODEL: {CUSTOM_LLM_DEFAULT_MODEL}")
+
+# Define the fetch_custom_models function here
+def fetch_custom_models():
+    if not CUSTOM_LLM:
+        return []
+    try:
+        response = requests.get(f"{CUSTOM_LLM}/v1/models")
+        response.raise_for_status()
+        models = response.json().get("data", [])
+        return [model["id"] for model in models]
+    except Exception as e:
+        logger.error(f"Error fetching custom models: {e}")
+        return []
+
+# Fetch custom models and determine the default model
+custom_models = fetch_custom_models()
+all_models = ["huggingface", "groq", "mistral"] + custom_models
+
+# Determine the default model
+default_model = CUSTOM_LLM_DEFAULT_MODEL if CUSTOM_LLM_DEFAULT_MODEL in all_models else "groq"
+
+logger.info(f"Default model selected: {default_model}")
 
 # Use the environment variable
 HF_TOKEN = os.getenv("HF_TOKEN")
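Reviewer note: `fetch_custom_models()` assumes the server behind `CUSTOM_LLM` exposes an OpenAI-compatible model listing at `/v1/models`. A minimal sketch of the payload shape the parsing above expects; the model ids are invented placeholders:

    # Hypothetical response body from GET {CUSTOM_LLM}/v1/models on an
    # OpenAI-compatible server; the ids are placeholders, not real values.
    payload = {
        "object": "list",
        "data": [
            {"id": "local-llama", "object": "model"},
            {"id": "local-qwen", "object": "model"},
        ],
    }

    # fetch_custom_models() reduces this payload to the list of ids:
    model_ids = [model["id"] for model in payload.get("data", [])]
    assert model_ids == ["local-llama", "local-qwen"]

Two things worth flagging here: the request runs at import time with no `timeout=`, so an unreachable `CUSTOM_LLM` host stalls startup, and `logger.info(f"SearXNG Key: {SEARXNG_KEY}")` writes what looks like a secret into the logs.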
@@ -75,7 +109,89 @@ mistral_client = Mistral(api_key=MISTRAL_API_KEY)
 similarity_model = SentenceTransformer('all-MiniLM-L6-v2')
 
 
-def determine_query_type(query: str, chat_history: str, llm_client) -> str:
+
+# Step 1: Create a base class for AI models
+class AIModel(ABC):
+    @abstractmethod
+    def generate_response(self, messages: List[Dict[str, str]], max_tokens: int, temperature: float) -> str:
+        pass
+
+# Step 2: Implement specific classes for each AI model
+class HuggingFaceModel(AIModel):
+    def __init__(self, client):
+        self.client = client
+
+    def generate_response(self, messages: List[Dict[str, str]], max_tokens: int, temperature: float) -> str:
+        response = self.client.chat_completion(
+            messages=messages,
+            max_tokens=max_tokens,
+            temperature=temperature
+        )
+        return response.choices[0].message.content.strip()
+
+class GroqModel(AIModel):
+    def __init__(self, client):
+        self.client = client
+
+    def generate_response(self, messages: List[Dict[str, str]], max_tokens: int, temperature: float) -> str:
+        response = self.client.chat.completions.create(
+            messages=messages,
+            model="llama-3.1-70b-versatile",
+            max_tokens=max_tokens,
+            temperature=temperature
+        )
+        return response.choices[0].message.content.strip()
+
+class MistralModel(AIModel):
+    def __init__(self, client):
+        self.client = client
+
+    def generate_response(self, messages: List[Dict[str, str]], max_tokens: int, temperature: float) -> str:
+        response = self.client.chat.complete(
+            model="open-mistral-nemo",
+            messages=messages,
+            max_tokens=max_tokens,
+            temperature=temperature
+        )
+        return response.choices[0].message.content.strip()
+
+# Step 3: Use a factory pattern to create model instances
+class CustomModel(AIModel):
+    def __init__(self, model_name):
+        self.model_name = model_name
+
+    def generate_response(self, messages: List[Dict[str, str]], max_tokens: int, temperature: float) -> str:
+        try:
+            response = requests.post(
+                f"{CUSTOM_LLM}/v1/chat/completions",
+                json={
+                    "model": self.model_name,
+                    "messages": messages,
+                    "max_tokens": max_tokens,
+                    "temperature": temperature
+                }
+            )
+            response.raise_for_status()
+            return response.json()["choices"][0]["message"]["content"].strip()
+        except Exception as e:
+            logger.error(f"Error generating response from custom model: {e}")
+            return "Error: Unable to generate response from custom model."
+
+class AIModelFactory:
+    @staticmethod
+    def create_model(model_name: str, client: Any = None) -> AIModel:
+        if model_name == "huggingface":
+            return HuggingFaceModel(client)
+        elif model_name == "groq":
+            return GroqModel(client)
+        elif model_name == "mistral":
+            return MistralModel(client)
+        elif CUSTOM_LLM and model_name in fetch_custom_models():
+            return CustomModel(model_name)
+        else:
+            raise ValueError(f"Unsupported model: {model_name}")
+
+def determine_query_type(query: str, chat_history: str, ai_model: AIModel) -> str:
     system_prompt = """You are Sentinel, an intelligent AI agent tasked with determining whether a user query requires a web search or can be answered using your existing knowledge base. Your knowledge cutoff date is 2023, and the current year is 2024. Your task is to analyze the query and decide on the appropriate action.
 
 Instructions for Sentinel:
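Reviewer note: the `AIModel` base class plus `AIModelFactory` gives every provider the same `generate_response(messages, max_tokens, temperature)` surface, so call sites no longer branch per provider. A minimal usage sketch, assuming the matching API key is configured; the prompt is illustrative:

    # Illustrative: pick a backend by name, then call it polymorphically.
    # get_client_for_model is the helper added further down in this diff.
    ai_model = AIModelFactory.create_model("groq", get_client_for_model("groq"))
    reply = ai_model.generate_response(
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Reply with one word."},
        ],
        max_tokens=10,
        temperature=0.2,
    )
    print(reply)

Minor nits: `CustomModel` sits under the "# Step 3: Use a factory pattern" comment but is really a fourth backend, the factory re-calls `fetch_custom_models()` on every custom lookup instead of reusing the `custom_models` list computed at startup, and the Sentinel prompt still hardcodes "the current year is 2024" even though the commit introduces a computed `CURRENT_YEAR`.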
@@ -120,18 +236,18 @@ def determine_query_type(query: str, chat_history: str, llm_client) -> str:
     ]
 
     try:
-        response =
+        response = ai_model.generate_response(
             messages=messages,
             max_tokens=10,
             temperature=0.2
         )
-        decision = response.
+        decision = response.strip().lower()
         return "web_search" if decision == "web_search" else "knowledge_base"
     except Exception as e:
         logger.error(f"Error determining query type: {e}")
         return "web_search" # Default to web search if there's an error
 
-def generate_ai_response(query: str, chat_history: str,
+def generate_ai_response(query: str, chat_history: str, ai_model: AIModel, temperature: float) -> str:
     system_prompt = """You are a helpful AI assistant. Provide a concise and informative response to the user's query based on your existing knowledge. Do not make up information or claim to have real-time data."""
 
     user_prompt = f"""
@@ -149,29 +265,12 @@ def generate_ai_response(query: str, chat_history: str, llm_client, model: str)
     ]
 
     try:
-        if model == "groq":
-            response = groq_client.chat.completions.create(
-                messages=messages,
-                model="llama-3.1-70b-versatile",
-                max_tokens=500,
-                temperature=0.7
-            )
-            return response.choices[0].message.content.strip()
-        elif model == "mistral":
-            response = mistral_client.chat.complete(
-                model="open-mistral-nemo",
-                messages=messages,
-                max_tokens=500,
-                temperature=0.7
-            )
-            return response.choices[0].message.content.strip()
-        else: # huggingface
-            response = llm_client.chat_completion(
-                messages=messages,
-                max_tokens=500,
-                temperature=0.7
-            )
-            return response.choices[0].message.content.strip()
+        response = ai_model.generate_response(
+            messages=messages,
+            max_tokens=500,
+            temperature=temperature
+        )
+        return response
     except Exception as e:
         logger.error(f"Error generating AI response: {e}")
         return "I apologize, but I'm having trouble generating a response at the moment. Please try again later."
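Reviewer note: this hunk is the payoff of the refactor: the per-provider `if/elif/else` collapses into one polymorphic call, and the hardcoded `temperature=0.7` gives way to the caller-supplied value. Adding a provider now means one subclass plus one factory branch rather than another `elif` at every call site. A hypothetical template; `AcmeClient` and its `.complete()` API are invented for illustration:

    # Hypothetical new backend following the same template; the client
    # class and its .complete() call stand in for a real provider SDK.
    class AcmeModel(AIModel):
        def __init__(self, client):
            self.client = client

        def generate_response(self, messages, max_tokens, temperature):
            response = self.client.complete(
                messages=messages, max_tokens=max_tokens, temperature=temperature
            )
            # Adapt the provider's response shape to a plain string.
            return response.text.strip()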
@@ -282,8 +381,8 @@ def rephrase_query(chat_history, query, temperature=0.2):
 - First, check if the query contains words indicating current information (e.g., "today", "now", "current", "latest"):
   - If present, do NOT add any date operators to the query
 - Otherwise, if the query mentions a specific time period (e.g., a quarter, year, or date range):
-  - Add appropriate "after:" operators to the end of the rephrased query.
-  - Use the format "after:YYYY" for date ranges.
+  - Add appropriate "after: " operators to the end of the rephrased query.
+  - Use the format "after: YYYY" for date ranges.
 - If no specific time period is mentioned and no current-time indicators are present:
   - Append "after: {CURRENT_YEAR}" to the end of the rephrased query.
 - Do not use quotes or the "+" operator when adding dates.
@@ -300,16 +399,20 @@ def rephrase_query(chat_history, query, temperature=0.2):
 
 **Scenario 2: New Topic with Specific Quarter**
 - **User Query**: "How did Bank of America perform during Q2 2024?"
-- **Rephrased Query**: "How did \"Bank of America\" perform during Q2 2024 after:2024"
+- **Rephrased Query**: "How did \"Bank of America\" perform during Q2 2024 after: 2024"
 
 **Scenario 3: Continuation with Date Range**
 - **Previous Query**: "What were Apple's sales figures for 2023?"
 - **User Query**: "How about for the first half of 2024?"
-- **Rephrased Query**: "How about \"Apple\"'s sales figures for the first half of 2024 after:2024"
+- **Rephrased Query**: "How about \"Apple\"'s sales figures for the first half of 2024 after: 2024"
 
 **Scenario 4: Current Status Query**
 - **User Query**: "What is the current market share of Toyota and Honda in the US?"
 - **Rephrased Query**: "What is the current market share of \"Toyota\" and \"Honda\" in the \"US\""
+
+**Scenario 5: Current Status Query**
+- **User Query**: "Bank of America Q2 2024 earnings?"
+- **Rephrased Query**: "\"Bank of America\" Q2 2024 earnings after: 2024""
 """
 
     # Create the user prompt with the chat history and current query
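Reviewer note: the new Scenario 5 example ends with a doubled quote (`after: 2024""`). Python tolerates it inside the triple-quoted prompt, but the stray quote leaks into the text the model sees; the intended line is presumably:

    - **Rephrased Query**: "\"Bank of America\" Q2 2024 earnings after: 2024"

Its title also repeats Scenario 4's "Current Status Query", although it actually demonstrates a specific-quarter query.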
@@ -527,7 +630,7 @@ def is_content_unique(new_content, existing_contents, similarity_threshold=0.8):
     return True
 
 def assess_relevance_and_summarize(llm_client, query, document, temperature=0.2):
-    system_prompt = """You are a world-class AI assistant specializing in
+    system_prompt = """You are a world-class AI assistant specializing in news analysis. Your task is to assess the relevance of a given document to a user's query and provide a detailed summary if it's relevant."""
 
     user_prompt = f"""
 Query: {query}
@@ -542,7 +645,7 @@ Instructions:
 - Key facts and figures
 - Dates of events or announcements
 - Names of important entities mentioned
-- Any
+- Any metrics or changes reported
 - The potential impact or significance of the news
 3. If not relevant, simply state "Not relevant".
 
@@ -550,7 +653,7 @@ Your response should be in the following format:
 Relevant: [Yes/No]
 Summary: [Your detailed summary if relevant, or "Not relevant" if not]
 
-Remember to focus on
+Remember to focus on key aspects and implications in your assessment and summary. Aim to make the summary distinctive, highlighting what makes this particular news item unique compared to similar news.
 """
 
     messages = [
@@ -596,7 +699,7 @@ def scrape_full_content(url, max_chars=3000, timeout=5, use_pydf2=True):
         return ""
 
 def llm_summarize(json_input, model, temperature=0.2):
-    system_prompt = """You are Sentinel, a world-class
+    system_prompt = """You are Sentinel, a world-class AI model who is expert at searching the web and answering user's queries. You are also an expert at summarizing web pages or documents and searching for content in them."""
     user_prompt = f"""
 Please provide a comprehensive summary based on the following JSON input:
 {json_input}
@@ -654,8 +757,23 @@ Instructions:
         logger.error(f"Error in LLM summarization: {e}")
         return "Error: Unable to generate a summary. Please try again."
 
-def search_and_scrape(
-
+def search_and_scrape(
+    query: str,
+    chat_history: str,
+    ai_model: AIModel,
+    num_results: int = 10,
+    max_chars: int = 1500,
+    time_range: str = "",
+    language: str = "en",
+    category: str = "general",
+    engines: List[str] = [],
+    safesearch: int = 2,
+    method: str = "GET",
+    llm_temperature: float = 0.2,
+    timeout: int = 5,
+    model: str = "huggingface",
+    use_pydf2: bool = True
+):
     try:
         # Step 1: Rephrase the Query
         rephrased_query = rephrase_query(chat_history, query, temperature=llm_temperature)
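Reviewer note: two details in the new `search_and_scrape` signature. It keeps the old-style `model: str = "huggingface"` alongside the new `ai_model: AIModel`, which looks redundant once callers pass `ai_model`, and `engines: List[str] = []` is a mutable default argument, evaluated once per process (the usual fix is `None` plus an in-function default). An illustrative call; every value is an example only:

    # Example invocation of the new signature (values are illustrative).
    result = search_and_scrape(
        query="Bank of America Q2 2024 earnings",
        chat_history="",
        ai_model=AIModelFactory.create_model("mistral", get_client_for_model("mistral")),
        num_results=5,
        time_range="month",
        category="news",
        engines=["google", "duckduckgo"],
    )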
@@ -801,16 +919,16 @@ def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_r
 
         if not relevant_documents:
             logger.warning("No relevant and unique documents found.")
-            return "No relevant and unique
+            return "No relevant and unique news found for the given query."
 
         # Step 5: Rerank documents based on similarity to query and prioritize entity domain
         reranked_docs = rerank_documents_with_priority(rephrased_query, relevant_documents, entity_domain, similarity_threshold=0.95, max_results=num_results)
 
         if not reranked_docs:
             logger.warning("No documents remained after reranking.")
-            return "No relevant
+            return "No relevant news found after filtering and ranking."
 
-        logger.info(f"Reranked and filtered to top {len(reranked_docs)} unique,
+        logger.info(f"Reranked and filtered to top {len(reranked_docs)} unique, related documents.")
 
         # Step 5: Scrape full content for top documents (up to num_results)
         for doc in reranked_docs[:num_results]:
@@ -839,19 +957,39 @@ def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_r
         logger.error(f"Unexpected error in search_and_scrape: {e}")
         return f"An unexpected error occurred during the search and scrape process: {e}"
 
-
+# Helper function to get the appropriate client for each model
+def get_client_for_model(model: str) -> Any:
+    if model == "huggingface":
+        return InferenceClient("mistralai/Mistral-Small-Instruct-2409", token=HF_TOKEN)
+    elif model == "groq":
+        return Groq(api_key=GROQ_API_KEY)
+    elif model == "mistral":
+        return Mistral(api_key=MISTRAL_API_KEY)
+    elif CUSTOM_LLM and (model in fetch_custom_models() or model == CUSTOM_LLM_DEFAULT_MODEL):
+        return None # CustomModel doesn't need a client
+    else:
+        raise ValueError(f"Unsupported model: {model}")
+
+def chat_function(message: str, history: List[Tuple[str, str]], only_web_search: bool, num_results: int, max_chars: int, time_range: str, language: str, category: str, engines: List[str], safesearch: int, method: str, llm_temperature: float, model: str, use_pydf2: bool):
     chat_history = "\n".join([f"{role}: {msg}" for role, msg in history])
 
-
+    # Create the appropriate AI model
+    ai_model = AIModelFactory.create_model(model, get_client_for_model(model))
+
+    if only_web_search:
+        query_type = "web_search"
+    else:
+        query_type = determine_query_type(message, chat_history, ai_model)
 
     if query_type == "knowledge_base":
-        response = generate_ai_response(message, chat_history,
+        response = generate_ai_response(message, chat_history, ai_model, llm_temperature)
     else: # web_search
         gr.Info("Initiating Web Search")
         yield "Request you to sit back and relax until I scrape the web for up-to-date information"
         response = search_and_scrape(
             query=message,
             chat_history=chat_history,
+            ai_model=ai_model,
             num_results=num_results,
             max_chars=max_chars,
             time_range=time_range,
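Reviewer note: `chat_function` now builds the backend once per message through the factory, and the new `only_web_search` flag short-circuits the Sentinel routing step. Condensed, the added routing is equivalent to (a restatement of the hunk above, not new behavior):

    # Condensed restatement of the routing added in this hunk.
    ai_model = AIModelFactory.create_model(model, get_client_for_model(model))
    query_type = "web_search" if only_web_search else determine_query_type(
        message, chat_history, ai_model
    )

`get_client_for_model` deliberately returns `None` for custom models, since `CustomModel` posts directly to `{CUSTOM_LLM}/v1/chat/completions` rather than going through an SDK client.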
@@ -870,15 +1008,16 @@ def chat_function(message: str, history: List[Tuple[str, str]], num_results: int
 
 iface = gr.ChatInterface(
     chat_function,
-    title="Web Scraper for
+    title="Web Scraper for News with Sentinel AI",
     description="Ask Sentinel any question. It will search the web for recent information or use its knowledge base as appropriate.",
     theme=gr.Theme.from_hub("allenai/gradio-theme"),
     additional_inputs=[
-        gr.
+        gr.Checkbox(label="Only do web search", value=True), # Add this line
+        gr.Slider(5, 20, value=3, step=1, label="Number of initial results"),
         gr.Slider(500, 10000, value=1500, step=100, label="Max characters to retrieve"),
-        gr.Dropdown(["", "day", "week", "month", "year"], value="", label="Time Range"),
-        gr.Dropdown(["", "all", "en", "fr", "de", "es", "it", "nl", "pt", "pl", "ru", "zh"], value="", label="Language"),
-        gr.Dropdown(["", "general", "news", "images", "videos", "music", "files", "it", "science", "social media"], value="", label="Category"),
+        gr.Dropdown(["", "day", "week", "month", "year"], value="week", label="Time Range"),
+        gr.Dropdown(["", "all", "en", "fr", "de", "es", "it", "nl", "pt", "pl", "ru", "zh"], value="en", label="Language"),
+        gr.Dropdown(["", "general", "news", "images", "videos", "music", "files", "it", "science", "social media"], value="general", label="Category"),
         gr.Dropdown(
             ["google", "bing", "duckduckgo", "baidu", "yahoo", "qwant", "startpage"],
             multiselect=True,
@@ -886,10 +1025,10 @@ iface = gr.ChatInterface(
             label="Engines"
         ),
         gr.Slider(0, 2, value=2, step=1, label="Safe Search Level"),
-        gr.Radio(["GET", "POST"], value="
+        gr.Radio(["GET", "POST"], value="GET", label="HTTP Method"),
         gr.Slider(0, 1, value=0.2, step=0.1, label="LLM Temperature"),
-        gr.Dropdown(
-        gr.Checkbox(label="Use PyPDF2 for PDF scraping", value=
+        gr.Dropdown(all_models, value=default_model, label="LLM Model"),
+        gr.Checkbox(label="Use PyPDF2 for PDF scraping", value=True),
     ],
     additional_inputs_accordion=gr.Accordion("⚙️ Advanced Parameters", open=True),
     retry_btn="Retry",
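Reviewer note: Gradio passes `additional_inputs` to `chat_function` positionally after `message` and `history`, so the new "Only do web search" checkbox has to stay first to line up with the new leading `only_web_search` parameter, and it does. One apparent bug: the results slider is declared as `gr.Slider(5, 20, value=3, ...)`, a default below the slider's own minimum; if 5 is the intended floor, the fix would be:

    gr.Slider(5, 20, value=5, step=1, label="Number of initial results"),

The final hunk below also appends twenty-six blank lines after `iface.launch(...)`, which could be trimmed in a follow-up.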
@@ -904,5 +1043,31 @@
 )
 
 if __name__ == "__main__":
-    logger.info("Starting the SearXNG Scraper for
-    iface.launch(share=
+    logger.info("Starting the SearXNG Scraper for News using ChatInterface with Advanced Parameters")
+    iface.launch(server_name="0.0.0.0", server_port=7860, share=False)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+