Shreyas094 committed on
Commit
8bba703
·
verified ·
1 Parent(s): cae97f8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -23
app.py CHANGED
@@ -332,7 +332,7 @@ Remember to focus on financial aspects and implications in your assessment and s
332
  logger.error(f"Error assessing relevance and summarizing with LLM: {e}")
333
  return "Error: Unable to assess relevance and summarize"
334
 
335
- def scrape_full_content(url, scraper="bs4", max_chars=3000, timeout=5):
336
  try:
337
  logger.info(f"Scraping full content from: {url}")
338
 
@@ -340,28 +340,26 @@ def scrape_full_content(url, scraper="bs4", max_chars=3000, timeout=5):
340
  if url.lower().endswith('.pdf'):
341
  return scrape_pdf_content(url, max_chars, timeout)
342
 
343
- if scraper == "bs4":
344
- session = requests_retry_session()
345
- response = session.get(url, timeout=timeout)
346
- response.raise_for_status()
347
- soup = BeautifulSoup(response.content, 'html.parser')
348
-
349
- # Try to find the main content
350
- main_content = soup.find('main') or soup.find('article') or soup.find('div', class_='content')
351
-
352
- if main_content:
353
- content = main_content.get_text(strip=True, separator='\n')
354
- else:
355
- content = soup.get_text(strip=True, separator='\n')
356
- elif scraper == "trafilatura":
357
- content = scrape_with_trafilatura(url, max_chars, timeout, use_beautifulsoup=True)
358
- elif scraper == "scrapy":
359
- content = scrape_with_scrapy(url, timeout)
360
- elif scraper == "newspaper":
361
- content = scrape_with_newspaper(url)
362
- else:
363
- logger.error(f"Unknown scraper: {scraper}")
364
- return ""
365
 
366
  # Limit the content to max_chars
367
  return content[:max_chars] if content else ""
 
332
  logger.error(f"Error assessing relevance and summarizing with LLM: {e}")
333
  return "Error: Unable to assess relevance and summarize"
334
 
335
+ def scrape_full_content(url, max_chars=3000, timeout=5):
336
  try:
337
  logger.info(f"Scraping full content from: {url}")
338
 
 
340
  if url.lower().endswith('.pdf'):
341
  return scrape_pdf_content(url, max_chars, timeout)
342
 
343
+ # Use newspaper for non-PDF content
344
+ article = Article(url)
345
+ article.download()
346
+ article.parse()
347
+
348
+ # Combine title and text
349
+ content = f"Title: {article.title}\n\n"
350
+ content += article.text
351
+
352
+ # Add publish date if available
353
+ if article.publish_date:
354
+ content += f"\n\nPublish Date: {article.publish_date}"
355
+
356
+ # Add authors if available
357
+ if article.authors:
358
+ content += f"\n\nAuthors: {', '.join(article.authors)}"
359
+
360
+ # Add top image URL if available
361
+ if article.top_image:
362
+ content += f"\n\nTop Image URL: {article.top_image}"
 
 
363
 
364
  # Limit the content to max_chars
365
  return content[:max_chars] if content else ""