Shreyas094 committed on
Commit
8bba703
·
verified ·
1 Parent(s): cae97f8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -23
app.py CHANGED
@@ -332,7 +332,7 @@ Remember to focus on financial aspects and implications in your assessment and s
332
  logger.error(f"Error assessing relevance and summarizing with LLM: {e}")
333
  return "Error: Unable to assess relevance and summarize"
334
 
335
- def scrape_full_content(url, scraper="bs4", max_chars=3000, timeout=5):
336
  try:
337
  logger.info(f"Scraping full content from: {url}")
338
 
@@ -340,28 +340,26 @@ def scrape_full_content(url, scraper="bs4", max_chars=3000, timeout=5):
340
  if url.lower().endswith('.pdf'):
341
  return scrape_pdf_content(url, max_chars, timeout)
342
 
343
- if scraper == "bs4":
344
- session = requests_retry_session()
345
- response = session.get(url, timeout=timeout)
346
- response.raise_for_status()
347
- soup = BeautifulSoup(response.content, 'html.parser')
348
-
349
- # Try to find the main content
350
- main_content = soup.find('main') or soup.find('article') or soup.find('div', class_='content')
351
-
352
- if main_content:
353
- content = main_content.get_text(strip=True, separator='\n')
354
- else:
355
- content = soup.get_text(strip=True, separator='\n')
356
- elif scraper == "trafilatura":
357
- content = scrape_with_trafilatura(url, max_chars, timeout, use_beautifulsoup=True)
358
- elif scraper == "scrapy":
359
- content = scrape_with_scrapy(url, timeout)
360
- elif scraper == "newspaper":
361
- content = scrape_with_newspaper(url)
362
- else:
363
- logger.error(f"Unknown scraper: {scraper}")
364
- return ""
365
 
366
  # Limit the content to max_chars
367
  return content[:max_chars] if content else ""
 
332
  logger.error(f"Error assessing relevance and summarizing with LLM: {e}")
333
  return "Error: Unable to assess relevance and summarize"
334
 
335
+ def scrape_full_content(url, max_chars=3000, timeout=5):
336
  try:
337
  logger.info(f"Scraping full content from: {url}")
338
 
 
340
  if url.lower().endswith('.pdf'):
341
  return scrape_pdf_content(url, max_chars, timeout)
342
 
343
+ # Use newspaper for non-PDF content
344
+ article = Article(url)
345
+ article.download()
346
+ article.parse()
347
+
348
+ # Combine title and text
349
+ content = f"Title: {article.title}\n\n"
350
+ content += article.text
351
+
352
+ # Add publish date if available
353
+ if article.publish_date:
354
+ content += f"\n\nPublish Date: {article.publish_date}"
355
+
356
+ # Add authors if available
357
+ if article.authors:
358
+ content += f"\n\nAuthors: {', '.join(article.authors)}"
359
+
360
+ # Add top image URL if available
361
+ if article.top_image:
362
+ content += f"\n\nTop Image URL: {article.top_image}"
 
 
363
 
364
  # Limit the content to max_chars
365
  return content[:max_chars] if content else ""