Update app.py
Browse files
app.py
CHANGED
@@ -332,7 +332,7 @@ Remember to focus on financial aspects and implications in your assessment and s
|
|
332 |
logger.error(f"Error assessing relevance and summarizing with LLM: {e}")
|
333 |
return "Error: Unable to assess relevance and summarize"
|
334 |
|
335 |
-
def scrape_full_content(url, scraper="bs4", max_chars=3000, timeout=5):
|
336 |
try:
|
337 |
logger.info(f"Scraping full content from: {url}")
|
338 |
|
@@ -340,28 +340,26 @@ def scrape_full_content(url, scraper="bs4", max_chars=3000, timeout=5):
|
|
340 |
if url.lower().endswith('.pdf'):
|
341 |
return scrape_pdf_content(url, max_chars, timeout)
|
342 |
|
343 |
-
|
344 |
-
|
345 |
-
|
346 |
-
|
347 |
-
|
348 |
-
|
349 |
-
|
350 |
-
|
351 |
-
|
352 |
-
|
353 |
-
|
354 |
-
|
355 |
-
|
356 |
-
|
357 |
-
|
358 |
-
|
359 |
-
|
360 |
-
|
361 |
-
|
362 |
-
|
363 |
-
logger.error(f"Unknown scraper: {scraper}")
|
364 |
-
return ""
|
365 |
|
366 |
# Limit the content to max_chars
|
367 |
return content[:max_chars] if content else ""
|
|
|
332 |
logger.error(f"Error assessing relevance and summarizing with LLM: {e}")
|
333 |
return "Error: Unable to assess relevance and summarize"
|
334 |
|
335 |
+
def scrape_full_content(url, max_chars=3000, timeout=5):
|
336 |
try:
|
337 |
logger.info(f"Scraping full content from: {url}")
|
338 |
|
|
|
340 |
if url.lower().endswith('.pdf'):
|
341 |
return scrape_pdf_content(url, max_chars, timeout)
|
342 |
|
343 |
+
# Use newspaper for non-PDF content
|
344 |
+
article = Article(url)
|
345 |
+
article.download()
|
346 |
+
article.parse()
|
347 |
+
|
348 |
+
# Combine title and text
|
349 |
+
content = f"Title: {article.title}\n\n"
|
350 |
+
content += article.text
|
351 |
+
|
352 |
+
# Add publish date if available
|
353 |
+
if article.publish_date:
|
354 |
+
content += f"\n\nPublish Date: {article.publish_date}"
|
355 |
+
|
356 |
+
# Add authors if available
|
357 |
+
if article.authors:
|
358 |
+
content += f"\n\nAuthors: {', '.join(article.authors)}"
|
359 |
+
|
360 |
+
# Add top image URL if available
|
361 |
+
if article.top_image:
|
362 |
+
content += f"\n\nTop Image URL: {article.top_image}"
|
|
|
|
|
363 |
|
364 |
# Limit the content to max_chars
|
365 |
return content[:max_chars] if content else ""
|