Rauhan committed on
Commit
4a2e5ad
·
1 Parent(s): d7b4497

DEBUG: updating getLinks

Browse files
Files changed (1) hide show
  1. functions.py +4 -3
functions.py CHANGED
@@ -255,7 +255,7 @@ def listTables(username: str):
255
 
256
  def getLinks(url: str, timeout=30):
257
  start = time.time()
258
-
259
  def getLinksFromPage(url: str) -> list:
260
  response = requests.get(url)
261
  soup = BeautifulSoup(response.content, "lxml")
@@ -265,11 +265,12 @@ def getLinks(url: str, timeout=30):
265
  if "href" in anchor.attrs:
266
  if urlparse(anchor.attrs["href"]).netloc == urlparse(url).netloc:
267
  links.append(anchor.attrs["href"])
268
- elif anchor.attrs["href"].startswith("/"):
269
  links.append(urljoin(url + "/", anchor.attrs["href"]))
270
  else:
271
  pass
272
- links = list(set([link for link in links if url in link]))
 
273
  else:
274
  continue
275
  return links
 
255
 
256
  def getLinks(url: str, timeout=30):
257
  start = time.time()
258
+
259
  def getLinksFromPage(url: str) -> list:
260
  response = requests.get(url)
261
  soup = BeautifulSoup(response.content, "lxml")
 
265
  if "href" in anchor.attrs:
266
  if urlparse(anchor.attrs["href"]).netloc == urlparse(url).netloc:
267
  links.append(anchor.attrs["href"])
268
+ elif not anchor.attrs["href"].startswith(("//", "file", "javascript", "tel", "mailto", "http")):
269
  links.append(urljoin(url + "/", anchor.attrs["href"]))
270
  else:
271
  pass
272
+ links = [link for link in links if "#" not in link]
273
+ links = list(set(links))
274
  else:
275
  continue
276
  return links