This commit is contained in:
bMorgan01 2022-09-25 10:54:42 -06:00
parent 1b60e21243
commit e587b58444

13
main.py
View file

@@ -41,9 +41,17 @@ def spider_rec(page_links, current_href, base_parse, exclude):
if href_parse.query:
href += "?" + href_parse.query
if href not in page_links[postfix]:
page_links[postfix].append(href)
if href not in page_links.keys():
found = False
for key in page_links.keys() - [postfix]:
for link in page_links[key]:
if href == key or href == link:
found = True
break
if not found:
found = False
for d in exclude:
if d in href:
@@ -55,7 +63,10 @@ def spider_rec(page_links, current_href, base_parse, exclude):
spider_rec(page_links, href, base_parse, exclude)
except HTTPError as e:
if parse_result.hostname == base_parse.hostname:
page_links[postfix] = e
else:
page_links[current_href] = e
return page_links