fixed
This commit is contained in:
parent
1b60e21243
commit
e587b58444
1 changed file with 15 additions and 4 deletions
13
main.py
13
main.py
|
|
@@ -41,9 +41,17 @@ def spider_rec(page_links, current_href, base_parse, exclude):
|
|||
if href_parse.query:
|
||||
href += "?" + href_parse.query
|
||||
|
||||
if href not in page_links[postfix]:
|
||||
page_links[postfix].append(href)
|
||||
|
||||
if href not in page_links.keys():
|
||||
found = False
|
||||
for key in page_links.keys() - [postfix]:
|
||||
for link in page_links[key]:
|
||||
if href == key or href == link:
|
||||
found = True
|
||||
break
|
||||
|
||||
if not found:
|
||||
found = False
|
||||
for d in exclude:
|
||||
if d in href:
|
||||
|
|
@@ -55,7 +63,10 @@ def spider_rec(page_links, current_href, base_parse, exclude):
|
|||
|
||||
spider_rec(page_links, href, base_parse, exclude)
|
||||
except HTTPError as e:
|
||||
if parse_result.hostname == base_parse.hostname:
|
||||
page_links[postfix] = e
|
||||
else:
|
||||
page_links[current_href] = e
|
||||
|
||||
return page_links
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue