fixed
This commit is contained in:
parent
1b60e21243
commit
e587b58444
1 changed file with 15 additions and 4 deletions
13
main.py
13
main.py
|
|
@@ -41,9 +41,17 @@ def spider_rec(page_links, current_href, base_parse, exclude):
|
||||||
if href_parse.query:
|
if href_parse.query:
|
||||||
href += "?" + href_parse.query
|
href += "?" + href_parse.query
|
||||||
|
|
||||||
|
if href not in page_links[postfix]:
|
||||||
page_links[postfix].append(href)
|
page_links[postfix].append(href)
|
||||||
|
|
||||||
if href not in page_links.keys():
|
found = False
|
||||||
|
for key in page_links.keys() - [postfix]:
|
||||||
|
for link in page_links[key]:
|
||||||
|
if href == key or href == link:
|
||||||
|
found = True
|
||||||
|
break
|
||||||
|
|
||||||
|
if not found:
|
||||||
found = False
|
found = False
|
||||||
for d in exclude:
|
for d in exclude:
|
||||||
if d in href:
|
if d in href:
|
||||||
|
|
@@ -55,7 +63,10 @@ def spider_rec(page_links, current_href, base_parse, exclude):
|
||||||
|
|
||||||
spider_rec(page_links, href, base_parse, exclude)
|
spider_rec(page_links, href, base_parse, exclude)
|
||||||
except HTTPError as e:
|
except HTTPError as e:
|
||||||
|
if parse_result.hostname == base_parse.hostname:
|
||||||
page_links[postfix] = e
|
page_links[postfix] = e
|
||||||
|
else:
|
||||||
|
page_links[current_href] = e
|
||||||
|
|
||||||
return page_links
|
return page_links
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue