From e587b5844416d46cfcc15fa89ad21188b7b056ad Mon Sep 17 00:00:00 2001 From: bMorgan01 Date: Sun, 25 Sep 2022 10:54:42 -0600 Subject: [PATCH] fixed --- main.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/main.py b/main.py index 6964aac..7086913 100644 --- a/main.py +++ b/main.py @@ -40,10 +40,18 @@ def spider_rec(page_links, current_href, base_parse, exclude): if href_parse.query: href += "?" + href_parse.query + + if href not in page_links[postfix]: + page_links[postfix].append(href) + + found = False + for key in page_links.keys() - [postfix]: + for link in page_links[key]: + if href == key or href == link: + found = True + break - page_links[postfix].append(href) - - if href not in page_links.keys(): + if not found: found = False for d in exclude: if d in href: @@ -55,7 +63,10 @@ def spider_rec(page_links, current_href, base_parse, exclude): spider_rec(page_links, href, base_parse, exclude) except HTTPError as e: - page_links[postfix] = e + if parse_result.hostname == base_parse.hostname: + page_links[postfix] = e + else: + page_links[current_href] = e return page_links