From 1b60e212434bcf99e185ae9c555ed4aa35f3be8e Mon Sep 17 00:00:00 2001 From: bMorgan01 Date: Sun, 25 Sep 2022 10:30:56 -0600 Subject: [PATCH] prettify --- main.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/main.py b/main.py index 79a9c0c..6964aac 100644 --- a/main.py +++ b/main.py @@ -1,9 +1,7 @@ -import http import bs4 from urllib.request import Request, urlopen from urllib.error import HTTPError from urllib.parse import urlparse, urlunparse, urljoin -import re def spider(target, exclude): @@ -15,7 +13,10 @@ def spider_rec(page_links, current_href, base_parse, exclude): target_url = urlunparse(base_parse) parse_result = urlparse(urljoin(target_url, current_href)) req = Request(urlunparse(parse_result)) + postfix = parse_result.path + if parse_result.query: + postfix += "?" + parse_result.query if len(postfix) == 0: postfix = "/" @@ -31,11 +32,15 @@ def spider_rec(page_links, current_href, base_parse, exclude): for link in soup.findAll('a'): href = link.get('href') href = href.replace(" ", "%20") - - if not urlparse(href).hostname: - href = urlparse(urljoin(target_url, href)).path - + if "mailto:" not in href: + if not urlparse(href).hostname: + href_parse = urlparse(urljoin(target_url, href)) + href = href_parse.path + + if href_parse.query: + href += "?" + href_parse.query + page_links[postfix].append(href) if href not in page_links.keys():