This commit is contained in:
bMorgan01 2022-09-25 10:30:56 -06:00
parent 13e91f8501
commit 1b60e21243

15
main.py
View file

@@ -1,9 +1,7 @@
import http
import bs4 import bs4
from urllib.request import Request, urlopen from urllib.request import Request, urlopen
from urllib.error import HTTPError from urllib.error import HTTPError
from urllib.parse import urlparse, urlunparse, urljoin from urllib.parse import urlparse, urlunparse, urljoin
import re
def spider(target, exclude): def spider(target, exclude):
@@ -15,7 +13,10 @@ def spider_rec(page_links, current_href, base_parse, exclude):
target_url = urlunparse(base_parse) target_url = urlunparse(base_parse)
parse_result = urlparse(urljoin(target_url, current_href)) parse_result = urlparse(urljoin(target_url, current_href))
req = Request(urlunparse(parse_result)) req = Request(urlunparse(parse_result))
postfix = parse_result.path postfix = parse_result.path
if parse_result.query:
postfix += "?" + parse_result.query
if len(postfix) == 0: if len(postfix) == 0:
postfix = "/" postfix = "/"
@@ -32,10 +33,14 @@ def spider_rec(page_links, current_href, base_parse, exclude):
href = link.get('href') href = link.get('href')
href = href.replace(" ", "%20") href = href.replace(" ", "%20")
if not urlparse(href).hostname:
href = urlparse(urljoin(target_url, href)).path
if "mailto:" not in href: if "mailto:" not in href:
if not urlparse(href).hostname:
href_parse = urlparse(urljoin(target_url, href))
href = href_parse.path
if href_parse.query:
href += "?" + href_parse.query
page_links[postfix].append(href) page_links[postfix].append(href)
if href not in page_links.keys(): if href not in page_links.keys():