urllib-ifying

bMorgan01 2022-09-25 11:08:09 -06:00
parent 07c0b8c329
commit 55187dda43
2 changed files with 49 additions and 34 deletions

crawl.conf

@@ -1,8 +1,6 @@
 [Config]
-; Domain Ex: benrmorgan.com
-domain = benrmorgan.com
-; Prefix Ex: http://www.
-prefix = http://dev-builder.
+; Target site
+site = http://dev-builder.benrmorgan.com
 ; Target path Ex /var/www/html/sitemap.xml or ./sitemaps/sitemap.xml
 target = ./sitemap.xml
 ; Checksums path Ex ./checksums
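
The two old keys (prefix, domain) collapse into one full site URL. A minimal sketch of how urlparse can split that single value back into its parts, assuming the same crawl.conf that main.py reads:

from configparser import ConfigParser
from urllib.parse import urlparse

config = ConfigParser()
config.read('crawl.conf')                # same file main.py reads
site = config['Config']['site']          # "http://dev-builder.benrmorgan.com"

parsed = urlparse(site)
print(parsed.scheme)    # "http"  (the scheme part of the old prefix)
print(parsed.hostname)  # "dev-builder.benrmorgan.com"  (rest of prefix + domain)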

main.py

@@ -1,7 +1,8 @@
 #!/usr/bin/env python3
-from calendar import different_locale
 import datetime
 import os
+from urllib.parse import urlparse, urlunparse, urljoin
+
 import bs4
 from urllib.request import Request, urlopen
 from os.path import exists
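
The three new imports carry the whole refactor. A quick, self-contained illustration of their behavior (the URLs here are just examples):

from urllib.parse import urlparse, urlunparse, urljoin

base = "http://dev-builder.benrmorgan.com/"

# urljoin resolves relative hrefs against the base and leaves absolute ones alone
print(urljoin(base, "/about"))                  # http://dev-builder.benrmorgan.com/about
print(urljoin(base, "http://other.example/x"))  # http://other.example/x

# urlparse splits a URL into components; urlunparse reassembles them
parts = urlparse("http://dev-builder.benrmorgan.com/about?page=2")
print(parts.hostname, parts.path, parts.query)  # dev-builder.benrmorgan.com /about page=2
print(urlunparse(parts))                        # http://dev-builder.benrmorgan.com/about?page=2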
@@ -15,12 +16,25 @@ def get_page_hash(text: str):
     return md5(text.encode('utf-8')).hexdigest()
 
 
-def spider(prefix, domain, exclude):
-    return spider_rec(dict(), dict(), prefix, domain, "/", exclude)
+def spider(target, exclude):
+    parsed_target = urlparse(target)
+    return spider_rec(dict(), dict(), target, parsed_target, exclude)
 
 
-def spider_rec(links, checksums, prefix, domain, postfix, exclude):
-    req = Request(prefix + domain + postfix)
-
-    html_page = urlopen(req)
-    soup = bs4.BeautifulSoup(html_page, "lxml")
+def spider_rec(links, checksums, current_href, base_parse, exclude):
+    target_url = urlunparse(base_parse)
+    parse_result = urlparse(urljoin(target_url, current_href))
+    req = Request(urlunparse(parse_result))
+
+    postfix = parse_result.path
+    if parse_result.query:
+        postfix += "?" + parse_result.query
+    if len(postfix) == 0:
+        postfix = "/"
+
+    if parse_result.hostname == base_parse.hostname:
+        html_page = urlopen(req)
+        soup = bs4.BeautifulSoup(html_page, "lxml")
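
Standalone, the new entry logic of spider_rec behaves like this sketch (names follow the diff; the href is made up). Every href, relative or absolute, is resolved against the configured site, and only same-host URLs get fetched:

from urllib.parse import urlparse, urlunparse, urljoin

base_parse = urlparse("http://dev-builder.benrmorgan.com")
target_url = urlunparse(base_parse)

current_href = "/projects?page=2"
parse_result = urlparse(urljoin(target_url, current_href))

# rebuild the path-plus-query key used for the links/checksums dicts
postfix = parse_result.path
if parse_result.query:
    postfix += "?" + parse_result.query
if len(postfix) == 0:
    postfix = "/"

# the same-host guard that replaces the old "domain in href" string test
if parse_result.hostname == base_parse.hostname:
    print("fetch", urlunparse(parse_result), "-> key", postfix)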
@@ -30,7 +44,16 @@ def spider_rec(links, checksums, prefix, domain, postfix, exclude):
-    for link in soup.findAll('a'):
-        href = link.get('href')
-        if "mailto:" not in href and (domain in href or href[0] == '/'):
-            if href not in links.keys():
-                found = False
-                for d in exclude:
+        for link in soup.findAll('a'):
+            href = link.get('href')
+            href = href.replace(" ", "%20")
+            if "mailto:" not in href:
+                if not urlparse(href).hostname:
+                    href_parse = urlparse(urljoin(target_url, href))
+                    href = href_parse.path
+                    if href_parse.query:
+                        href += "?" + href_parse.query
+
+                if href not in links.keys():
+                    found = False
+                    for d in exclude:
@@ -41,12 +64,7 @@ def spider_rec(links, checksums, prefix, domain, postfix, exclude):
-                if found:
-                    continue
-
-                href = href.replace(" ", "%20")
-                if domain in href:
-                    spider_rec(links, checksums, "", "", href, exclude)
-                else:
-                    spider_rec(links, checksums, prefix, domain, href, exclude)
-            else:
-                links[href] += 1
+                    if found:
+                        continue
+                    spider_rec(links, checksums, href, base_parse, exclude)
+                else:
+                    links[href] += 1
 
     return links, checksums
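
The old code branched on "domain in href" and faked an already-absolute URL by passing empty prefix/domain strings; with urljoin both kinds of href take the same code path, so one recursive call suffices. Illustrative hrefs:

from urllib.parse import urljoin

target_url = "http://dev-builder.benrmorgan.com/"

for href in ["/contact", "http://dev-builder.benrmorgan.com/blog?tag=git"]:
    # relative or absolute, the result is the same kind of full URL,
    # so spider_rec(links, checksums, href, base_parse, exclude) covers both
    print(urljoin(target_url, href))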
@@ -74,8 +92,7 @@ def main():
     config.read('crawl.conf')
     config = config['Config']
 
-    domain = config['domain']
-    prefix = config['prefix']
+    target = config['site']
     path = config['target']
     checksums_path = config['checksums']
@@ -91,7 +108,7 @@ def main():
         print("No checksums file found at path, new file will be created.")
 
     print("Crawling site...")
-    links, new_checksums = spider(prefix, domain, ignores)
+    links, new_checksums = spider(target, ignores)
 
     date = datetime.datetime.utcnow()
     existed = exists(path)
@@ -132,7 +149,7 @@ def main():
             checksums_out.write(f"{l[0]} {new_checksums[l[0]]} {lastmod}\n")
 
             if l[0] == '/':
-                l = prefix + domain + l[0]
+                l = target + l[0]
 
             out.write("\t<url>\n")
             out.write("\t\t<loc>" + l[0] + "</loc>\n")