From 5ad1802e17502f94fb366bb1a3818097d0e1d961 Mon Sep 17 00:00:00 2001
From: bMorgan01
Date: Sun, 18 Sep 2022 19:48:32 -0600
Subject: [PATCH] Initial commit

---
 crawl.conf |  7 +++++++
 main.py    | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 78 insertions(+)
 create mode 100644 crawl.conf
 create mode 100644 main.py

diff --git a/crawl.conf b/crawl.conf
new file mode 100644
index 0000000..15c50ff
--- /dev/null
+++ b/crawl.conf
@@ -0,0 +1,7 @@
+# Domain Ex: benrmorgan.com
+benrmorgan.com
+# Prefix Ex: http://www.
+http://www.
+# Ignore urls containing Ex: /files/
+/files/
+/images/
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..35134ea
--- /dev/null
+++ b/main.py
@@ -0,0 +1,71 @@
+import bs4
+from urllib.error import HTTPError, URLError
+from urllib.request import Request, urlopen
+
+
+def spider(prefix, domain, exclude):
+    # Crawl the site starting at its root. Keys of the returned dict are
+    # the pages visited; values are the internal links found on each page.
+    return spider_rec(dict(), prefix, domain, "/", exclude)
+
+
+def spider_rec(page_links, prefix, domain, postfix, exclude):
+    # Mark this page as visited before following its links so that link
+    # cycles between pages cannot recurse forever.
+    page_links[postfix] = []
+
+    try:
+        html_page = urlopen(Request(prefix + domain + postfix))
+    except (HTTPError, URLError) as err:
+        print(postfix, "->", err)
+        return page_links
+
+    print(html_page.status, postfix)
+
+    soup = bs4.BeautifulSoup(html_page, "lxml")
+
+    for link in soup.find_all('a', href=True):
+        href = link['href'].replace(" ", "%20")
+
+        # Only follow internal links; skip mailto: and external URLs.
+        if "mailto:" in href or (domain not in href and not href.startswith('/')):
+            continue
+
+        page_links[postfix].append(href)
+
+        # Normalize absolute same-site URLs to a site-relative path so the
+        # visited check below matches however the page was linked.
+        if domain in href:
+            href = href.split(domain, 1)[1] or "/"
+
+        if href in page_links:
+            continue
+
+        if any(pattern in href for pattern in exclude):
+            continue
+
+        spider_rec(page_links, prefix, domain, href, exclude)
+
+    return page_links
+
+
+def main():
+    print("Reading conf...")
+
+    # crawl.conf alternates comment lines with values: the domain, the
+    # URL prefix, then any number of substrings that mark URLs to skip.
+    with open('crawl.conf', 'r') as file:
+        conf = [line.strip() for line in file
+                if line.strip() and not line.startswith('#')]
+
+    domain = conf[0]
+    prefix = conf[1]
+    ignores = conf[2:]
+
+    print("Crawling site...")
+    links = spider(prefix, domain, ignores)
+    print("Crawled", len(links), "pages.")
+
+
+if __name__ == '__main__':
+    main()
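
A minimal usage sketch for reviewers, not part of the patch: it assumes the
spider() entry point from main.py above and mirrors the values in the sample
crawl.conf. The report loop at the end is illustrative only.

    from main import spider

    # Arguments mirror the sample crawl.conf: prefix, domain, ignore list.
    links = spider("http://www.", "benrmorgan.com", ["/files/", "/images/"])

    # spider() returns {page: [internal links found on that page]}, so a
    # quick per-page summary is just a walk over the dict.
    for page, hrefs in links.items():
        print(page, "->", len(hrefs), "internal links")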