commit f03eefe2f63798169d59523a2e451a47afacec88
Author: bMorgan01
Date:   Tue Mar 29 19:13:27 2022 -0600

    Initial commit

diff --git a/crawl.conf b/crawl.conf
new file mode 100644
index 0000000..4dc577c
--- /dev/null
+++ b/crawl.conf
@@ -0,0 +1,7 @@
+# Domain Ex: benrmorgan.com
+benrmorgan.com
+# Prefix Ex: http://www.
+http://www.
+# Ignore urls containing Ex: /files/
+/files/
+/images/
\ No newline at end of file
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..8333081
--- /dev/null
+++ b/main.py
@@ -0,0 +1,90 @@
+import datetime
+import bs4
+from urllib.request import Request, urlopen
+
+
+def spider(prefix, domain, exclude):
+    # Start the crawl at the site root ("/").
+    return spider_rec(dict(), prefix, domain, "/", exclude)
+
+
+def spider_rec(links, prefix, domain, postfix, exclude):
+    # Record this page; the count doubles as an inbound-link tally.
+    links[postfix] = 1
+
+    req = Request(prefix + domain + postfix)
+    html_page = urlopen(req)
+
+    soup = bs4.BeautifulSoup(html_page, "lxml")
+    for link in soup.find_all('a'):
+        href = link.get('href')
+        # Skip anchors without an href and mail links.
+        if not href or "mailto:" in href:
+            continue
+        # Follow only internal links: same domain or site-relative path.
+        if domain in href or href[0] == '/':
+            if href not in links:
+                # Skip urls matching any ignore pattern from crawl.conf.
+                if any(d in href for d in exclude):
+                    continue
+
+                href = href.replace(" ", "%20")
+                if domain in href:
+                    # Absolute url: it already carries prefix and domain.
+                    spider_rec(links, "", "", href, exclude)
+                else:
+                    spider_rec(links, prefix, domain, href, exclude)
+            else:
+                # Already crawled: just bump its inbound-link count.
+                links[href] += 1
+    return links
+
+
+def main():
+    # Read crawl.conf, skipping comment lines that start with '#'.
+    conf = []
+    with open('crawl.conf', 'r') as file:
+        for line in file:
+            if line[0] != '#':
+                conf.append(line.rstrip("\r\n"))
+
+    domain = conf[0]
+    prefix = conf[1]
+    ignores = conf[2:]
+
+    links = spider(prefix, domain, ignores)
+    date = datetime.datetime.utcnow()
+
+    # Rank pages by inbound-link count; each drop in count moves the
+    # page down one priority tier (priority = 0.8 ** tier).
+    sorted_links = dict(sorted(links.items(), key=lambda item: item[1], reverse=True))
+
+    ordered = []
+    level = 0
+    old_num = next(iter(sorted_links.values()))
+    for l in sorted_links:
+        if sorted_links[l] != old_num:
+            level += 1
+            old_num = sorted_links[l]
+
+        link = l
+        if link[0] == '/':
+            # Expand site-relative paths to full urls.
+            link = prefix + domain + link
+        ordered.append((link, str(round(0.8 ** level, 2))))
+
+    with open("sitemap.xml", 'w') as out:
+        out.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n")
+        out.write("<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">\n")
+
+        for l in ordered:
+            out.write("\t<url>\n")
+            out.write("\t\t<loc>" + l[0] + "</loc>\n")
+            out.write("\t\t<lastmod>" + date.strftime("%Y-%m-%dT%H:%M:%S+00:00") + "</lastmod>\n")
+            out.write("\t\t<priority>" + l[1] + "</priority>\n")
+            out.write("\t</url>\n")
+
+        out.write("</urlset>\n")
+
+
+main()
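
Note: spider_rec recurses once per newly discovered page, so a site with a long chain of pages can hit Python's default recursion limit (around 1000 frames). A minimal iterative sketch of the same crawl, assuming the conventions above (links maps a url or path to its inbound-link count, exclude holds the ignore patterns from crawl.conf); crawl_iterative is a hypothetical replacement for spider/spider_rec, not part of this commit:

    # Hypothetical iterative variant; the queue replaces the call stack,
    # so crawl depth is bounded by memory rather than the recursion limit.
    from collections import deque

    import bs4
    from urllib.request import Request, urlopen


    def crawl_iterative(prefix, domain, exclude):
        links = {}
        queue = deque(["/"])
        while queue:
            postfix = queue.popleft()
            if postfix in links:
                links[postfix] += 1  # already crawled: bump the tally
                continue
            links[postfix] = 1
            # Absolute urls already carry the prefix and domain.
            url = postfix if domain in postfix else prefix + domain + postfix
            soup = bs4.BeautifulSoup(urlopen(Request(url)), "lxml")
            for a in soup.find_all('a'):
                href = a.get('href')
                if not href or "mailto:" in href:
                    continue
                if ((domain in href or href[0] == '/')
                        and not any(d in href for d in exclude)):
                    queue.append(href.replace(" ", "%20"))
        return links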
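
To sanity-check the output, the generated sitemap.xml can be parsed back with Python's standard library; a short sketch, assuming the namespace matches the urlset declaration written by main.py:

    # Parse sitemap.xml and print each url with its priority.
    import xml.etree.ElementTree as ET

    ns = {"sm": "http://www.sitemaps.org/schemas/sitemap/0.9"}
    root = ET.parse("sitemap.xml").getroot()
    for url in root.findall("sm:url", ns):
        print(url.find("sm:loc", ns).text, url.find("sm:priority", ns).text)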