diff --git a/crawl.conf b/crawl.conf
old mode 100644
new mode 100755
index ba6dcb5..30e36c6
--- a/crawl.conf
+++ b/crawl.conf
@@ -1,9 +1,11 @@
-# Domain Ex: benrmorgan.com
-benrmorgan.com
-# Prefix Ex: http://www.
-http://www.
-# Target path Ex /var/www/html/sitemap.xml or ./sitemaps/sitemap.xml
-/var/www/html/sitemap.xml
-# Ignore urls containing Ex: /files/
-/files/
-/images/
\ No newline at end of file
+[Config]
+; Domain Ex: benrmorgan.com
+domain = benrmorgan.com
+; Prefix Ex: http://www.
+prefix = http://www.
+; Target path Ex /var/www/html/sitemap.xml or ./sitemaps/sitemap.xml
+target = /var/www/html/sitemap.xml
+; Checksums path Ex ./checksums
+checksums = ./checksums
+; Ignore urls containing Ex: /files/
+ignore = /files/, /images/
\ No newline at end of file
diff --git a/main.py b/main.py
old mode 100644
new mode 100755
index c2f34fa..7524377
--- a/main.py
+++ b/main.py
@@ -1,23 +1,33 @@
+#!/usr/bin/env python3
 import datetime
 import os
-
 import bs4
from urllib.request import Request, urlopen
 from os.path import exists
 from shutil import move
+from hashlib import md5
+import configparser
 
 
+def get_page_hash(text: str):
+    # Strip whitespace so purely cosmetic changes do not alter the hash.
+    text = text.replace(' ', '').replace('\r', '').replace('\n', '')
+
+    return md5(text.encode('utf-8')).hexdigest()
+
+
 def spider(prefix, domain, exclude):
-    return spider_rec(dict(), prefix, domain, "/", exclude)
+    return spider_rec(dict(), dict(), prefix, domain, "/", exclude)
 
 
-def spider_rec(links, prefix, domain, postfix, exclude):
-    links[postfix] = 1
-
+def spider_rec(links, checksums, prefix, domain, postfix, exclude):
     req = Request(prefix + domain + postfix)
     html_page = urlopen(req)
     soup = bs4.BeautifulSoup(html_page, "lxml")
+
+    checksums[postfix] = get_page_hash(soup.getText())
+    links[postfix] = 1
+
     for link in soup.findAll('a'):
         href = link.get('href')
         if "mailto:" not in href and (domain in href or href[0] == '/'):
@@ -32,13 +42,14 @@ def spider_rec(links, prefix, domain, postfix, exclude):
                     continue
 
                 href = href.replace(" ", "%20")
+
                 if domain in href:
-                    spider_rec(links, "", "", href, exclude)
+                    spider_rec(links, checksums, "", "", href, exclude)
                 else:
-                    spider_rec(links, prefix, domain, href, exclude)
+                    spider_rec(links, checksums, prefix, domain, href, exclude)
             else:
                 links[href] += 1
-    return links
+    return links, checksums
 
 
 def cmp(p1, p2):
@@ -46,8 +57,6 @@ def cmp(p1, p2):
         with open(p2, 'r') as f2:
            l1 = f1.readlines()
            l2 = f2.readlines()
-            not_matched = []
-
            if len(l1) == len(l2):
                for i in range(len(l1)):
                    if l1[i] != l2[i]:
@@ -62,22 +71,28 @@
 
 def main():
     print("Reading conf...")
-    conf = []
-    with open('crawl.conf', 'r') as file:
-        for line in file.readlines():
-            if line[0] != '#':
-                line = line.replace("\n", "")
-                line = line.replace("\r", "")
-                conf.append(line)
+    config = configparser.ConfigParser()
+    config.read('crawl.conf')
+    config = config['Config']
 
-    domain = conf[0]
-    prefix = conf[1]
-    path = conf[2]
+    domain = config['domain']
+    prefix = config['prefix']
+    path = config['target']
+    checksums_path = config['checksums']
 
-    ignores = conf[3::]
+    ignores = config['ignore'].split(', ')
+
+    checksums = dict()
+    try:
+        with open(checksums_path, 'r') as checksums_file:
+            for line in checksums_file.readlines():
+                thirds = line.split()
+                checksums[thirds[0]] = thirds[1:]
+    except FileNotFoundError:
+        print("No checksums file found at path, new file will be created.")
 
     print("Crawling site...")
-    links = spider(prefix, domain, ignores)
+    links, new_checksums = spider(prefix, domain, ignores)
     date = datetime.datetime.utcnow()
 
     existed = exists(path)
@@ -104,21 +119,36 @@ def main():
             level += 1
 
         old_num = sorted_links[l]
-        link = l
-        if link[0] == '/':
-            link = prefix + domain + link
-        ordered.append((link, str(float(str(round(pow(0.8, level), 2))))))
+        ordered.append((l, str(float(str(round(pow(0.8, level), 2))))))
 
+    checksums_out = open(checksums_path, 'w')
+
+    different_count = 0
     for l in ordered:
+        lastmod = date.strftime("%Y-%m-%dT%H:%M:%S+00:00")
+        if l[0] in checksums.keys() and checksums[l[0]][0] == new_checksums[l[0]]:
+            # Unchanged page: keep the lastmod recorded on a previous crawl.
+            lastmod = checksums[l[0]][1]
+        else:
+            different_count += 1
+
+        checksums_out.write(f"{l[0]} {new_checksums[l[0]]} {lastmod}\n")
+
+        if l[0][0] == '/':
+            l = (prefix + domain + l[0], l[1])
+
         out.write("\t<url>\n")
         out.write("\t\t<loc>" + l[0] + "</loc>\n")
-        out.write("\t\t<lastmod>" + date.strftime("%Y-%m-%dT%H:%M:%S+00:00") + "</lastmod>\n")
+
+        out.write("\t\t<lastmod>" + lastmod + "</lastmod>\n")
         out.write("\t\t<priority>" + str(l[1]) + "</priority>\n")
         out.write("\t</url>\n")
 
     out.write("</urlset>\n")
     out.close()
 
+    checksums_out.close()
+
     if existed and not cmp(oldpath, path):
         print("Creating old sitemap backup...")
         move(oldpath, oldpath + "-old")
@@ -129,5 +159,8 @@ def main():
         os.remove(path)
 
     print("Done.")
+    print(f"Crawled {len(links.keys())} pages.")
+    print(f"Found {different_count} new or modified pages.")
+
 
 main()
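
For reference, a minimal sketch (not part of the patch) of how the pieces introduced above fit together: the [Config] keys read from crawl.conf and the checksums file that main.py now loads and rewrites, one "<path> <md5> <lastmod>" entry per crawled page. The file names and config keys come from the diff; the sample path, hash and date values below are made up for illustration.

# Sketch only: read the INI-style crawl.conf and the checksums file, then
# decide whether a page keeps its stored lastmod. Layouts mirror the diff above;
# the example entry is illustrative.
import configparser
from hashlib import md5

config = configparser.ConfigParser()
config.read('crawl.conf')
cfg = config['Config']

ignores = cfg['ignore'].split(', ')  # "/files/, /images/" -> ['/files/', '/images/']

# Checksums file: one "<path> <md5-of-normalized-text> <lastmod>" line per page, e.g.
#   /about 0cc175b9c0f1b6a831c399e269772661 2024-01-01T00:00:00+00:00
old = {}
try:
    with open(cfg['checksums']) as fh:
        for line in fh:
            if line.strip():
                path, digest, lastmod = line.split()
                old[path] = (digest, lastmod)
except FileNotFoundError:
    pass  # first run: every page gets a fresh lastmod


def page_hash(text: str) -> str:
    # Same normalization as get_page_hash(): drop spaces and newlines so purely
    # cosmetic whitespace changes do not count as modifications.
    text = text.replace(' ', '').replace('\r', '').replace('\n', '')
    return md5(text.encode('utf-8')).hexdigest()


# A page keeps its stored lastmod only when its normalized-text hash is unchanged.
new_digest = page_hash('<html>example page text</html>')
unchanged = '/about' in old and old['/about'][0] == new_digest

Because the hash is computed over whitespace-stripped page text, a page keeps its previously recorded <lastmod> across crawls unless its visible content actually changes; that is what the "new or modified pages" counter printed at the end reports.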