#!/usr/bin/env python3
"""Crawl the site configured in crawl.conf and write a sitemap, keeping
per-page checksums so that unchanged pages retain their previous lastmod."""
import datetime
import os
from urllib.parse import urlparse, urlunparse, urljoin
import bs4
from urllib.request import Request, urlopen
from os.path import exists
from shutil import move
from hashlib import md5
import configparser


def get_page_hash(text: str):
    # Hash the visible text with whitespace stripped, so purely cosmetic
    # formatting changes do not count as modifications.
    text = text.replace(' ', '').replace('\r', '').replace('\n', '')
    return md5(text.encode('utf-8')).hexdigest()


def spider(target, exclude):
    parsed_target = urlparse(target)
    return spider_rec(dict(), dict(), target, parsed_target, exclude)


def spider_rec(links, checksums, current_href, base_parse, exclude):
    target_url = urlunparse(base_parse)
    parse_result = urlparse(urljoin(target_url, current_href))
    req = Request(urlunparse(parse_result))
    postfix = parse_result.path
    if parse_result.query:
        postfix += "?" + parse_result.query
    if len(postfix) == 0:
        postfix = "/"
    if parse_result.hostname == base_parse.hostname:
        html_page = urlopen(req)
        soup = bs4.BeautifulSoup(html_page, "lxml")
        checksums[postfix] = get_page_hash(soup.getText())
        links[postfix] = 1
        for link in soup.find_all('a'):
            href = link.get('href')
            if not href:
                # Anchors without an href attribute cannot be followed.
                continue
            href = href.replace(" ", "%20")
            if "mailto:" not in href and not urlparse(href).hostname:
                href_parse = urlparse(urljoin(target_url, href))
                href = href_parse.path
                if href_parse.query:
                    href += "?" + href_parse.query
                if href not in links:
                    if any(d in href for d in exclude):
                        continue
                    spider_rec(links, checksums, href, base_parse, exclude)
                else:
                    # Count repeat references; the totals are used later to
                    # rank pages and assign sitemap priorities.
                    links[href] += 1
    return links, checksums


def cmp(p1, p2):
    # Return True if the two files have identical contents.
    with open(p1, 'r') as f1, open(p2, 'r') as f2:
        return f1.readlines() == f2.readlines()


def main():
    print("Reading conf...")
    config = configparser.ConfigParser()
    config.read('crawl.conf')
    config = config['Config']
    target = config['site']
    path = config['target']
    checksums_path = config['checksums']
    ignores = config['ignore'].split(', ')

    # Load the previous run's checksums: each line is "path checksum lastmod".
    checksums = dict()
    try:
        with open(checksums_path, 'r') as checksums_file:
            for line in checksums_file.readlines():
                parts = line.split()
                checksums[parts[0]] = parts[1:]
    except FileNotFoundError:
        print("No checksums file found at path, new file will be created.")

    print("Crawling site...")
    links, new_checksums = spider(target, ignores)
    date = datetime.datetime.utcnow()

    existed = exists(path)
    oldpath = path
    if existed:
        print("Sitemap already exists, creating temp...")
        path = "newmap.xml"

    print("Writing to target file...")
    out = open(path, 'w')
    out.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n")
    out.write("<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">\n")

    # Rank pages by how often they were linked to; each drop in link count
    # lowers the priority by a factor of 0.8.
    sorted_links = dict(sorted(links.items(), key=lambda item: item[1], reverse=True))
    ordered = []
    level = 0
    old_num = sorted_links[list(sorted_links.keys())[0]]
    for l in sorted_links.keys():
        if sorted_links[l] != old_num:
            level += 1
            old_num = sorted_links[l]
        ordered.append((l, str(round(pow(0.8, level), 2))))

    checksums_out = open(checksums_path, 'w')
    different_count = 0
    for l in ordered:
        lastmod = date.strftime("%Y-%m-%dT%H:%M:%S+00:00")
        if l[0] in checksums and checksums[l[0]][0] == new_checksums[l[0]]:
            # Unchanged page: keep the lastmod recorded on the previous run.
            lastmod = checksums[l[0]][1]
        else:
            different_count += 1
        checksums_out.write(f"{l[0]} {new_checksums[l[0]]} {lastmod}\n")
        loc = l[0]
        if loc == '/':
            loc = target + loc
        out.write("\t<url>\n")
        out.write("\t\t<loc>" + loc + "</loc>\n")
        out.write("\t\t<lastmod>" + lastmod + "</lastmod>\n")
        out.write("\t\t<priority>" + l[1] + "</priority>\n")
        out.write("\t</url>\n")

    out.write("</urlset>\n")
    out.close()
    checksums_out.close()
    if existed and not cmp(oldpath, path):
        print("Creating old sitemap backup...")
        move(oldpath, oldpath + "-old")
        print("Overwriting old sitemap with new one...")
        move(path, oldpath)
    elif existed:
        print("Sitemaps are the same, removing temp...")
        os.remove(path)

    print("Done.")
    print(f"Crawled {len(links.keys())} pages.")
    print(f"Found {different_count} modified pages.")


if __name__ == "__main__":
    main()
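
# Example crawl.conf layout the script expects. The section and key names come
# from main() above; the values are only illustrative placeholders:
#
#   [Config]
#   site = https://example.com
#   target = sitemap.xml
#   checksums = checksums.txt
#   ignore = /admin, /cgi-bin
#
# `site` works best without a trailing slash, since the root page's <loc> is
# built by concatenating `site` and "/". `ignore` is a comma-plus-space
# separated list of path substrings to skip while crawling.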