simplified config, added checksum-based lastmod check

bMorgan01 2022-09-13 14:47:10 -06:00
parent 64be0f1770
commit 9e688663e1
2 changed files with 69 additions and 36 deletions

crawl.conf (20 lines changed; Normal file → Executable file)

@@ -1,9 +1,11 @@
-# Domain Ex: benrmorgan.com
-benrmorgan.com
-# Prefix Ex: http://www.
-http://www.
-# Target path Ex /var/www/html/sitemap.xml or ./sitemaps/sitemap.xml
-/var/www/html/sitemap.xml
-# Ignore urls containing Ex: /files/
-/files/
-/images/
+[Config]
+; Domain Ex: benrmorgan.com
+domain = benrmorgan.com
+; Prefix Ex: http://www.
+prefix = http://www.
+; Target path Ex /var/www/html/sitemap.xml or ./sitemaps/sitemap.xml
+target = /var/www/html/sitemap.xml
+; Checksums path Ex ./checksums
+checksums = ./checksums
+; Ignore urls containing Ex: /files/
+ignore = /files/, /images/
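
The new layout is standard INI syntax, so it can be read with Python's built-in configparser (the main.py hunk below does exactly this). A minimal sketch, using the section and key names from the file above; the variable names here are illustrative:

    import configparser

    # Load crawl.conf and pull out the [Config] section.
    parser = configparser.ConfigParser()
    parser.read('crawl.conf')
    cfg = parser['Config']

    domain = cfg['domain']                # "benrmorgan.com"
    prefix = cfg['prefix']                # "http://www."
    target = cfg['target']                # "/var/www/html/sitemap.xml"
    checksums_path = cfg['checksums']     # "./checksums"
    ignores = cfg['ignore'].split(', ')   # ["/files/", "/images/"]

Lines beginning with ';' (or '#') are treated as comments by configparser, so the example lines in the file are ignored at parse time.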

main.py (85 lines changed; Normal file → Executable file)

@@ -1,23 +1,33 @@
 #!/usr/bin/env python3
+from calendar import different_locale
 import datetime
 import os
 import bs4
 from urllib.request import Request, urlopen
 from os.path import exists
 from shutil import move
+from hashlib import md5
+import configparser
 
 
+def get_page_hash(text: str):
+    text = text.replace(' ', '').replace('\r', '').replace('\n', '')
+    return md5(text.encode('utf-8')).hexdigest()
+
+
 def spider(prefix, domain, exclude):
-    return spider_rec(dict(), prefix, domain, "/", exclude)
+    return spider_rec(dict(), dict(), prefix, domain, "/", exclude)
 
 
-def spider_rec(links, prefix, domain, postfix, exclude):
-    links[postfix] = 1
-
+def spider_rec(links, checksums, prefix, domain, postfix, exclude):
     req = Request(prefix + domain + postfix)
     html_page = urlopen(req)
     soup = bs4.BeautifulSoup(html_page, "lxml")
 
+    checksums[postfix] = get_page_hash(soup.getText())
+    links[postfix] = 1
+
     for link in soup.findAll('a'):
         href = link.get('href')
         if "mailto:" not in href and (domain in href or href[0] == '/'):
@@ -32,13 +42,14 @@ def spider_rec(links, prefix, domain, postfix, exclude):
                     continue
 
                 href = href.replace(" ", "%20")
                 if domain in href:
-                    spider_rec(links, "", "", href, exclude)
+                    spider_rec(links, checksums, "", "", href, exclude)
                 else:
-                    spider_rec(links, prefix, domain, href, exclude)
+                    spider_rec(links, checksums, prefix, domain, href, exclude)
         else:
             links[href] += 1
 
-    return links
+    return links, checksums
 
 
 def cmp(p1, p2):
@@ -46,8 +57,6 @@ def cmp(p1, p2):
         with open(p2, 'r') as f2:
             l1 = f1.readlines()
             l2 = f2.readlines()
-
-            not_matched = []
             if len(l1) == len(l2):
                 for i in range(len(l1)):
                     if l1[i] != l2[i]:
@@ -62,22 +71,28 @@ def cmp(p1, p2):
 def main():
     print("Reading conf...")
-    conf = []
-    with open('crawl.conf', 'r') as file:
-        for line in file.readlines():
-            if line[0] != '#':
-                line = line.replace("\n", "")
-                line = line.replace("\r", "")
-                conf.append(line)
+    config = configparser.ConfigParser()
+    config.read('crawl.conf')
+    config = config['Config']
 
-    domain = conf[0]
-    prefix = conf[1]
-    path = conf[2]
+    domain = config['domain']
+    prefix = config['prefix']
+    path = config['target']
+    checksums_path = config['checksums']
 
-    ignores = conf[3::]
+    ignores = config['ignore'].split(', ')
+
+    checksums = dict()
+    try:
+        with open(checksums_path, 'r') as checksums_file:
+            for line in checksums_file.readlines():
+                thirds = line.split()
+                checksums[thirds[0]] = (thirds[1:])
+    except FileNotFoundError:
+        print("No checksums file found at path, new file will be created.")
 
     print("Crawling site...")
-    links = spider(prefix, domain, ignores)
+    links, new_checksums = spider(prefix, domain, ignores)
 
     date = datetime.datetime.utcnow()
     existed = exists(path)
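
The checksums file read above is plain whitespace-separated text, one line per crawled path: the path, its MD5 digest, and the lastmod written on the previous run. A sketch of how one (illustrative) line ends up in the dict:

    # Hypothetical line from ./checksums: "<postfix> <md5-of-text> <lastmod>"
    line = "/about/ d41d8cd98f00b204e9800998ecf8427e 2022-09-13T20:47:10+00:00"

    thirds = line.split()
    checksums = {thirds[0]: thirds[1:]}
    # checksums["/about/"][0] is the stored digest, [1] the stored lastmod.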
@@ -104,21 +119,34 @@ def main():
             level += 1
             old_num = sorted_links[l]
 
-        link = l
-        if link[0] == '/':
-            link = prefix + domain + link
-        ordered.append((link, str(float(str(round(pow(0.8, level), 2))))))
+        ordered.append((l, str(float(str(round(pow(0.8, level), 2))))))
 
+    checksums_out = open(checksums_path, 'w')
+    different_count = 0
     for l in ordered:
+        lastmod = date.strftime("%Y-%m-%dT%H:%M:%S+00:00")
+        if l in checksums.keys() and checksums[l[0]] == new_checksums[l[0]]:
+            lastmod = checksums[l[0]][1]
+            different_count += 1
+
+        checksums_out.write(f"{l[0]} {new_checksums[l[0]]} {lastmod}\n")
+
+        if l[0] == '/':
+            l = prefix + domain + l[0]
+
         out.write("\t<url>\n")
         out.write("\t\t<loc>" + l[0] + "</loc>\n")
-        out.write("\t\t<lastmod>" + date.strftime("%Y-%m-%dT%H:%M:%S+00:00") + "</lastmod>\n")
+        out.write("\t\t<lastmod>" + lastmod + "</lastmod>\n")
         out.write("\t\t<priority>" + str(l[1]) + "</priority>\n")
         out.write("\t</url>\n")
 
     out.write("</urlset>\n")
     out.close()
+    checksums_out.close()
 
     if existed and not cmp(oldpath, path):
         print("Creating old sitemap backup...")
         move(oldpath, oldpath + "-old")
@@ -129,5 +157,8 @@ def main():
         os.remove(path)
 
     print("Done.")
+    print(f"Crawled {len(links.keys())} pages.")
+    print(f"Found {different_count} modified pages.")
 
 main()
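
Taken together, the change makes lastmod reflect content changes rather than crawl time: a page whose stored hash matches the freshly computed one keeps its previous lastmod, while new or changed pages are stamped with the current crawl date. A standalone sketch of that decision (function and variable names are mine, not the commit's):

    from datetime import datetime, timezone

    def choose_lastmod(path, new_hash, stored, crawl_time):
        # stored maps path -> [old_hash, old_lastmod], matching the checksums file.
        if path in stored and stored[path][0] == new_hash:
            return stored[path][1]  # unchanged page keeps its old date
        return crawl_time.strftime("%Y-%m-%dT%H:%M:%S+00:00")  # fresh date otherwise

    # A page whose hash still matches keeps its January lastmod:
    stored = {"/about/": ["d41d8cd98f00b204e9800998ecf8427e", "2022-01-01T00:00:00+00:00"]}
    print(choose_lastmod("/about/", "d41d8cd98f00b204e9800998ecf8427e",
                         stored, datetime.now(timezone.utc)))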