Used networks to keep track of links

bMorgan01 2025-07-04 08:13:23 -06:00
parent ec5ef63474
commit 9ec87b1e29
2 changed files with 173 additions and 67 deletions

crawl.conf

@@ -1,6 +1,6 @@
 [Config]
 ; Target site
-site = http://www.benrmorgan.com
+site = https://www.benrmorgan.com
 ; Target path Ex /var/www/html/sitemap.xml or ./sitemaps/sitemap.xml
 target = /var/www/html/sitemap.xml
 ; Checksums path Ex ./checksums
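
Note: the hunk above only shows part of the config. A complete crawl.conf consistent with the keys main.py reads (site, target, checksums, ignore) would look roughly like the sketch below; the checksums and ignore values are illustrative, not taken from the repository.

[Config]
; Target site
site = https://www.benrmorgan.com
; Target path Ex /var/www/html/sitemap.xml or ./sitemaps/sitemap.xml
target = /var/www/html/sitemap.xml
; Checksums path Ex ./checksums
checksums = ./checksums
; Comma-separated path prefixes to exclude from the crawl (illustrative values)
ignore = /drafts, /private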

main.py (238 lines changed)

@@ -1,14 +1,61 @@
 #!/usr/bin/env python3
 import datetime
 import os
+import sys
+from typing import List
 from urllib.parse import urlparse, urlunparse, urljoin
-import bs4
-from urllib.request import Request, urlopen
+import json
+import requests_html
 from os.path import exists
 from shutil import move
 from hashlib import md5
 import configparser
+import networkx as nx
+
+session = requests_html.HTMLSession()
+
+import argparse
+parser = argparse.ArgumentParser()
+
+
+class Echo:
+    def __init__(self):
+        self.streams = []
+
+    def write(self, message):
+        for stream in self.streams:
+            stream.write(message)
+
+    def flush(self):
+        # this flush method is needed for python 3 compatibility.
+        # this handles the flush command by doing nothing.
+        # you might want to specify some extra behavior here.
+        pass
+
+    def close(self):
+        for stream in self.streams:
+            if stream is not sys.stdout:
+                stream.close()
+
+
+def convert2cytoscapeJSON(G):
+    # load all nodes into nodes array
+    final = {}
+    final["nodes"] = []
+    final["edges"] = []
+    for node in G.nodes():
+        nx = {}
+        nx["data"] = {}
+        nx["data"]["id"] = node
+        nx["data"]["label"] = node
+        final["nodes"].append(nx.copy())
+    # load all edges to edges array
+    for edge in G.edges():
+        nx = {}
+        nx["data"] = {}
+        nx["data"]["id"] = edge[0] + edge[1]
+        nx["data"]["source"] = edge[0]
+        nx["data"]["target"] = edge[1]
+        final["edges"].append(nx)
+    return json.dumps(final)
+
+
 def get_page_hash(text: str):
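
Note: the new Echo class simply fans each write() out to every registered stream, so the sitemap can go to stdout and a file at the same time, and convert2cytoscapeJSON serializes a networkx graph into Cytoscape.js's nodes/edges JSON shape. A minimal sketch of the expected output for a made-up two-page graph follows; the paths are illustrative only.

# Sketch (not part of the commit): the JSON shape convert2cytoscapeJSON
# produces for a made-up two-page graph.
import json
import networkx as nx

g = nx.DiGraph()
g.add_node("/")
g.add_node("/about")
g.add_edge("/", "/about")

# convert2cytoscapeJSON(g) returns a JSON string equivalent to:
expected = {
    "nodes": [
        {"data": {"id": "/", "label": "/"}},
        {"data": {"id": "/about", "label": "/about"}},
    ],
    "edges": [
        {"data": {"id": "//about", "source": "/", "target": "/about"}},
    ],
}
print(json.dumps(expected, indent=2))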
@@ -17,57 +64,84 @@ def get_page_hash(text: str):
     return md5(text.encode('utf-8')).hexdigest()


-def spider(target, exclude):
-    parsed_target = urlparse(target)
-    return spider_rec(dict(), dict(), target, parsed_target, exclude)
-
-
-def spider_rec(links, checksums, current_href, base_parse, exclude):
-    target_url = urlunparse(base_parse)
-    parse_result = urlparse(urljoin(target_url, current_href))
-    req = Request(urlunparse(parse_result))
-
+def make_postfix(parse_result, base_parse, current_href):
     postfix = parse_result.path
     if parse_result.query:
         postfix += "?" + parse_result.query
     if len(postfix) == 0:
         postfix = "/"

-    if parse_result.hostname == base_parse.hostname:
-        html_page = urlopen(req)
-        soup = bs4.BeautifulSoup(html_page, "lxml")
-
-        checksums[postfix] = get_page_hash(soup.getText())
+    if parse_result.hostname != base_parse.hostname:
+        postfix = current_href
+
+    return postfix
+
+
+def is_member_of_target(parse_result, base_parse, postfix):
+    return parse_result.hostname == base_parse.hostname and base_parse.path in postfix
+
+
+def is_excluded(exclude, postfix):
+    for prefix in exclude:
+        if postfix[:len(prefix)] == prefix:
+            return True
+
+    return False
+
+
+def spider(target, exclude, create_network):
+    network = None
+    if create_network:
+        network = nx.DiGraph()
+
+    parsed_target = urlparse(target)
+    return spider_rec(dict(), dict(), target, parsed_target, exclude, network, [])
+
+
+def spider_rec(links, checksums, current_href, base_parse, exclude, network, process_status):
+    target_url = urlunparse(base_parse)
+    parse_result = urlparse(urljoin(target_url, current_href))
+
+    postfix = make_postfix(parse_result, base_parse, current_href)
+
+    if is_excluded(exclude, postfix):
+        return None
+    if postfix in process_status:
+        return None
+    else:
+        process_status.append(postfix)
+
+    if is_member_of_target(parse_result, base_parse, postfix):
+        r = session.get(urlunparse(parse_result))
+        hrefs = r.html.absolute_links
+        try:
+            r.html.render(timeout=15)
+        except Exception:
+            pass
+        else:
+            hrefs = r.html.absolute_links
+
+        checksums[postfix] = get_page_hash(r.html.text)
         links[postfix] = 1
+        if network is not None:
+            network.add_node(postfix)

-        for link in soup.findAll('a'):
-            href = link.get('href')
-            href = href.replace(" ", "%20")
+        for href in hrefs:
             if "mailto:" not in href:
-                if not urlparse(href).hostname:
-                    href_parse = urlparse(urljoin(target_url, href))
-                    href = href_parse.path
-                    if href_parse.query:
-                        href += "?" + href_parse.query
-
-                    if href not in links.keys():
-                        found = False
-                        for d in exclude:
-                            if d in href:
-                                found = True
-                                break
-                        if found:
-                            continue
-
-                        spider_rec(links, checksums, href, base_parse, exclude)
-                    else:
-                        links[href] += 1
-
-    return links, checksums
+                href_parse = urlparse(urljoin(target_url, href))
+                href = make_postfix(href_parse, base_parse, href)
+
+                spider_rec(links, checksums, href, base_parse, exclude, network, process_status)
+
+                if is_member_of_target(href_parse, base_parse, href) and not is_excluded(exclude, href):
+                    if network is not None:
+                        network.add_edge(postfix, href)
+                    links[href] += 1
+
+    return links, checksums, network


 def cmp(p1, p2):
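
Note: the fetch inside the new spider_rec swaps urllib/bs4 for requests_html. A minimal sketch of that pattern follows, assuming requests_html is installed; render() pulls in a headless Chromium via pyppeteer on first use, and the URL is just the site from crawl.conf.

from requests_html import HTMLSession

session = HTMLSession()
r = session.get("https://www.benrmorgan.com/")

hrefs = r.html.absolute_links        # absolute URLs found in the raw HTML
try:
    r.html.render(timeout=15)        # run the page's JavaScript, then re-parse
except Exception:
    pass                             # on failure, keep the non-rendered link set
else:
    hrefs = r.html.absolute_links    # link set after JavaScript has run

print(len(hrefs), "links found")
print(len(r.html.text), "characters of page text to hash")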
@@ -85,18 +159,30 @@ def cmp(p1, p2):
     return True


-def main():
-    print("Reading conf...")
-
-    config = configparser.ConfigParser()
-    config.read('crawl.conf')
-    config = config['Config']
-    target = config['site']
-    path = config['target']
-    checksums_path = config['checksums']
-    ignores = config['ignore'].split(', ')
+def main(args):
+    args.create_network = args.create_network and args.to_stdout
+
+    abspath = os.path.abspath(__file__)
+    dname = os.path.dirname(abspath)
+    os.chdir(dname)
+
+    if not args.custom:
+        print("Reading conf...")
+
+        config = configparser.ConfigParser()
+        config.read('crawl.conf')
+        config = config['Config']
+        target = config['site']
+        path = config['target']
+        checksums_path = config['checksums']
+        ignores = config['ignore'].split(', ')
+    else:
+        target = args.site
+        path = args.target
+        checksums_path = args.checksums
+        ignores = args.ignores.split(',')

     checksums = dict()
     try:
@@ -108,17 +194,24 @@ def main():
         print("No checksums file found at path, new file will be created.")

     print("Crawling site...")
-    links, new_checksums = spider(target, ignores)
+    links, new_checksums, network = spider(target, ignores, args.create_network)
     date = datetime.datetime.utcnow()

-    existed = exists(path)
-    oldpath = path
-    if existed:
-        print("Sitemap already exists, creating temp...")
-        path = "newmap.xml"
-
     print("Writing to target file...")
-    out = open(path, 'w')
+    echoer = Echo()
+    if args.to_stdout:
+        echoer.streams.append(sys.stdout)
+    if args.target:
+        existed = exists(path)
+        oldpath = path
+        if existed:
+            print("Sitemap already exists, creating temp...")
+            path = "newmap.xml"
+
+        echoer.streams.append(open(path, 'w'))
+
+    out = echoer
     out.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n")
     out.write("<!--\n")
     out.write("\tSitemap generator by Ben Morgan - www.benrmorgan.com\n")
@@ -144,37 +237,50 @@ def main():
         lastmod = date.strftime("%Y-%m-%dT%H:%M:%S+00:00")
         if l[0] in checksums.keys() and checksums[l[0]][0] == new_checksums[l[0]]:
             lastmod = checksums[l[0]][1]
+        else:
             different_count += 1

         checksums_out.write(f"{l[0]} {new_checksums[l[0]]} {lastmod}\n")

         if l[0] == '/':
-            l = target + l[0]
+            l = (target + l[0], l[1])

         out.write("\t<url>\n")
         out.write("\t\t<loc>" + l[0] + "</loc>\n")
         out.write("\t\t<lastmod>" + lastmod + "</lastmod>\n")
         out.write("\t\t<priority>" + str(l[1]) + "</priority>\n")
         out.write("\t</url>\n")

     out.write("</urlset>\n")
-    out.close()
     checksums_out.close()

-    if existed and not cmp(oldpath, path):
-        print("Creating old sitemap backup...")
-        move(oldpath, oldpath + "-old")
-        print("Overwriting old sitemap with new one...")
-        move(path, oldpath)
-    elif existed:
-        print("Sitemaps are the same, removing temp...")
-        os.remove(path)
+    if args.target:
+        out.close()
+
+        if existed and not cmp(oldpath, path):
+            print("Creating old sitemap backup...")
+            move(oldpath, oldpath + "-old")
+            print("Overwriting old sitemap with new one...")
+            move(path, oldpath)
+        elif existed:
+            print("Sitemaps are the same, removing temp...")
+            os.remove(path)

     print("Done.")
     print(f"Crawled {len(links.keys())} pages.")
     print(f"Found {different_count} modified pages.")
+    if network is not None:
+        print(convert2cytoscapeJSON(network))


-main()
+parser.add_argument("-c", "--custom-opts", action='store_true', dest='custom', help="Specify options, config is ignored")
+parser.add_argument("-o", "--to-stdout", action='store_true', dest='to_stdout', help="Print generated sitemap to console.")
+parser.add_argument("-n", "--network", action='store_true', dest='create_network', help="Create visual network of sitemap. Output as JSON")
+parser.add_argument("-f", "--to-file", dest='target', default=False, help="Save generated sitemap to file.")
+parser.add_argument('site', nargs='?')
+parser.add_argument('checksums', nargs='?')
+parser.add_argument('ignores', nargs='?')
+
+args = parser.parse_args()
+main(args)
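
Note: with the argparse interface added at the bottom of the file, invocations like the following should work; the paths and ignore prefixes are illustrative. The -n flag only takes effect together with -o, since main() sets create_network to create_network and to_stdout.

# Crawl using crawl.conf and print the sitemap plus the Cytoscape JSON network to stdout
./main.py -o -n

# Skip crawl.conf: pass site, checksums path and ignore prefixes on the command line,
# writing the sitemap to the path given to -f as well as to stdout
./main.py -c -o -f ./sitemaps/sitemap.xml https://www.benrmorgan.com ./checksums /drafts,/private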