From 9ec87b1e29d25b10e9f8bbfd85360c2876858660 Mon Sep 17 00:00:00 2001
From: bMorgan01
Date: Fri, 4 Jul 2025 08:13:23 -0600
Subject: [PATCH] Used networks to keep track of links

---
 crawl.conf |   2 +-
 main.py    | 238 ++++++++++++++++++++++++++++++++++++++---------------
 2 files changed, 173 insertions(+), 67 deletions(-)

diff --git a/crawl.conf b/crawl.conf
index bc9b4cd..1c4380f 100755
--- a/crawl.conf
+++ b/crawl.conf
@@ -1,6 +1,6 @@
 [Config]
 ; Target site
-site = http://www.benrmorgan.com
+site = https://www.benrmorgan.com
 ; Target path Ex /var/www/html/sitemap.xml or ./sitemaps/sitemap.xml
 target = /var/www/html/sitemap.xml
 ; Checksums path Ex ./checksums
diff --git a/main.py b/main.py
index fa76571..6b6dbd6 100755
--- a/main.py
+++ b/main.py
@@ -1,14 +1,61 @@
 #!/usr/bin/env python3
 import datetime
 import os
+import sys
+from typing import List
 from urllib.parse import urlparse, urlunparse, urljoin
+import json
 
-import bs4
-from urllib.request import Request, urlopen
+import requests_html
 from os.path import exists
 from shutil import move
 from hashlib import md5
 import configparser
+import networkx as nx
+session = requests_html.HTMLSession()
+import argparse
+parser = argparse.ArgumentParser()
+
+
+class Echo:
+    def __init__(self):
+        self.streams = []
+
+    def write(self, message):
+        for stream in self.streams:
+            stream.write(message)
+
+    def flush(self):
+        # this flush method is needed for python 3 compatibility.
+        # this handles the flush command by doing nothing.
+        # you might want to specify some extra behavior here.
+        pass
+
+    def close(self):
+        for stream in self.streams:
+            if stream is not sys.stdout:
+                stream.close()
+
+def convert2cytoscapeJSON(G):
+    # load all nodes into nodes array
+    final = {}
+    final["nodes"] = []
+    final["edges"] = []
+    for node in G.nodes():
+        nx = {}
+        nx["data"] = {}
+        nx["data"]["id"] = node
+        nx["data"]["label"] = node
+        final["nodes"].append(nx.copy())
+    #load all edges to edges array
+    for edge in G.edges():
+        nx = {}
+        nx["data"]={}
+        nx["data"]["id"]=edge[0]+edge[1]
+        nx["data"]["source"]=edge[0]
+        nx["data"]["target"]=edge[1]
+        final["edges"].append(nx)
+    return json.dumps(final)
 
 
 def get_page_hash(text: str):
@@ -17,57 +64,84 @@
     return md5(text.encode('utf-8')).hexdigest()
 
 
-def spider(target, exclude):
-    parsed_target = urlparse(target)
-    return spider_rec(dict(), dict(), target, parsed_target, exclude)
-
-
-def spider_rec(links, checksums, current_href, base_parse, exclude):
-    target_url = urlunparse(base_parse)
-    parse_result = urlparse(urljoin(target_url, current_href))
-    req = Request(urlunparse(parse_result))
-
+def make_postfix(parse_result, base_parse, current_href):
     postfix = parse_result.path
     if parse_result.query:
         postfix += "?" + parse_result.query
     if len(postfix) == 0:
         postfix = "/"
+
+    if parse_result.hostname != base_parse.hostname:
+        postfix = current_href
 
-    if parse_result.hostname == base_parse.hostname:
-        html_page = urlopen(req)
+    return postfix
 
-        soup = bs4.BeautifulSoup(html_page, "lxml")
 
-        checksums[postfix] = get_page_hash(soup.getText())
+def is_member_of_target(parse_result, base_parse, postfix):
+    return parse_result.hostname == base_parse.hostname and base_parse.path in postfix
+
+
+def is_excluded(exclude, postfix):
+    for prefix in exclude:
+        if postfix[:len(prefix)] == prefix:
+            return True
+
+    return False
+
+
+def spider(target, exclude, create_network):
+    network = None
+    if create_network:
+        network = nx.DiGraph()
+    parsed_target = urlparse(target)
+    return spider_rec(dict(), dict(), target, parsed_target, exclude, network, [])
+
+
+def spider_rec(links, checksums, current_href, base_parse, exclude, network, process_status):
+    target_url = urlunparse(base_parse)
+    parse_result = urlparse(urljoin(target_url, current_href))
+
+    postfix = make_postfix(parse_result, base_parse, current_href)
+
+    if is_excluded(exclude, postfix):
+        return None
+
+    if postfix in process_status:
+        return None
+    else:
+        process_status.append(postfix)
+
+    if is_member_of_target(parse_result, base_parse, postfix):
+        r = session.get(urlunparse(parse_result))
+
+        hrefs = r.html.absolute_links
+        try:
+            r.html.render(timeout=15)
+        except Exception:
+            pass
+        else:
+            hrefs = r.html.absolute_links
+
+        checksums[postfix] = get_page_hash(r.html.text)
         links[postfix] = 1
+        if network is not None:
+            network.add_node(postfix)
 
-        for link in soup.findAll('a'):
-            href = link.get('href')
-            href = href.replace(" ", "%20")
-
+        for href in hrefs:
             if "mailto:" not in href:
-                if not urlparse(href).hostname:
-                    href_parse = urlparse(urljoin(target_url, href))
-                    href = href_parse.path
+                href_parse = urlparse(urljoin(target_url, href))
+                href = make_postfix(href_parse, base_parse, href)
 
-                    if href_parse.query:
-                        href += "?" + href_parse.query
+                spider_rec(links, checksums, href, base_parse, exclude, network, process_status)
 
-                if href not in links.keys():
-                    found = False
-                    for d in exclude:
-                        if d in href:
-                            found = True
-                            break
+                if is_member_of_target(href_parse, base_parse, href) and not is_excluded(exclude, href):
+                    if network is not None:
+                        network.add_edge(postfix, href)
 
-                    if found:
-                        continue
-
-                    spider_rec(links, checksums, href, base_parse, exclude)
-                else:
                     links[href] += 1
 
-    return links, checksums
+
+    return links, checksums, network
 
 
 def cmp(p1, p2):
@@ -85,18 +159,30 @@
     return True
 
 
-def main():
-    print("Reading conf...")
+def main(args):
+    args.create_network = args.create_network and args.to_stdout
 
-    config = configparser.ConfigParser()
-    config.read('crawl.conf')
-    config = config['Config']
+    abspath = os.path.abspath(__file__)
+    dname = os.path.dirname(abspath)
+    os.chdir(dname)
 
-    target = config['site']
-    path = config['target']
-    checksums_path = config['checksums']
+    if not args.custom:
+        print("Reading conf...")
 
-    ignores = config['ignore'].split(', ')
+        config = configparser.ConfigParser()
+        config.read('crawl.conf')
+        config = config['Config']
+
+        target = config['site']
+        path = config['target']
+        checksums_path = config['checksums']
+
+        ignores = config['ignore'].split(', ')
+    else:
+        target = args.site
+        path = args.target
+        checksums_path = args.checksums
+        ignores = args.ignores.split(',')
 
     checksums = dict()
     try:
@@ -108,17 +194,24 @@
         print("No checksums file found at path, new file will be created.")
 
     print("Crawling site...")
-    links, new_checksums = spider(target, ignores)
+    links, new_checksums, network = spider(target, ignores, args.create_network)
 
    date = datetime.datetime.utcnow()
 
-    existed = exists(path)
-    oldpath = path
-    if existed:
-        print("Sitemap already exists, creating temp...")
-        path = "newmap.xml"
-    print("Writing to target file...")
-    out = open(path, 'w')
+    echoer = Echo()
+    if args.to_stdout:
+        echoer.streams.append(sys.stdout)
+
+    if args.target:
+        existed = exists(path)
+        oldpath = path
+        if existed:
+            print("Sitemap already exists, creating temp...")
+            path = "newmap.xml"
+
+        echoer.streams.append(open(path, 'w'))
+
+    out = echoer
 
     out.write("\n")
     out.write("