From 9d2c4f631bf962f071846f3a92a42fe032e504ac Mon Sep 17 00:00:00 2001
From: bMorgan01
Date: Thu, 1 Sep 2022 08:10:55 -0600
Subject: [PATCH] init commit

---
 crawl.conf |   9 ++++
 main.py    | 124 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 133 insertions(+)
 create mode 100644 crawl.conf
 create mode 100644 main.py

diff --git a/crawl.conf b/crawl.conf
new file mode 100644
index 0000000..791304a
--- /dev/null
+++ b/crawl.conf
@@ -0,0 +1,9 @@
+# Domain Ex: benrmorgan.com
+benrmorgan.com
+# Prefix Ex: http://www.
+http://www.
+# Target path Ex: /var/www/html/sitemap.xml or ./sitemaps/sitemap.xml
+./
+# Ignore urls containing Ex: /files/
+/files/
+/images/
\ No newline at end of file
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..9a9c655
--- /dev/null
+++ b/main.py
@@ -0,0 +1,124 @@
+import datetime
+import os
+
+import bs4
+from urllib.request import Request, urlopen
+from os.path import exists
+from shutil import move
+import language_tool_python
+tool = language_tool_python.LanguageTool('en-US')
+
+
+def spider(prefix, domain, exclude):
+    # Crawl the whole site starting from the root path "/".
+    return spider_rec(dict(), prefix, domain, "/", exclude)
+
+
+def spider_rec(links, prefix, domain, postfix, exclude):
+    # Fetch one page, record its text, then follow same-site links.
+    req = Request(prefix + domain + postfix)
+    html_page = urlopen(req)
+
+    soup = bs4.BeautifulSoup(html_page, "lxml")
+
+    links[postfix] = soup.get_text()
+    for link in soup.find_all('a'):
+        href = link.get('href')
+        if href and "mailto:" not in href and (domain in href or href[0] == '/'):
+            if href not in links:
+                # Skip urls that match an ignore pattern from the conf.
+                found = False
+                for d in exclude:
+                    if d in href:
+                        found = True
+                        break
+
+                if found:
+                    continue
+
+                href = href.replace(" ", "%20")
+                if domain in href:
+                    spider_rec(links, "", "", href, exclude)
+                else:
+                    spider_rec(links, prefix, domain, href, exclude)
+
+    return links
+
+
+def cmp(p1, p2):
+    # Return True if the two files have identical contents.
+    with open(p1, 'r') as f1:
+        with open(p2, 'r') as f2:
+            l1 = f1.readlines()
+            l2 = f2.readlines()
+
+            if len(l1) == len(l2):
+                for i in range(len(l1)):
+                    if l1[i] != l2[i]:
+                        return False
+            else:
+                return False
+
+            return True
+
+
+def main():
+    print("Reading conf...")
+
+    conf = []
+    with open('crawl.conf', 'r') as file:
+        for line in file.readlines():
+            if line[0] != '#':
+                line = line.replace("\n", "")
+                line = line.replace("\r", "")
+                conf.append(line)
+
+    domain = conf[0]
+    prefix = conf[1]
+    path = conf[2]
+
+    ignores = conf[3:]
+
+    print("Crawling site...")
+    links = spider(prefix, domain, ignores)
+    date = datetime.datetime.utcnow()
+
+    existed = exists(path)
+    oldpath = path
+    if existed:
+        print("Report already exists, creating temp...")
+        path = "newReport.txt"
+
+    # print("Writing to target file...")
+    # out = open(path, 'w')
+    # out.write("\tSpelling Spider by Ben Morgan - www.benrmorgan.com\n\n")
+
+    for l in links:
+        # Strip whitespace and drop duplicate lines before spell-checking.
+        strippedLines = [s.strip() for s in links[l].splitlines() if s.strip()]
+        uniqueLines = []
+        for line in strippedLines:
+            if line not in uniqueLines:
+                uniqueLines.append(line)
+
+        text = os.linesep.join(uniqueLines)
+        matches = tool.check(text)
+        print(matches)
+
+        for match in matches:
+            print(match.message)
+            # Highlight the flagged span in red within its context.
+            print(match.context[:match.offsetInContext] + "\033[91m{}\033[00m".format(match.context[match.offsetInContext:match.offsetInContext + match.errorLength]) + match.context[match.offsetInContext + match.errorLength:])
+
+    # if existed and not cmp(oldpath, path):
+    #     print("Creating old report backup...")
+    #     move(oldpath, oldpath + "-old")
+    #     print("Overwriting old report with new one...")
+    #     move(path, oldpath)
+    # elif existed:
+    #     print("Reports are the same, removing temp...")
+    #     os.remove(path)
+
+    print("Done.")
+
+main()
\ No newline at end of file