From c21557e78cbe8b26f68b71c5896a3f596296b2d0 Mon Sep 17 00:00:00 2001
From: bMorgan01
Date: Thu, 1 Sep 2022 11:23:17 -0600
Subject: [PATCH] finished

---
 crawl.conf |  20 +++++++-
 main.py    | 133 ++++++++++++++++++++++++++++++++++++++++-------------
 2 files changed, 119 insertions(+), 34 deletions(-)

diff --git a/crawl.conf b/crawl.conf
index 791304a..1b8b3ca 100644
--- a/crawl.conf
+++ b/crawl.conf
@@ -3,7 +3,23 @@ benrmorgan.com
 # Prefix Ex: http://www.
 http://www.
 # Target path Ex /var/www/html/sitemap.xml or ./sitemaps/sitemap.xml
-./
+./reports/report.txt
 # Ignore urls containing Ex: /files/
 /files/
-/images/
\ No newline at end of file
+/images/
+# Custom Dictionary Ex: Strato
+Strato
+Rainmeter
+WebinarsGUI
+LON-CAPA
+EMWAVE
+FACQUAD
+dx
+dy
+PATHAG
+PNTSLOPE
+PERPLINE
+QUADRATI
+kerf
+toolset
+cron
\ No newline at end of file
diff --git a/main.py b/main.py
index 9a9c655..2f49ebe 100644
--- a/main.py
+++ b/main.py
@@ -1,12 +1,13 @@
 import datetime
 import os
+import re
+from stat import S_ISFIFO
 import bs4
 from urllib.request import Request, urlopen
 from os.path import exists
 from shutil import move
 import language_tool_python

-tool = language_tool_python.LanguageTool('en-US')


 def spider(prefix, domain, exclude):
@@ -19,7 +20,7 @@ def spider_rec(links, prefix, domain, postfix, exclude):

     soup = bs4.BeautifulSoup(html_page, "lxml")

-    links[postfix] = soup.getText()
+    links[postfix] = [soup.getText(), soup.find_all('html')[0].get("lang")]
     for link in soup.findAll('a'):
         href = link.get('href')
         if "mailto:" not in href and (domain in href or href[0] == '/'):
@@ -59,22 +60,42 @@ def cmp(p1, p2):
     return True


+def split(txt, seps):
+    # https://stackoverflow.com/questions/4697006/python-split-string-by-list-of-separators
+    default_sep = seps[0]
+
+    # we skip seps[0] because that's the default separator
+    for sep in seps[1:]:
+        txt = txt.replace(sep, default_sep)
+    return [i.strip() for i in txt.split(default_sep)]
+
+
+def abbrev_num(n):
+    abbrevs = ['', 'K', 'M', 'B', 'T', 'Qd', 'Qt', 'Sx']
+
+    zeroes = len(str(n)) - 1
+    thous = int(zeroes / 3)
+    prefix = n if thous == 0 else int(n / (1000 ** thous))
+    abbrev = abbrevs[thous]
+
+    return str(prefix) + abbrev
+
+
 def main():
     print("Reading conf...")
     conf = []
     with open('crawl.conf', 'r') as file:
         for line in file.readlines():
-            if line[0] != '#':
-                line = line.replace("\n", "")
-                line = line.replace("\r", "")
-                conf.append(line)
+            line = line.replace("\n", "")
+            line = line.replace("\r", "")
+            conf.append(line)

-    domain = conf[0]
-    prefix = conf[1]
-    path = conf[2]
-
-    ignores = conf[3::]
+    domain = conf[1]
+    prefix = conf[3]
+    path = conf[5]
+    ignores = conf[7:conf.index("# Custom Dictionary Ex: Strato")]
+    custDict = conf[conf.index("# Custom Dictionary Ex: Strato") + 1::]

     print("Crawling site...")
     links = spider(prefix, domain, ignores)

@@ -86,35 +107,83 @@ def main():
         print("Report already exists, creating temp...")
         path = "newReport.txt"

+    print("Starting local language servers for")
+    tools = dict()
+    langs = []
+    for l in links.keys():
+        if links[l][1] not in langs:
+            langs.append(links[l][1])
+
+    for lang in langs:
+        print("\t", lang + "...")
+        tools[lang] = language_tool_python.LanguageTool(lang)
+
     # print("Writing to target file...")
     # out = open(path, 'w')
     # out.write("\tSpelling Spider by Ben Morgan - www.benrmorgan.com\n\n")

+    print("Spell and grammar checking...")
+    links_matched = dict()
+    all_matches = 0
+    all_filtered_matches = 0
+    all_text = 0
     for l in links.keys():
-        strippedLines = [s.strip() for s in links[l].split('\r\n') if s.strip()]
-        strippedLines += [s.strip() for s in links[l].split('\n') if s.strip()]
-        uniqueLines = []
-        for line in strippedLines:
-            if line not in uniqueLines:
-                uniqueLines.append(line)
+        text = links[l][0].replace('\\r', '\r').replace('\\n', '\n')
+        sepLines = [s.strip() for s in re.split("\r\n|\r|\n", text) if s.strip()]
+        text = '\n'.join(sepLines)
+        all_text += len(text)

-        text = os.linesep.join(uniqueLines)
-        matches = tool.check(text)
-        print(matches)
+        matches = tools[links[l][1]].check(text)
+        all_matches += len(matches)
+        matches = [match for match in matches if
+                   match.context[match.offsetInContext:match.offsetInContext + match.errorLength] not in custDict]
+        all_filtered_matches += len(matches)

-        for match in matches:
-            print(match.message)
-            print(match.context[:match.offsetInContext-1] + "\033[91m {}\033[00m" .format(match.context[match.offsetInContext:match.offsetInContext+match.errorLength]) + match.context[match.offsetInContext+match.errorLength::])
+        if len(matches) > 0:
+            links_matched[l] = matches

-    # if existed and not cmp(oldpath, path):
-    #     print("Creating old report backup...")
-    #     move(oldpath, oldpath + "-old")
-    #     print("Overwriting old report with new one...")
-    #     move(path, oldpath)
-    # elif existed:
-    #     print("Reports are the same, removing temp...")
-    #     os.remove(path)
+    print()
+    print("Potential errors:", all_matches, "\t", "Errors ignored:", all_matches - all_filtered_matches, "\t",
+          "To Fix:", all_filtered_matches)
+    print("Pages crawled:", len(links.keys()), "\t\t", "Pages w/ errors:", len(links_matched), "\t", "Error rate:",
+          str(round(len(links_matched) / len(links), 4)))
+    print("Characters checked:", abbrev_num(all_text), "\t\t", "Error rate:", str(round(all_filtered_matches / all_text, 4)))
+
+    if S_ISFIFO(os.fstat(0).st_mode):
+        do_colors = False
+    else:
+        do_colors = True
+
+    for lm in links_matched.keys():
+        print(''.join(['='] * 100))
+        print(lm)
+        print(''.join(['-'] * 100))
+
+        for match in links_matched[lm]:
+            print(match.message, "Suggestion:" if len(match.replacements) >= 1 else "",
+                  match.replacements[0] if len(match.replacements) >= 1 else "")
+
+            if do_colors:
+                print(match.context[:match.offsetInContext] + "\033[91m {}\033[00m".format(
+                    match.context[match.offsetInContext:match.offsetInContext + match.errorLength]) + match.context[
+                    match.offsetInContext + match.errorLength::])
+                print()
+            else:
+                print(match.context)
+                print(''.join([' '] * len(match.context[:match.offsetInContext]) + ['^'] * match.errorLength))
+
+        print(''.join(['='] * 100), "\n")
+
+    if existed and not cmp(oldpath, path):
+        print("Creating old report backup...")
+        move(oldpath, oldpath + "-old")
+        print("Overwriting old report with new one...")
+        move(path, oldpath)
+    elif existed:
+        print("Reports are the same, removing temp...")
+        os.remove(path)


     print("Done.")

-main()
\ No newline at end of file
+
+main()
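
Note on the match filtering above: each LanguageTool match is kept only if
the flagged substring (recovered from match.context via offsetInContext and
errorLength) is absent from the custom dictionary read out of crawl.conf.
A minimal standalone sketch of the same idea, assuming language_tool_python
is installed; the sample text and word list here are illustrative, not taken
from the patch:

    import language_tool_python

    cust_dict = ["Strato", "Rainmeter"]  # illustrative subset of crawl.conf entries
    tool = language_tool_python.LanguageTool('en-US')

    matches = tool.check("Rainmeter is a desktop customization toolset.")
    # Keep only matches whose flagged substring is not in the custom dictionary.
    to_fix = [m for m in matches
              if m.context[m.offsetInContext:m.offsetInContext + m.errorLength]
              not in cust_dict]
    print(len(matches), "raw matches,", len(to_fix), "after filtering")
    tool.close()  # shut down the local LanguageTool server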
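
For reference, abbrev_num shortens by digit count: abbrev_num(950) has
zeroes == 2 and thous == 0, so it returns "950"; abbrev_num(1234567) has
zeroes == 6 and thous == 2, so it returns "1M" (int(1234567 / 1000**2) == 1).
The suffix list runs up through 'Sx' (sextillions). Note that all_text
accumulates len(text), so the figure it abbreviates is a character count,
which is why the summary line reads "Characters checked".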
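
One design note: the color toggle tests S_ISFIFO(os.fstat(0).st_mode), i.e.
whether stdin is a FIFO. If the intent is to drop ANSI codes when the report
itself is piped or redirected, the usual check is on stdout instead; a
one-line alternative sketch (an assumption about intent, not what the patch
does):

    import sys
    do_colors = sys.stdout.isatty()  # False when stdout is piped or redirected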