diff --git a/crawl.conf b/crawl.conf index 1b8b3ca..6f3815d 100644 --- a/crawl.conf +++ b/crawl.conf @@ -2,8 +2,6 @@ benrmorgan.com # Prefix Ex: http://www. http://www. -# Target path Ex /var/www/html/sitemap.xml or ./sitemaps/sitemap.xml -./reports/report.txt # Ignore urls containing Ex: /files/ /files/ /images/ @@ -22,4 +20,4 @@ PERPLINE QUADRATI kerf toolset -cron \ No newline at end of file +cron diff --git a/main.py b/main.py index 2f49ebe..59788de 100644 --- a/main.py +++ b/main.py @@ -2,7 +2,7 @@ import datetime import os import re from stat import S_ISFIFO - +import sys import bs4 from urllib.request import Request, urlopen from os.path import exists @@ -43,23 +43,6 @@ def spider_rec(links, prefix, domain, postfix, exclude): return links -def cmp(p1, p2): - with open(p1, 'r') as f1: - with open(p2, 'r') as f2: - l1 = f1.readlines() - l2 = f2.readlines() - not_matched = [] - - if len(l1) == len(l2): - for i in range(len(l1)): - if l1[i] != l2[i]: - return False - else: - return False - - return True - - def split(txt, seps): # https://stackoverflow.com/questions/4697006/python-split-string-by-list-of-separators default_sep = seps[0] @@ -93,20 +76,13 @@ def main(): domain = conf[1] prefix = conf[3] - path = conf[5] - ignores = conf[7:conf.index("# Custom Dictionary Ex: Strato")] + ignores = conf[5:conf.index("# Custom Dictionary Ex: Strato")] custDict = conf[conf.index("# Custom Dictionary Ex: Strato") + 1::] print("Crawling site...") links = spider(prefix, domain, ignores) date = datetime.datetime.utcnow() - existed = exists(path) - oldpath = path - if existed: - print("Report already exists, creating temp...") - path = "newReport.txt" - print("Starting local language servers for") tools = dict() langs = [] @@ -118,10 +94,6 @@ def main(): print("\t", lang + "...") tools[lang] = language_tool_python.LanguageTool(lang) - # print("Writing to target file...") - # out = open(path, 'w') - # out.write("\tSpelling Spider by Ben Morgan - www.benrmorgan.com\n\n") - print("Spell and grammar checking...") links_matched = dict() all_matches = 0 @@ -145,11 +117,11 @@ def main(): print() print("Potential errors:", all_matches, "\t", "Errors ignored:", all_matches - all_filtered_matches, "\t", "To Fix:", all_filtered_matches) - print("Pages crawled:", len(links.keys()), "\t\t", "Pages w/ errors:", len(links_matched), "\t", "Error rate:", + print("Pages crawled:", len(links.keys()), "\t", "Pages w/ errors:", len(links_matched), "\t", "Error rate:", str(round(len(links_matched) / len(links), 4))) - print("Words checked:", abbrev_num(all_text), "\t\t", "Error rate:", str(round(all_filtered_matches / all_text, 4))) + print("Words checked:", abbrev_num(all_text), "\t", "Error rate:", str(round(all_filtered_matches / all_text, 4))) - if S_ISFIFO(os.fstat(0).st_mode): + if S_ISFIFO(os.fstat(0).st_mode) or not sys.stdout.isatty(): do_colors = False else: do_colors = True @@ -164,7 +136,7 @@ def main(): match.replacements[0] if len(match.replacements) >= 1 else "") if do_colors: - print(match.context[:match.offsetInContext] + "\033[91m {}\033[00m".format( + print(match.context[:match.offsetInContext] + "\033[91m{}\033[00m".format( match.context[match.offsetInContext:match.offsetInContext + match.errorLength]) + match.context[ match.offsetInContext + match.errorLength::]) print() @@ -174,15 +146,6 @@ def main(): print(''.join(['='] * 100), "\n") - if existed and not cmp(oldpath, path): - print("Creating old report backup...") - move(oldpath, oldpath + "-old") - print("Overwriting old report with new one...") - move(path, oldpath) - elif existed: - print("Reports are the same, removing temp...") - os.remove(path) - print("Done.")