bMorgan01 2022-09-01 11:23:17 -06:00
parent 9d2c4f631b
commit c21557e78c
2 changed files with 119 additions and 34 deletions

crawl.conf

@@ -3,7 +3,23 @@ benrmorgan.com
 # Prefix Ex: http://www.
 http://www.
 # Target path Ex /var/www/html/sitemap.xml or ./sitemaps/sitemap.xml
-./
+./reports/report.txt
 # Ignore urls containing Ex: /files/
 /files/
 /images/
+# Custom Dictionary Ex: Strato
+Strato
+Rainmeter
+WebinarsGUI
+LON-CAPA
+EMWAVE
+FACQUAD
+dx
+dy
+PATHAG
+PNTSLOPE
+PERPLINE
+QUADRATI
+kerf
+toolset
+cron
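The conf file now keeps its comment lines in place, and main() below reads values by fixed position, using the custom-dictionary comment as a section marker. A minimal sketch of that indexing, assuming the file opens with a commented domain line as the hunk header suggests:

    conf = [line.rstrip("\r\n") for line in open("crawl.conf")]
    domain = conf[1]              # benrmorgan.com
    prefix = conf[3]              # http://www.
    path = conf[5]                # ./reports/report.txt
    marker = conf.index("# Custom Dictionary Ex: Strato")
    ignores = conf[7:marker]      # /files/, /images/
    custDict = conf[marker + 1:]  # Strato, Rainmeter, ..., cron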

main.py

@@ -1,12 +1,13 @@
 import datetime
 import os
+import re
+from stat import S_ISFIFO
 import bs4
 from urllib.request import Request, urlopen
 from os.path import exists
 from shutil import move
 import language_tool_python
-tool = language_tool_python.LanguageTool('en-US')


 def spider(prefix, domain, exclude):
@@ -19,7 +20,7 @@ def spider_rec(links, prefix, domain, postfix, exclude):
         soup = bs4.BeautifulSoup(html_page, "lxml")
-        links[postfix] = soup.getText()
+        links[postfix] = [soup.getText(), soup.find_all('html')[0].get("lang")]
         for link in soup.findAll('a'):
             href = link.get('href')
             if "mailto:" not in href and (domain in href or href[0] == '/'):
@@ -59,22 +60,42 @@ def cmp(p1, p2):
     return True


+def split(txt, seps):
+    # https://stackoverflow.com/questions/4697006/python-split-string-by-list-of-separators
+    default_sep = seps[0]
+    # we skip seps[0] because that's the default separator
+    for sep in seps[1:]:
+        txt = txt.replace(sep, default_sep)
+    return [i.strip() for i in txt.split(default_sep)]
+
+
+def abbrev_num(n):
+    abbrevs = ['', 'K', 'M', 'B', 'T', 'Qd', 'Qt', 'Sx']
+    zeroes = len(str(n)) - 1
+    thous = int(zeroes / 3)
+    prefix = n if thous == 0 else int(n / (1000 ** thous))
+    abbrev = abbrevs[thous]
+    return str(prefix) + abbrev
+
+
 def main():
     print("Reading conf...")
     conf = []
     with open('crawl.conf', 'r') as file:
         for line in file.readlines():
-            if line[0] != '#':
-                line = line.replace("\n", "")
-                line = line.replace("\r", "")
-                conf.append(line)
+            line = line.replace("\n", "")
+            line = line.replace("\r", "")
+            conf.append(line)

-    domain = conf[0]
-    prefix = conf[1]
-    path = conf[2]
+    domain = conf[1]
+    prefix = conf[3]
+    path = conf[5]

-    ignores = conf[3::]
+    ignores = conf[7:conf.index("# Custom Dictionary Ex: Strato")]
+    custDict = conf[conf.index("# Custom Dictionary Ex: Strato") + 1::]

     print("Crawling site...")
     links = spider(prefix, domain, ignores)
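The two helpers added above are easy to spot-check. split() collapses every separator in seps to the first one before splitting, and abbrev_num() truncates (never rounds) to the nearest power-of-1000 suffix. A few worked examples, assuming both functions are in scope:

    split("a, b; c", [",", ";"])  # -> ['a', 'b', 'c']
    abbrev_num(950)               # -> '950' (thous == 0, no suffix)
    abbrev_num(46200)             # -> '46K' (int(46200 / 1000) == 46)
    abbrev_num(1234567)           # -> '1M'  (truncated, not '1.2M')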
@@ -86,35 +107,83 @@ def main():
         print("Report already exists, creating temp...")
         path = "newReport.txt"

+    print("Starting local language servers for")
+    tools = dict()
+    langs = []
+    for l in links.keys():
+        if links[l][1] not in langs:
+            langs.append(links[l][1])
+
+    for lang in langs:
+        print("\t", lang + "...")
+        tools[lang] = language_tool_python.LanguageTool(lang)
+
     # print("Writing to target file...")
     # out = open(path, 'w')
     # out.write("\tSpelling Spider by Ben Morgan - www.benrmorgan.com\n\n")

+    print("Spell and grammar checking...")
+    links_matched = dict()
+    all_matches = 0
+    all_filtered_matches = 0
+    all_text = 0
     for l in links.keys():
-        strippedLines = [s.strip() for s in links[l].split('\r\n') if s.strip()]
-        strippedLines += [s.strip() for s in links[l].split('\n') if s.strip()]
-        uniqueLines = []
-        for line in strippedLines:
-            if line not in uniqueLines:
-                uniqueLines.append(line)
-        text = os.linesep.join(uniqueLines)
-        matches = tool.check(text)
-        print(matches)
-        for match in matches:
-            print(match.message)
-            print(match.context[:match.offsetInContext-1] + "\033[91m {}\033[00m".format(match.context[match.offsetInContext:match.offsetInContext+match.errorLength]) + match.context[match.offsetInContext+match.errorLength::])
+        text = links[l][0].replace('\\r', '\r').replace('\\n', '\n')
+        sepLines = [s.strip() for s in re.split("\r\n|\r|\n", text) if s.strip()]
+        text = '\n'.join(sepLines)
+        all_text += len(text)
+
+        matches = tools[links[l][1]].check(text)
+        all_matches += len(matches)
+        matches = [match for match in matches if
+                   match.context[match.offsetInContext:match.offsetInContext + match.errorLength] not in custDict]
+        all_filtered_matches += len(matches)
+
+        if len(matches) > 0:
+            links_matched[l] = matches

-    # if existed and not cmp(oldpath, path):
-    #     print("Creating old report backup...")
-    #     move(oldpath, oldpath + "-old")
-    #     print("Overwriting old report with new one...")
-    #     move(path, oldpath)
-    # elif existed:
-    #     print("Reports are the same, removing temp...")
-    #     os.remove(path)
+    print()
+    print("Potential errors:", all_matches, "\t", "Errors ignored:", all_matches - all_filtered_matches, "\t",
+          "To Fix:", all_filtered_matches)
+    print("Pages crawled:", len(links.keys()), "\t\t", "Pages w/ errors:", len(links_matched), "\t", "Error rate:",
+          str(round(len(links_matched) / len(links), 4)))
+    print("Words checked:", abbrev_num(all_text), "\t\t", "Error rate:", str(round(all_filtered_matches / all_text, 4)))
+
+    if S_ISFIFO(os.fstat(0).st_mode):
+        do_colors = False
+    else:
+        do_colors = True
+
+    for lm in links_matched.keys():
+        print(''.join(['='] * 100))
+        print(lm)
+        print(''.join(['-'] * 100))
+        for match in links_matched[lm]:
+            print(match.message, "Suggestion:" if len(match.replacements) >= 1 else "",
+                  match.replacements[0] if len(match.replacements) >= 1 else "")
+            if do_colors:
+                print(match.context[:match.offsetInContext] + "\033[91m {}\033[00m".format(
+                    match.context[match.offsetInContext:match.offsetInContext + match.errorLength]) + match.context[
+                        match.offsetInContext + match.errorLength::])
+                print()
+            else:
+                print(match.context)
+                print(''.join([' '] * len(match.context[:match.offsetInContext]) + ['^'] * match.errorLength))
+        print(''.join(['='] * 100), "\n")
+
+    if existed and not cmp(oldpath, path):
+        print("Creating old report backup...")
+        move(oldpath, oldpath + "-old")
+        print("Overwriting old report with new one...")
+        move(path, oldpath)
+    elif existed:
+        print("Reports are the same, removing temp...")
+        os.remove(path)

     print("Done.")

-main()
+main()
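The S_ISFIFO check above decides between ANSI-colored context and a plain caret underline. Note that it stats file descriptor 0 (stdin), so colors switch off when the script is fed input through a pipe; a variant that instead disables colors when stdout itself is redirected (an assumption about the intent, not what this commit does) would stat descriptor 1:

    import os
    from stat import S_ISFIFO

    def use_colors():
        # fd 1 is stdout; a FIFO there means output is being piped elsewhere
        return not S_ISFIFO(os.fstat(1).st_mode)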