uniformizing

bMorgan01 2022-09-25 10:01:16 -06:00
parent 021d67c881
commit 12154ae0e9
2 changed files with 39 additions and 35 deletions


@@ -1,7 +1,5 @@
-# Domain Ex: benrmorgan.com
-benrmorgan.com
-# Prefix Ex: http://www.
-http://www.
+# Target
+http://www.benrmorgan.com
 # Ignore urls containing Ex: /files/
 /files/
 /images/
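The config's separate domain and prefix entries collapse into a single absolute target URL here. A minimal sketch (illustrative only, not part of the commit) of how the two old values can be recovered from the new single entry with urllib.parse:

    from urllib.parse import urlparse

    # Value read from the config's "# Target" entry (example from this repo)
    target = "http://www.benrmorgan.com"
    parsed = urlparse(target)
    print(parsed.scheme)    # 'http' -- the scheme the old prefix carried
    print(parsed.hostname)  # 'www.benrmorgan.com' -- the old domain plus its 'www.'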

main.py

@@ -1,45 +1,53 @@
 import datetime
 import os
 import re
 from stat import S_ISFIFO
 import sys
+from urllib.parse import urlparse, urlunparse, urljoin
 import bs4
 from urllib.request import Request, urlopen
 from os.path import exists
 from shutil import move
 import language_tool_python
 import argparse

 parser = argparse.ArgumentParser()


-def spider(prefix, domain, exclude):
-    return spider_rec(dict(), prefix, domain, "/", exclude)
+def spider(target, exclude):
+    parsed_target = urlparse(target)
+    return spider_rec(dict(), target, parsed_target, exclude)


-def spider_rec(page_texts, prefix, domain, postfix, exclude):
-    req = Request(prefix + domain + postfix)
-    html_page = urlopen(req)
-
-    soup = bs4.BeautifulSoup(html_page, "lxml")
-
-    page_texts[postfix] = [soup.getText(), soup.find_all('html')[0].get("lang")]
-
-    for link in soup.findAll('a'):
-        href = link.get('href')
-        if "mailto:" not in href and (domain in href or href[0] == '/'):
-            if href not in page_texts.keys():
-                found = False
-                for d in exclude:
-                    if d in href:
-                        found = True
-                        break
-                if found:
-                    continue
-
-                href = href.replace(" ", "%20")
-                if domain in href:
-                    spider_rec(page_texts, "", "", href, exclude)
-                else:
-                    spider_rec(page_texts, prefix, domain, href, exclude)
+def spider_rec(page_texts, current_href, base_parse, exclude):
+    target_url = urlunparse(base_parse)
+    parse_result = urlparse(urljoin(target_url, current_href))
+    req = Request(urlunparse(parse_result))
+    postfix = parse_result.path
+    if len(postfix) == 0:
+        postfix = "/"
+
+    if parse_result.hostname == base_parse.hostname:
+        html_page = urlopen(req)
+        soup = bs4.BeautifulSoup(html_page, "lxml")
+
+        page_texts[postfix] = [soup.getText(), soup.find_all('html')[0].get("lang")]
+
+        for link in soup.findAll('a'):
+            href = link.get('href')
+            href = href.replace(" ", "%20")
+
+            if not urlparse(href).hostname:
+                href = urlparse(urljoin(target_url, href)).path
+
+            if "mailto:" not in href:
+                if href not in page_texts.keys():
+                    found = False
+                    for d in exclude:
+                        if d in href:
+                            found = True
+                            break
+                    if found:
+                        continue
+
+                    spider_rec(page_texts, href, base_parse, exclude)

     return page_texts
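The rewritten spider uniformizes every href through urljoin/urlparse before recursing, which is what lets the old domain/prefix string checks go away. A short standalone sketch of that normalization (standard-library behavior, not code from the commit):

    from urllib.parse import urljoin, urlparse

    base = "http://www.benrmorgan.com"
    # A relative link has no hostname, so it is resolved against the target...
    print(urlparse("/about/").hostname)   # None
    print(urljoin(base, "/about/"))       # http://www.benrmorgan.com/about/
    # ...while an absolute link keeps its own host, so spider_rec's hostname
    # comparison skips off-site URLs.
    print(urlparse(urljoin(base, "http://other.example/x")).hostname)  # other.example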
@@ -76,15 +84,13 @@ def main(report: bool):
         line = line.replace("\r", "")
         conf.append(line)

-    domain = conf[1]
-    prefix = conf[3]
-    ignores = conf[5:conf.index("# Custom Dictionary Ex: Strato")]
+    target = conf[1]
+    ignores = conf[3:conf.index("# Custom Dictionary Ex: Strato")]
     custDict = conf[conf.index("# Custom Dictionary Ex: Strato") + 1::]

     if not report:
         print("Crawling site...")
-    links = spider(prefix, domain, ignores)
+    links = spider(target, ignores)

     date = datetime.datetime.utcnow()

     if not report:
         print("Starting local language servers for")