uniformizing

This commit is contained in:
bMorgan01 2022-09-25 10:01:16 -06:00
parent 021d67c881
commit 12154ae0e9
2 changed files with 39 additions and 35 deletions

View file

@@ -1,7 +1,5 @@
# Domain Ex: benrmorgan.com # Target
benrmorgan.com http://www.benrmorgan.com
# Prefix Ex: http://www.
http://www.
# Ignore urls containing Ex: /files/ # Ignore urls containing Ex: /files/
/files/ /files/
/images/ /images/

46
main.py
View file

@@ -1,30 +1,42 @@
import datetime
import os import os
import re import re
from stat import S_ISFIFO from stat import S_ISFIFO
import sys import sys
from urllib.parse import urlparse, urlunparse, urljoin
import bs4 import bs4
from urllib.request import Request, urlopen from urllib.request import Request, urlopen
from os.path import exists
from shutil import move
import language_tool_python import language_tool_python
import argparse import argparse
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
def spider(prefix, domain, exclude):
return spider_rec(dict(), prefix, domain, "/", exclude) def spider(target, exclude):
parsed_target = urlparse(target)
return spider_rec(dict(), target, parsed_target, exclude)
def spider_rec(page_texts, prefix, domain, postfix, exclude): def spider_rec(page_texts, current_href, base_parse, exclude):
req = Request(prefix + domain + postfix) target_url = urlunparse(base_parse)
parse_result = urlparse(urljoin(target_url, current_href))
req = Request(urlunparse(parse_result))
postfix = parse_result.path
if len(postfix) == 0:
postfix = "/"
if parse_result.hostname == base_parse.hostname:
html_page = urlopen(req) html_page = urlopen(req)
soup = bs4.BeautifulSoup(html_page, "lxml") soup = bs4.BeautifulSoup(html_page, "lxml")
page_texts[postfix] = [soup.getText(), soup.find_all('html')[0].get("lang")] page_texts[postfix] = [soup.getText(), soup.find_all('html')[0].get("lang")]
for link in soup.findAll('a'): for link in soup.findAll('a'):
href = link.get('href') href = link.get('href')
if "mailto:" not in href and (domain in href or href[0] == '/'): href = href.replace(" ", "%20")
if not urlparse(href).hostname:
href = urlparse(urljoin(target_url, href)).path
if "mailto:" not in href:
if href not in page_texts.keys(): if href not in page_texts.keys():
found = False found = False
for d in exclude: for d in exclude:
@@ -35,11 +47,7 @@ def spider_rec(page_texts, prefix, domain, postfix, exclude):
if found: if found:
continue continue
href = href.replace(" ", "%20") spider_rec(page_texts, href, base_parse, exclude)
if domain in href:
spider_rec(page_texts, "", "", href, exclude)
else:
spider_rec(page_texts, prefix, domain, href, exclude)
return page_texts return page_texts
@@ -76,15 +84,13 @@ def main(report: bool):
line = line.replace("\r", "") line = line.replace("\r", "")
conf.append(line) conf.append(line)
domain = conf[1] target = conf[1]
prefix = conf[3] ignores = conf[3:conf.index("# Custom Dictionary Ex: Strato")]
ignores = conf[5:conf.index("# Custom Dictionary Ex: Strato")]
custDict = conf[conf.index("# Custom Dictionary Ex: Strato") + 1::] custDict = conf[conf.index("# Custom Dictionary Ex: Strato") + 1::]
if not report: if not report:
print("Crawling site...") print("Crawling site...")
links = spider(prefix, domain, ignores) links = spider(target, ignores)
date = datetime.datetime.utcnow()
if not report: if not report:
print("Starting local language servers for") print("Starting local language servers for")