Merge branch 'master' of github.com:bMorgan01/SpellingSpider

2022-09-25 10:28:30 -06:00 · 2022-09-25 10:28:30 -06:00 · 5b12550e9b
commit 5b12550e9b
parent 4b6abfd511 dc50d7702a
2 changed files with 49 additions and 37 deletions
--- a/crawl.conf
+++ b/crawl.conf
@@ -1,7 +1,5 @@
-# Domain                    Ex: benrmorgan.com
+# Target
-benrmorgan.com
+http://www.benrmorgan.com
 # Prefix                    Ex: http://www.
 http://www.
 # Ignore urls containing    Ex: /files/
 /files/
 /images/
--- a/main.py
+++ b/main.py
@@ -1,48 +1,64 @@
 import datetime
 import os
 import re
 from stat import S_ISFIFO
 import sys
 from urllib.parse import urlparse, urlunparse, urljoin
 import bs4
 from urllib.request import Request, urlopen
 from os.path import exists
 from shutil import move
 import language_tool_python
 import argparse
 parser = argparse.ArgumentParser()
-def spider(prefix, domain, exclude):
+
-    return spider_rec(dict(), prefix, domain, "/", exclude)
+def spider(target, exclude):
    parsed_target = urlparse(target)
    return spider_rec(dict(), target, parsed_target, exclude)
-def spider_rec(page_texts, prefix, domain, postfix, exclude):
+def spider_rec(page_texts, current_href, base_parse, exclude):
-    req = Request(prefix + domain + postfix)
+    target_url = urlunparse(base_parse)
-    html_page = urlopen(req)
+    parse_result = urlparse(urljoin(target_url, current_href))
    req = Request(urlunparse(parse_result))
-    soup = bs4.BeautifulSoup(html_page, "lxml")
+    postfix = parse_result.path
    if parse_result.query:
        postfix += "?" + parse_result.query
-    page_texts[postfix] = [soup.getText(), soup.find_all('html')[0].get("lang")]
+    if len(postfix) == 0:
-    if page_texts[postfix][1] is None:
+        postfix = "/"
        page_texts[postfix][1] = 'en-us'
-    for link in soup.findAll('a'):
+    if parse_result.hostname == base_parse.hostname:
-        href = link.get('href')
+        html_page = urlopen(req)
-        if "mailto:" not in href and (domain in href or href[0] == '/'):
+        soup = bs4.BeautifulSoup(html_page, "lxml")
-            if href not in page_texts.keys():
+        page_texts[postfix] = [soup.getText(), soup.find_all('html')[0].get("lang")]
                found = False
                for d in exclude:
                    if d in href:
                        found = True
                        break
-                if found:
+        if page_texts[postfix][1] is None:
-                    continue
+            page_texts[postfix][1] = 'en-us'
-                href = href.replace(" ", "%20")
+        for link in soup.findAll('a'):
-                if domain in href:
+            href = link.get('href')
-                    spider_rec(page_texts, "", "", href, exclude)
+            href = href.replace(" ", "%20")
-                else:
+                
-                    spider_rec(page_texts, prefix, domain, href, exclude)
+            if "mailto:" not in href:
                if not urlparse(href).hostname:
                    href_parse = urlparse(urljoin(target_url, href))
                    href = href_parse.path
                    if href_parse.query:
                        href += "?" + href_parse.query
                if href not in page_texts.keys():
                    found = False
                    for d in exclude:
                        if d in href:
                            found = True
                            break
                    if found:
                        continue
                    spider_rec(page_texts, href, base_parse, exclude)
    return page_texts
@@ -79,15 +95,13 @@ def main(report: bool):
            line = line.replace("\r", "")
            conf.append(line)
-    domain = conf[1]
+    target = conf[1]
-    prefix = conf[3]
+    ignores = conf[3:conf.index("# Custom Dictionary         Ex: Strato")]
    ignores = conf[5:conf.index("# Custom Dictionary         Ex: Strato")]
    custDict = conf[conf.index("# Custom Dictionary         Ex: Strato") + 1::]
    if not report:
        print("Crawling site...")
-    links = spider(prefix, domain, ignores)
+    links = spider(target, ignores)
    date = datetime.datetime.utcnow()
    if not report:
        print("Starting local language servers for")