bMorgan01 2022-09-19 19:26:35 -06:00
parent 75c911f6d2
commit 7814e210bc
2 changed files with 53 additions and 41 deletions


@@ -1,7 +1,5 @@
-# Domain Ex: benrmorgan.com
-benrmorgan.com
-# Prefix Ex: http://www.
-http://www.
+# Target Ex: http://www.google.com
+http://dev-builder.benrmorgan.com
 # Ignore urls containing Ex: /files/
 /files/
 /images/
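For reference, a minimal sketch of how main() (changed below) consumes this reworked config: the line after the target comment is now a single full target URL, replacing the separate domain and prefix lines, and every line after the ignore comment is treated as a substring to exclude. The filename config.txt and the read loop here are assumptions for illustration; only the conf[1]/conf[3:] indexing comes from this diff.

# Hypothetical reader for the new-style config (filename assumed).
def read_conf(path="config.txt"):
    with open(path) as f:
        conf = [line.rstrip("\r\n") for line in f]
    target = conf[1]    # line after the "# Target Ex: ..." comment
    ignores = conf[3:]  # lines after the "# Ignore urls containing ..." comment
    return target, ignores

target, ignores = read_conf()
# target  -> "http://dev-builder.benrmorgan.com"
# ignores -> ["/files/", "/images/"]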

main.py

@@ -1,26 +1,41 @@
 import http
 import bs4
 from urllib.request import Request, urlopen
+from urllib.error import HTTPError
+from urllib.parse import urlparse, urlunparse, urljoin
+import re


-def spider(prefix, domain, exclude):
-    return spider_rec(dict(), prefix, domain, "/", exclude)
+def spider(target, exclude):
+    parsed_target = urlparse(target)
+    return spider_rec(dict(), target, parsed_target, exclude)


-def spider_rec(page_links, prefix, domain, postfix, exclude):
-    req = Request(prefix + domain + postfix)
-    html_page = urlopen(req)
-    if int(html_page.status) >= 400:
-        page_links[postfix] = html_page
-    else:
-        page_links[postfix] = []
+def spider_rec(page_links, current_href, base_parse, exclude):
+    target_url = urlunparse(base_parse)
+    parse_result = urlparse(urljoin(target_url, current_href))
+    req = Request(urlunparse(parse_result))
+    postfix = parse_result.path
+    if len(postfix) == 0:
+        postfix = "/"
+
+    try:
+        html_page = urlopen(req)
+        if parse_result.hostname == base_parse.hostname:
+            page_links[postfix] = []

-        soup = bs4.BeautifulSoup(html_page, "lxml")
-        for link in soup.findAll('a'):
-            href = link.get('href')
-            if "mailto:" not in href and (domain in href or href[0] == '/'):
-                page_links[postfix].append(href)
+            soup = bs4.BeautifulSoup(html_page, "lxml")
+            for link in soup.findAll('a'):
+                href = link.get('href')
+                href = href.replace(" ", "%20")
+                if not urlparse(href).hostname:
+                    href = urlparse(urljoin(target_url, href)).path
+                if "mailto:" not in href:
+                    page_links[postfix].append(href)

-                if href not in page_links.keys():
+                    if href not in page_links.keys():
@@ -33,11 +48,9 @@ def spider_rec(page_links, prefix, domain, postfix, exclude):
                     if found:
                         continue

-                    href = href.replace(" ", "%20")
-                    if domain in href:
-                        spider_rec(page_links, "", "", href, exclude)
-                    else:
-                        spider_rec(page_links, prefix, domain, href, exclude)
+                    spider_rec(page_links, href, base_parse, exclude)
+    except HTTPError as e:
+        page_links[postfix] = e

     return page_links
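The removed status check was replaced because urlopen() raises urllib.error.HTTPError for 4xx/5xx responses rather than returning a response object, so a branch on html_page.status >= 400 would not normally be reached; the new code catches the exception and stores it instead. A minimal sketch of that try/except pattern, with an arbitrary example URL and path:

from urllib.request import Request, urlopen
from urllib.error import HTTPError

page_links = {}
postfix = "/definitely-missing-page"  # arbitrary example path
try:
    html_page = urlopen(Request("http://example.com" + postfix))
    page_links[postfix] = []          # page loaded: collect its links here
except HTTPError as e:
    page_links[postfix] = e           # broken link: store the error itself
    print(postfix, e.code, e.reason)  # e.g. 404 Not Found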
@@ -52,24 +65,23 @@ def main():
             line = line.replace("\r", "")
             conf.append(line)

-    domain = conf[1]
-    prefix = conf[3]
-    ignores = conf[5:]
+    target = conf[1]
+    ignores = conf[3:]

     print("Crawling site...")
-    pages = spider(prefix, domain, ignores)
+    pages = spider(target, ignores)

     count = 0
     for link in pages.keys():
-        if type(pages[link]) == http.client.HTTPResponse:
+        if type(pages[link]) == HTTPError:
             count += 1

             found = []
             for search_link in pages.keys():
-                if type(pages[link]) != http.client.HTTPResponse:
-                    for href in pages[link]:
+                if type(pages[search_link]) != HTTPError:
+                    for href in pages[search_link]:
                         if href == link:
-                            found.append(href)
+                            found.append(search_link)

             print(''.join(['='] * 100))
             print(link, pages[link].status, pages[link].reason)
@@ -79,6 +91,8 @@ def main():
             for href in found:
                 print(href)

+            print(''.join(['='] * 100), "\n")
+
     print("Done.")