From 7814e210bc4b6c8850dcf73fd219a4b309533980 Mon Sep 17 00:00:00 2001
From: bMorgan01
Date: Mon, 19 Sep 2022 19:26:35 -0600
Subject: [PATCH] done

---
 crawl.conf |  6 ++--
 main.py    | 88 +++++++++++++++++++++++++++++++-----------------------
 2 files changed, 53 insertions(+), 41 deletions(-)

diff --git a/crawl.conf b/crawl.conf
index 15c50ff..0d3b951 100644
--- a/crawl.conf
+++ b/crawl.conf
@@ -1,7 +1,5 @@
-# Domain Ex: benrmorgan.com
-benrmorgan.com
-# Prefix Ex: http://www.
-http://www.
+# Target Ex: http://www.google.com
+http://dev-builder.benrmorgan.com
 # Ignore urls containing Ex: /files/
 /files/
 /images/
diff --git a/main.py b/main.py
index 2dfcd78..79a9c0c 100644
--- a/main.py
+++ b/main.py
@@ -1,43 +1,56 @@
 import http
 import bs4
 from urllib.request import Request, urlopen
+from urllib.error import HTTPError
+from urllib.parse import urlparse, urlunparse, urljoin
+import re
 
 
-def spider(prefix, domain, exclude):
-    return spider_rec(dict(), prefix, domain, "/", exclude)
+def spider(target, exclude):
+    parsed_target = urlparse(target)
+    return spider_rec(dict(), target, parsed_target, exclude)
 
 
-def spider_rec(page_links, prefix, domain, postfix, exclude):
-    req = Request(prefix + domain + postfix)
-    html_page = urlopen(req)
+def spider_rec(page_links, current_href, base_parse, exclude):
+    target_url = urlunparse(base_parse)
+    parse_result = urlparse(urljoin(target_url, current_href))
+    req = Request(urlunparse(parse_result))
+    postfix = parse_result.path
+
+    if len(postfix) == 0:
+        postfix = "/"
 
-    if int(html_page.status) >= 400:
-        page_links[postfix] = html_page
-    else:
-        page_links[postfix] = []
+    try:
+        html_page = urlopen(req)
+
+        if parse_result.hostname == base_parse.hostname:
+            page_links[postfix] = []
 
-        soup = bs4.BeautifulSoup(html_page, "lxml")
-
-        for link in soup.findAll('a'):
-            href = link.get('href')
-            if "mailto:" not in href and (domain in href or href[0] == '/'):
-                page_links[postfix].append(href)
-
-                if href not in page_links.keys():
-                    found = False
-                    for d in exclude:
-                        if d in href:
-                            found = True
-                            break
-
-                    if found:
-                        continue
+            soup = bs4.BeautifulSoup(html_page, "lxml")
+            for link in soup.findAll('a'):
+                href = link.get('href')
 
                 href = href.replace(" ", "%20")
-                if domain in href:
-                    spider_rec(page_links, "", "", href, exclude)
-                else:
-                    spider_rec(page_links, prefix, domain, href, exclude)
+
+                if not urlparse(href).hostname:
+                    href = urlparse(urljoin(target_url, href)).path
+
+                if "mailto:" not in href:
+                    page_links[postfix].append(href)
+
+                    if href not in page_links.keys():
+                        found = False
+                        for d in exclude:
+                            if d in href:
+                                found = True
+                                break
+
+                        if found:
+                            continue
+
+                        spider_rec(page_links, href, base_parse, exclude)
+    except HTTPError as e:
+        page_links[postfix] = e
 
     return page_links
 
@@ -52,24 +65,23 @@ def main():
         line = line.replace("\r", "")
         conf.append(line)
 
-    domain = conf[1]
-    prefix = conf[3]
-    ignores = conf[5:]
+    target = conf[1]
+    ignores = conf[3:]
 
     print("Crawling site...")
-    pages = spider(prefix, domain, ignores)
+    pages = spider(target, ignores)
 
     count = 0
     for link in pages.keys():
-        if type(pages[link]) == http.client.HTTPResponse:
+        if type(pages[link]) == HTTPError:
             count += 1
 
             found = []
             for search_link in pages.keys():
-                if type(pages[link]) != http.client.HTTPResponse:
-                    for href in pages[link]:
+                if type(pages[search_link]) != HTTPError:
+                    for href in pages[search_link]:
                         if href == link:
-                            found.append(href)
+                            found.append(search_link)
 
             print(''.join(['='] * 100))
             print(link, pages[link].status, pages[link].reason)
@@ -79,6 +91,8 @@ def main():
 
             for href in found:
                 print(href)
 
+            print(''.join(['='] * 100), "\n")
+
     print("Done.")
 
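
The rewrite leans on urllib.parse to normalize each href before it is followed: spider_rec() resolves the link against the configured target with urljoin(), keeps only the path when the link has no hostname of its own, and only parses pages whose hostname matches the target's for further links. A minimal sketch of that normalization, using the target from crawl.conf and a made-up href purely as illustrative inputs:

    from urllib.parse import urlparse, urljoin

    target = "http://dev-builder.benrmorgan.com"

    # A relative href (illustrative) resolves against the target; spaces are
    # percent-encoded first, mirroring href.replace(" ", "%20") in the patch.
    href = "/about me.html".replace(" ", "%20")
    print(urljoin(target, href))                  # http://dev-builder.benrmorgan.com/about%20me.html
    print(urlparse(urljoin(target, href)).path)   # /about%20me.html

    # An absolute href keeps its own hostname, which is how off-site links
    # can be told apart from pages on the crawled site.
    print(urlparse(urljoin(target, "https://example.com/page")).hostname)  # example.com

Links that fail to load are stored as the raised HTTPError, so main() can list each broken path together with the pages that reference it.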