prettify

2022-09-18 20:12:41 -06:00 · 2022-09-18 20:12:41 -06:00 · 75c911f6d2
commit 75c911f6d2
parent 5ad1802e17
1 changed files with 28 additions and 3 deletions
--- a/main.py
+++ b/main.py
@ -1,3 +1,4 @@
+import http
 import bs4
 from urllib.request import Request, urlopen

@ -10,7 +11,9 @@ def spider_rec(page_links, prefix, domain, postfix, exclude):
    req = Request(prefix + domain + postfix)
    html_page = urlopen(req)

-    print(html_page.status)
+    if int(html_page.status) >= 400:
+        page_links[postfix] = html_page
+    else:
        page_links[postfix] = []

    soup = bs4.BeautifulSoup(html_page, "lxml")
@ -54,7 +57,29 @@ def main():
    ignores = conf[5:]

    print("Crawling site...")
-    links = spider(prefix, domain, ignores)
+    pages = spider(prefix, domain, ignores)
+
+    count = 0
+    for link in pages.keys():
+        if type(pages[link]) == http.client.HTTPResponse:
+            count += 1
+
+            found = []
+            for search_link in pages.keys():
+                if type(pages[link]) != http.client.HTTPResponse:
+                    for href in pages[link]:
+                        if href == link:
+                            found.append(href)
+
+            print(''.join(['='] * 100))
+            print(link, pages[link].status, pages[link].reason)
+            print(''.join(['-'] * 100))
+            print("Found in:")
+
+            for href in found:
+                print(href)
+
+    print("Done.")


 main()