This commit is contained in:
bMorgan01 2022-09-18 20:12:41 -06:00
parent 5ad1802e17
commit 75c911f6d2

29
main.py
View file

@@ -1,3 +1,4 @@
import http
import bs4
from urllib.request import Request, urlopen
@@ -10,7 +11,9 @@ def spider_rec(page_links, prefix, domain, postfix, exclude):
req = Request(prefix + domain + postfix)
html_page = urlopen(req)
print(html_page.status)
if int(html_page.status) >= 400:
page_links[postfix] = html_page
else:
page_links[postfix] = []
soup = bs4.BeautifulSoup(html_page, "lxml")
@@ -54,7 +57,29 @@ def main():
ignores = conf[5:]
print("Crawling site...")
links = spider(prefix, domain, ignores)
pages = spider(prefix, domain, ignores)
count = 0
for link in pages.keys():
if type(pages[link]) == http.client.HTTPResponse:
count += 1
found = []
for search_link in pages.keys():
if type(pages[link]) != http.client.HTTPResponse:
for href in pages[link]:
if href == link:
found.append(href)
print(''.join(['='] * 100))
print(link, pages[link].status, pages[link].reason)
print(''.join(['-'] * 100))
print("Found in:")
for href in found:
print(href)
print("Done.")
main()