diff --git a/main.py b/main.py index 35134ea..2dfcd78 100644 --- a/main.py +++ b/main.py @@ -1,3 +1,4 @@ +import http import bs4 from urllib.request import Request, urlopen @@ -10,8 +11,10 @@ def spider_rec(page_links, prefix, domain, postfix, exclude): req = Request(prefix + domain + postfix) html_page = urlopen(req) - print(html_page.status) - page_links[postfix] = [] + if int(html_page.status) >= 400: + page_links[postfix] = html_page + else: + page_links[postfix] = [] soup = bs4.BeautifulSoup(html_page, "lxml") @@ -54,7 +57,29 @@ def main(): ignores = conf[5:] print("Crawling site...") - links = spider(prefix, domain, ignores) + pages = spider(prefix, domain, ignores) + + count = 0 + for link in pages.keys(): + if type(pages[link]) == http.client.HTTPResponse: + count += 1 + + found = [] + for search_link in pages.keys(): + if type(pages[link]) != http.client.HTTPResponse: + for href in pages[link]: + if href == link: + found.append(href) + + print(''.join(['='] * 100)) + print(link, pages[link].status, pages[link].reason) + print(''.join(['-'] * 100)) + print("Found in:") + + for href in found: + print(href) + + print("Done.") main()