prettify
This commit is contained in:
parent 5ad1802e17
commit 75c911f6d2

1 changed file with 28 additions and 3 deletions
main.py
@@ -1,3 +1,4 @@
+import http
 import bs4
 from urllib.request import Request, urlopen
 
@@ -10,7 +11,9 @@ def spider_rec(page_links, prefix, domain, postfix, exclude):
     req = Request(prefix + domain + postfix)
     html_page = urlopen(req)
 
-    print(html_page.status)
-    page_links[postfix] = []
+    if int(html_page.status) >= 400:
+        page_links[postfix] = html_page
+    else:
+        page_links[postfix] = []
 
     soup = bs4.BeautifulSoup(html_page, "lxml")
@@ -54,7 +57,29 @@ def main():
     ignores = conf[5:]
 
     print("Crawling site...")
-    links = spider(prefix, domain, ignores)
+    pages = spider(prefix, domain, ignores)
+
+    count = 0
+    for link in pages.keys():
+        if type(pages[link]) == http.client.HTTPResponse:
+            count += 1
+
+            found = []
+            for search_link in pages.keys():
+                if type(pages[search_link]) != http.client.HTTPResponse:
+                    for href in pages[search_link]:
+                        if href == link:
+                            found.append(search_link)
+
+            print(''.join(['='] * 100))
+            print(link, pages[link].status, pages[link].reason)
+            print(''.join(['-'] * 100))
+            print("Found in:")
+
+            for href in found:
+                print(href)
+
+    print("Done.")
 
 
 main()
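
A note on the data contract the new report loop relies on: spider() is assumed to return a dict mapping each crawled path either to the list of hrefs found on that page, or to the http.client.HTTPResponse object for pages that answered with status >= 400. Below is a minimal runnable sketch of that walk, with hypothetical sample paths and a stand-in response class (a real HTTPResponse needs a live socket); this is an illustration of the idea, not the commit's own code.

    # Stand-in for http.client.HTTPResponse: only the two attributes
    # the report loop reads are faked here.
    class FakeResponse:
        def __init__(self, status, reason):
            self.status = status
            self.reason = reason

    # Assumed shape of spider()'s return value: path -> hrefs for pages
    # that loaded, path -> response object for pages with status >= 400.
    pages = {
        "/index.html": ["/about.html", "/missing.html"],
        "/about.html": ["/missing.html"],
        "/missing.html": FakeResponse(404, "Not Found"),
    }

    # Same idea as the new loop in main(): for every failed page,
    # list the pages whose hrefs point at it.
    for link, value in pages.items():
        if isinstance(value, FakeResponse):  # real code checks http.client.HTTPResponse
            referrers = [src for src, hrefs in pages.items()
                         if not isinstance(hrefs, FakeResponse) and link in hrefs]
            print(link, value.status, value.reason, "- found in:", referrers)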