bMorgan01 2022-09-19 19:26:35 -06:00
parent 75c911f6d2
commit 7814e210bc
2 changed files with 53 additions and 41 deletions


@@ -1,7 +1,5 @@
-# Domain Ex: benrmorgan.com
-benrmorgan.com
-# Prefix Ex: http://www.
-http://www.
+# Target Ex: http://www.google.com
+http://dev-builder.benrmorgan.com
 # Ignore urls containing Ex: /files/
 /files/
 /images/
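For reference, a minimal sketch of how main() (changed below) consumes this reworked config: the line after the target comment is now a single full target URL, replacing the separate domain and prefix lines, and every line after the ignore comment is treated as a substring to exclude. The filename config.txt and the read loop here are assumptions for illustration; only the conf[1]/conf[3:] indexing comes from this diff.

# Hypothetical reader for the new-style config (filename assumed).
def read_conf(path="config.txt"):
    with open(path) as f:
        conf = [line.rstrip("\r\n") for line in f]
    target = conf[1]    # line after the "# Target Ex: ..." comment
    ignores = conf[3:]  # lines after the "# Ignore urls containing ..." comment
    return target, ignores

target, ignores = read_conf()
# target  -> "http://dev-builder.benrmorgan.com"
# ignores -> ["/files/", "/images/"]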

main.py

@@ -1,26 +1,41 @@
 import http
 import bs4
 from urllib.request import Request, urlopen
+from urllib.error import HTTPError
+from urllib.parse import urlparse, urlunparse, urljoin
+import re


-def spider(prefix, domain, exclude):
-    return spider_rec(dict(), prefix, domain, "/", exclude)
+def spider(target, exclude):
+    parsed_target = urlparse(target)
+    return spider_rec(dict(), target, parsed_target, exclude)


-def spider_rec(page_links, prefix, domain, postfix, exclude):
-    req = Request(prefix + domain + postfix)
-    html_page = urlopen(req)
-    if int(html_page.status) >= 400:
-        page_links[postfix] = html_page
-    else:
-        page_links[postfix] = []
+def spider_rec(page_links, current_href, base_parse, exclude):
+    target_url = urlunparse(base_parse)
+    parse_result = urlparse(urljoin(target_url, current_href))
+    req = Request(urlunparse(parse_result))
+    postfix = parse_result.path
+    if len(postfix) == 0:
+        postfix = "/"
+
+    try:
+        html_page = urlopen(req)
+        if parse_result.hostname == base_parse.hostname:
+            page_links[postfix] = []

-        soup = bs4.BeautifulSoup(html_page, "lxml")
-        for link in soup.findAll('a'):
-            href = link.get('href')
-            if "mailto:" not in href and (domain in href or href[0] == '/'):
-                page_links[postfix].append(href)
+            soup = bs4.BeautifulSoup(html_page, "lxml")
+            for link in soup.findAll('a'):
+                href = link.get('href')
+                href = href.replace(" ", "%20")
+                if not urlparse(href).hostname:
+                    href = urlparse(urljoin(target_url, href)).path
+                if "mailto:" not in href:
+                    page_links[postfix].append(href)

-                if href not in page_links.keys():
+                    if href not in page_links.keys():
@@ -33,11 +48,9 @@ def spider_rec(page_links, prefix, domain, postfix, exclude):
                     if found:
                         continue

-                    href = href.replace(" ", "%20")
-                    if domain in href:
-                        spider_rec(page_links, "", "", href, exclude)
-                    else:
-                        spider_rec(page_links, prefix, domain, href, exclude)
+                    spider_rec(page_links, href, base_parse, exclude)
+    except HTTPError as e:
+        page_links[postfix] = e

     return page_links
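The removed status check was replaced because urlopen() raises urllib.error.HTTPError for 4xx/5xx responses rather than returning a response object, so a branch on html_page.status >= 400 would not normally be reached; the new code catches the exception and stores it instead. A minimal sketch of that try/except pattern, with an arbitrary example URL and path:

from urllib.request import Request, urlopen
from urllib.error import HTTPError

page_links = {}
postfix = "/definitely-missing-page"  # arbitrary example path
try:
    html_page = urlopen(Request("http://example.com" + postfix))
    page_links[postfix] = []          # page loaded: collect its links here
except HTTPError as e:
    page_links[postfix] = e           # broken link: store the error itself
    print(postfix, e.code, e.reason)  # e.g. 404 Not Found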
@@ -52,24 +65,23 @@ def main():
             line = line.replace("\r", "")
             conf.append(line)

-    domain = conf[1]
-    prefix = conf[3]
-    ignores = conf[5:]
+    target = conf[1]
+    ignores = conf[3:]

     print("Crawling site...")
-    pages = spider(prefix, domain, ignores)
+    pages = spider(target, ignores)

     count = 0
     for link in pages.keys():
-        if type(pages[link]) == http.client.HTTPResponse:
+        if type(pages[link]) == HTTPError:
             count += 1

             found = []
             for search_link in pages.keys():
-                if type(pages[link]) != http.client.HTTPResponse:
-                    for href in pages[link]:
+                if type(pages[search_link]) != HTTPError:
+                    for href in pages[search_link]:
                         if href == link:
-                            found.append(href)
+                            found.append(search_link)

             print(''.join(['='] * 100))
             print(link, pages[link].status, pages[link].reason)
@@ -79,6 +91,8 @@ def main():
             for href in found:
                 print(href)

+            print(''.join(['='] * 100), "\n")
+
     print("Done.")