bMorgan01 2022-09-19 19:26:35 -06:00
parent 75c911f6d2
commit 7814e210bc
2 changed files with 53 additions and 41 deletions


@@ -1,7 +1,5 @@
-# Domain Ex: benrmorgan.com
-benrmorgan.com
-# Prefix Ex: http://www.
-http://www.
+# Target Ex: http://www.google.com
+http://dev-builder.benrmorgan.com
 # Ignore urls containing Ex: /files/
 /files/
 /images/
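The reworked main() (diffed below) consumes this file by position: after stripping carriage returns, line 2 (conf[1]) is the crawl target and lines 4 onward (conf[3:]) are substrings of URLs to skip. A minimal parsing sketch, assuming a placeholder filename "crawler.conf" since the config file's real name is not shown in this diff:

# Sketch only: how the updated main() reads this config file.
conf = []
with open("crawler.conf") as f:  # "crawler.conf" is a placeholder name
    for line in f:
        conf.append(line.replace("\r", "").rstrip("\n"))

target = conf[1]    # line 2: full target URL, e.g. http://dev-builder.benrmorgan.com
ignores = conf[3:]  # lines 4+: ignore patterns such as /files/ and /images/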

main.py (84 changed lines)

@ -1,43 +1,56 @@
import http import http
import bs4 import bs4
from urllib.request import Request, urlopen from urllib.request import Request, urlopen
from urllib.error import HTTPError
from urllib.parse import urlparse, urlunparse, urljoin
import re
def spider(prefix, domain, exclude): def spider(target, exclude):
return spider_rec(dict(), prefix, domain, "/", exclude) parsed_target = urlparse(target)
return spider_rec(dict(), target, parsed_target, exclude)
def spider_rec(page_links, prefix, domain, postfix, exclude): def spider_rec(page_links, current_href, base_parse, exclude):
req = Request(prefix + domain + postfix) target_url = urlunparse(base_parse)
html_page = urlopen(req) parse_result = urlparse(urljoin(target_url, current_href))
req = Request(urlunparse(parse_result))
postfix = parse_result.path
if int(html_page.status) >= 400: if len(postfix) == 0:
page_links[postfix] = html_page postfix = "/"
else:
page_links[postfix] = []
soup = bs4.BeautifulSoup(html_page, "lxml") try:
html_page = urlopen(req)
for link in soup.findAll('a'): if parse_result.hostname == base_parse.hostname:
href = link.get('href') page_links[postfix] = []
if "mailto:" not in href and (domain in href or href[0] == '/'):
page_links[postfix].append(href)
if href not in page_links.keys(): soup = bs4.BeautifulSoup(html_page, "lxml")
found = False
for d in exclude:
if d in href:
found = True
break
if found:
continue
for link in soup.findAll('a'):
href = link.get('href')
href = href.replace(" ", "%20") href = href.replace(" ", "%20")
if domain in href:
spider_rec(page_links, "", "", href, exclude) if not urlparse(href).hostname:
else: href = urlparse(urljoin(target_url, href)).path
spider_rec(page_links, prefix, domain, href, exclude)
if "mailto:" not in href:
page_links[postfix].append(href)
if href not in page_links.keys():
found = False
for d in exclude:
if d in href:
found = True
break
if found:
continue
spider_rec(page_links, href, base_parse, exclude)
except HTTPError as e:
page_links[postfix] = e
return page_links return page_links
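With this hunk, spider() takes the full target URL plus the ignore list and returns a dict keyed by page path, where each value is either the list of hrefs found on that page or the HTTPError raised while fetching it. A minimal usage sketch, assuming it runs in the same module as the code above and using the target and ignore patterns from the config file in this commit:

# Sketch only: calling the reworked crawler directly.
pages = spider("http://dev-builder.benrmorgan.com", ["/files/", "/images/"])

for path, result in pages.items():
    if isinstance(result, HTTPError):
        print("broken:", path, result.code, result.reason)
    else:
        print("ok:", path, "->", len(result), "links")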
@@ -52,24 +65,23 @@ def main():
         line = line.replace("\r", "")
         conf.append(line)
 
-    domain = conf[1]
-    prefix = conf[3]
-    ignores = conf[5:]
+    target = conf[1]
+    ignores = conf[3:]
 
     print("Crawling site...")
-    pages = spider(prefix, domain, ignores)
+    pages = spider(target, ignores)
 
     count = 0
     for link in pages.keys():
-        if type(pages[link]) == http.client.HTTPResponse:
+        if type(pages[link]) == HTTPError:
             count += 1
 
             found = []
             for search_link in pages.keys():
-                if type(pages[link]) != http.client.HTTPResponse:
-                    for href in pages[link]:
+                if type(pages[search_link]) != HTTPError:
+                    for href in pages[search_link]:
                         if href == link:
-                            found.append(href)
+                            found.append(search_link)
 
             print(''.join(['='] * 100))
             print(link, pages[link].status, pages[link].reason)
@@ -79,6 +91,8 @@ def main():
             for href in found:
                 print(href)
 
+            print(''.join(['='] * 100), "\n")
+
     print("Done.")