# link-crawler/main.py
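"""Crawl a site starting from "/", follow internal links, and report every page
that answers with an HTTP error, together with the pages that link to it."""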

import urllib.error

import bs4
from urllib.request import Request, urlopen
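# bs4 comes from the beautifulsoup4 package; the "lxml" parser requested below
# additionally requires the lxml package to be installed.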


def spider(prefix, domain, exclude):
    # Crawl the whole site, starting from the root path "/".
    return spider_rec(dict(), prefix, domain, "/", exclude)


def spider_rec(page_links, prefix, domain, postfix, exclude):
    req = Request(prefix + domain + postfix)
    try:
        html_page = urlopen(req)
    except urllib.error.HTTPError as err:
        # Keep the error response so the broken link can be reported later.
        page_links[postfix] = err
        return page_links
    page_links[postfix] = []
    soup = bs4.BeautifulSoup(html_page, "lxml")
    for link in soup.find_all('a'):
        href = link.get('href')
        if href is None:
            continue
        # Only follow internal links: same domain or site-relative paths.
        if "mailto:" not in href and (domain in href or href.startswith('/')):
            page_links[postfix].append(href)
            if href not in page_links.keys():
                # Skip links matching any of the configured exclude patterns.
                found = False
                for d in exclude:
                    if d in href:
                        found = True
                        break
                if found:
                    continue
                href = href.replace(" ", "%20")
                if domain in href:
                    # Absolute URL: it already carries the scheme and domain.
                    spider_rec(page_links, "", "", href, exclude)
                else:
                    # Relative path: reuse the configured prefix and domain.
                    spider_rec(page_links, prefix, domain, href, exclude)
    return page_links
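

# A minimal usage sketch (the URL pieces here are hypothetical, not taken from
# crawl.conf): spider() returns a dict mapping every visited path to either the
# list of internal hrefs found on that page or, for broken links, the HTTPError
# raised when requesting it.
#
#   pages = spider("https://", "example.com", ["/tags/"])
#   broken = [p for p in pages if isinstance(pages[p], urllib.error.HTTPError)]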


def main():
    print("Reading conf...")
    conf = []
    with open('crawl.conf', 'r') as file:
        for line in file.readlines():
            line = line.replace("\n", "")
            line = line.replace("\r", "")
            conf.append(line)
    domain = conf[1]
    prefix = conf[3]
    ignores = conf[5:]
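    # Assumed layout of crawl.conf, inferred from the indices above; the label
    # lines (0, 2 and 4) are a guess, only the value positions are used:
    #
    #   domain:
    #   example.com
    #   prefix:
    #   https://
    #   ignore:
    #   /static/
    #   /media/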
print("Crawling site...")
pages = spider(prefix, domain, ignores)
count = 0
for link in pages.keys():
if type(pages[link]) == http.client.HTTPResponse:
count += 1
found = []
for search_link in pages.keys():
if type(pages[link]) != http.client.HTTPResponse:
for href in pages[link]:
if href == link:
found.append(href)
print(''.join(['='] * 100))
print(link, pages[link].status, pages[link].reason)
print(''.join(['-'] * 100))
print("Found in:")
for href in found:
print(href)
print("Done.")
main()