This commit is contained in:
bMorgan01 2022-09-18 19:43:52 -06:00
parent 653ce94324
commit 021d67c881

12
main.py Normal file → Executable file
View file

@ -15,17 +15,17 @@ def spider(prefix, domain, exclude):
return spider_rec(dict(), prefix, domain, "/", exclude)
def spider_rec(links, prefix, domain, postfix, exclude):
def spider_rec(page_texts, prefix, domain, postfix, exclude):
req = Request(prefix + domain + postfix)
html_page = urlopen(req)
soup = bs4.BeautifulSoup(html_page, "lxml")
links[postfix] = [soup.getText(), soup.find_all('html')[0].get("lang")]
page_texts[postfix] = [soup.getText(), soup.find_all('html')[0].get("lang")]
for link in soup.findAll('a'):
href = link.get('href')
if "mailto:" not in href and (domain in href or href[0] == '/'):
if href not in links.keys():
if href not in page_texts.keys():
found = False
for d in exclude:
if d in href:
@ -37,11 +37,11 @@ def spider_rec(links, prefix, domain, postfix, exclude):
href = href.replace(" ", "%20")
if domain in href:
spider_rec(links, "", "", href, exclude)
spider_rec(page_texts, "", "", href, exclude)
else:
spider_rec(links, prefix, domain, href, exclude)
spider_rec(page_texts, prefix, domain, href, exclude)
return links
return page_texts
def split(txt, seps):