From 021d67c88156bff3ae17725c12bffc93a4e7584a Mon Sep 17 00:00:00 2001 From: bMorgan01 Date: Sun, 18 Sep 2022 19:43:52 -0600 Subject: [PATCH] config --- main.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) mode change 100644 => 100755 main.py diff --git a/main.py b/main.py old mode 100644 new mode 100755 index 83e61e4..a287f6a --- a/main.py +++ b/main.py @@ -15,17 +15,17 @@ def spider(prefix, domain, exclude): return spider_rec(dict(), prefix, domain, "/", exclude) -def spider_rec(links, prefix, domain, postfix, exclude): +def spider_rec(page_texts, prefix, domain, postfix, exclude): req = Request(prefix + domain + postfix) html_page = urlopen(req) soup = bs4.BeautifulSoup(html_page, "lxml") - links[postfix] = [soup.getText(), soup.find_all('html')[0].get("lang")] + page_texts[postfix] = [soup.getText(), soup.find_all('html')[0].get("lang")] for link in soup.findAll('a'): href = link.get('href') if "mailto:" not in href and (domain in href or href[0] == '/'): - if href not in links.keys(): + if href not in page_texts.keys(): found = False for d in exclude: if d in href: @@ -37,11 +37,11 @@ def spider_rec(links, prefix, domain, postfix, exclude): href = href.replace(" ", "%20") if domain in href: - spider_rec(links, "", "", href, exclude) + spider_rec(page_texts, "", "", href, exclude) else: - spider_rec(links, prefix, domain, href, exclude) + spider_rec(page_texts, prefix, domain, href, exclude) - return links + return page_texts def split(txt, seps):