config
This commit is contained in:
parent
653ce94324
commit
021d67c881
1 changed files with 6 additions and 6 deletions
12
main.py
Normal file → Executable file
12
main.py
Normal file → Executable file
|
|
@@ -15,17 +15,17 @@ def spider(prefix, domain, exclude):
|
|||
return spider_rec(dict(), prefix, domain, "/", exclude)
|
||||
|
||||
|
||||
def spider_rec(links, prefix, domain, postfix, exclude):
|
||||
def spider_rec(page_texts, prefix, domain, postfix, exclude):
|
||||
req = Request(prefix + domain + postfix)
|
||||
html_page = urlopen(req)
|
||||
|
||||
soup = bs4.BeautifulSoup(html_page, "lxml")
|
||||
|
||||
links[postfix] = [soup.getText(), soup.find_all('html')[0].get("lang")]
|
||||
page_texts[postfix] = [soup.getText(), soup.find_all('html')[0].get("lang")]
|
||||
for link in soup.findAll('a'):
|
||||
href = link.get('href')
|
||||
if "mailto:" not in href and (domain in href or href[0] == '/'):
|
||||
if href not in links.keys():
|
||||
if href not in page_texts.keys():
|
||||
found = False
|
||||
for d in exclude:
|
||||
if d in href:
|
||||
|
|
@@ -37,11 +37,11 @@ def spider_rec(links, prefix, domain, postfix, exclude):
|
|||
|
||||
href = href.replace(" ", "%20")
|
||||
if domain in href:
|
||||
spider_rec(links, "", "", href, exclude)
|
||||
spider_rec(page_texts, "", "", href, exclude)
|
||||
else:
|
||||
spider_rec(links, prefix, domain, href, exclude)
|
||||
spider_rec(page_texts, prefix, domain, href, exclude)
|
||||
|
||||
return links
|
||||
return page_texts
|
||||
|
||||
|
||||
def split(txt, seps):
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue