config
This commit is contained in:
parent
653ce94324
commit
021d67c881
1 changed file with 6 additions and 6 deletions
12
main.py
Normal file → Executable file
12
main.py
Normal file → Executable file
|
|
@@ -15,17 +15,17 @@ def spider(prefix, domain, exclude):
|
||||||
return spider_rec(dict(), prefix, domain, "/", exclude)
|
return spider_rec(dict(), prefix, domain, "/", exclude)
|
||||||
|
|
||||||
|
|
||||||
def spider_rec(links, prefix, domain, postfix, exclude):
|
def spider_rec(page_texts, prefix, domain, postfix, exclude):
|
||||||
req = Request(prefix + domain + postfix)
|
req = Request(prefix + domain + postfix)
|
||||||
html_page = urlopen(req)
|
html_page = urlopen(req)
|
||||||
|
|
||||||
soup = bs4.BeautifulSoup(html_page, "lxml")
|
soup = bs4.BeautifulSoup(html_page, "lxml")
|
||||||
|
|
||||||
links[postfix] = [soup.getText(), soup.find_all('html')[0].get("lang")]
|
page_texts[postfix] = [soup.getText(), soup.find_all('html')[0].get("lang")]
|
||||||
for link in soup.findAll('a'):
|
for link in soup.findAll('a'):
|
||||||
href = link.get('href')
|
href = link.get('href')
|
||||||
if "mailto:" not in href and (domain in href or href[0] == '/'):
|
if "mailto:" not in href and (domain in href or href[0] == '/'):
|
||||||
if href not in links.keys():
|
if href not in page_texts.keys():
|
||||||
found = False
|
found = False
|
||||||
for d in exclude:
|
for d in exclude:
|
||||||
if d in href:
|
if d in href:
|
||||||
|
|
@@ -37,11 +37,11 @@ def spider_rec(links, prefix, domain, postfix, exclude):
|
||||||
|
|
||||||
href = href.replace(" ", "%20")
|
href = href.replace(" ", "%20")
|
||||||
if domain in href:
|
if domain in href:
|
||||||
spider_rec(links, "", "", href, exclude)
|
spider_rec(page_texts, "", "", href, exclude)
|
||||||
else:
|
else:
|
||||||
spider_rec(links, prefix, domain, href, exclude)
|
spider_rec(page_texts, prefix, domain, href, exclude)
|
||||||
|
|
||||||
return links
|
return page_texts
|
||||||
|
|
||||||
|
|
||||||
def split(txt, seps):
|
def split(txt, seps):
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue