Make crawl.conf match the INI spec
This commit is contained in:
parent
e587b58444
commit
72a195a0e7
2 changed files with 11 additions and 13 deletions
10
crawl.conf
10
crawl.conf
|
|
@@ -1,5 +1,5 @@
|
|||
# Target Ex: http://www.google.com
|
||||
http://www.benrmorgan.com
|
||||
# Ignore urls containing Ex: /files/
|
||||
/files/
|
||||
/images/
|
||||
[Config]
|
||||
; Target site
|
||||
site = http://www.benrmorgan.com
|
||||
; Ignore URLs containing any of these substrings, separated by ", " (e.g. /files/)
|
||||
ignore = /files/, /images/
|
||||
|
|
|
|||
14
main.py
14
main.py
|
|
@@ -1,3 +1,4 @@
|
|||
import configparser
|
||||
import bs4
|
||||
from urllib.request import Request, urlopen
|
||||
from urllib.error import HTTPError
|
||||
|
|
@@ -74,15 +75,12 @@ def spider_rec(page_links, current_href, base_parse, exclude):
|
|||
def main():
|
||||
print("Reading conf...")
|
||||
|
||||
conf = []
|
||||
with open('crawl.conf', 'r') as file:
|
||||
for line in file.readlines():
|
||||
line = line.replace("\n", "")
|
||||
line = line.replace("\r", "")
|
||||
conf.append(line)
|
||||
config = configparser.ConfigParser()
|
||||
config.read('crawl.conf')
|
||||
config = config['Config']
|
||||
|
||||
target = conf[1]
|
||||
ignores = conf[3:]
|
||||
target = config['site']
|
||||
ignores = config['ignore'].split(', ')
|
||||
|
||||
print("Crawling site...")
|
||||
pages = spider(target, ignores)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue