Make crawl.conf match the INI spec

This commit is contained in:
bMorgan01 2022-09-25 11:17:35 -06:00
parent e587b58444
commit 72a195a0e7
2 changed files with 11 additions and 13 deletions

View file

@@ -1,5 +1,5 @@
# Target Ex: http://www.google.com [Config]
http://www.benrmorgan.com ; Target site
# Ignore urls containing Ex: /files/ site = http://www.benrmorgan.com
/files/ ; Ignore urls containing Ex: /files/
/images/ ignore = /files/, /images/

14
main.py
View file

@@ -1,3 +1,4 @@
import configparser
import bs4 import bs4
from urllib.request import Request, urlopen from urllib.request import Request, urlopen
from urllib.error import HTTPError from urllib.error import HTTPError
@@ -74,15 +75,12 @@ def spider_rec(page_links, current_href, base_parse, exclude):
def main(): def main():
print("Reading conf...") print("Reading conf...")
conf = [] config = configparser.ConfigParser()
with open('crawl.conf', 'r') as file: config.read('crawl.conf')
for line in file.readlines(): config = config['Config']
line = line.replace("\n", "")
line = line.replace("\r", "")
conf.append(line)
target = conf[1] target = config['site']
ignores = conf[3:] ignores = config['ignore'].split(', ')
print("Crawling site...") print("Crawling site...")
pages = spider(target, ignores) pages = spider(target, ignores)