From 72a195a0e7217e908175b6d625003af79af1ce59 Mon Sep 17 00:00:00 2001 From: bMorgan01 Date: Sun, 25 Sep 2022 11:17:35 -0600 Subject: [PATCH] make crawl.conf match ini spec --- crawl.conf | 10 +++++----- main.py | 14 ++++++-------- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/crawl.conf b/crawl.conf index 5e7d865..bc3adb8 100644 --- a/crawl.conf +++ b/crawl.conf @@ -1,5 +1,5 @@ -# Target Ex: http://www.google.com -http://www.benrmorgan.com -# Ignore urls containing Ex: /files/ -/files/ -/images/ +[Config] +; Target site +site = http://www.benrmorgan.com +; Ignore urls containing Ex: /files/ +ignore = /files/, /images/ diff --git a/main.py b/main.py index 7086913..22ba6b7 100644 --- a/main.py +++ b/main.py @@ -1,3 +1,4 @@ +import configparser import bs4 from urllib.request import Request, urlopen from urllib.error import HTTPError @@ -74,15 +75,12 @@ def spider_rec(page_links, current_href, base_parse, exclude): def main(): print("Reading conf...") - conf = [] - with open('crawl.conf', 'r') as file: - for line in file.readlines(): - line = line.replace("\n", "") - line = line.replace("\r", "") - conf.append(line) + config = configparser.ConfigParser() + config.read('crawl.conf') + config = config['Config'] - target = conf[1] - ignores = conf[3:] + target = config['site'] + ignores = config['ignore'].split(', ') print("Crawling site...") pages = spider(target, ignores)