diff --git a/crawl.conf b/crawl.conf
index 4dc577c..ba6dcb5 100644
--- a/crawl.conf
+++ b/crawl.conf
@@ -2,6 +2,8 @@
 benrmorgan.com
 # Prefix Ex: http://www.
 http://www.
+# Target path Ex: /var/www/html/sitemap.xml or ./sitemaps/sitemap.xml
+/var/www/html/sitemap.xml
 # Ignore urls containing Ex: /files/
 /files/
 /images/
\ No newline at end of file
diff --git a/main.py b/main.py
index 8333081..a3d1076 100644
--- a/main.py
+++ b/main.py
@@ -1,6 +1,10 @@
 import datetime
+import os
+
 import bs4
 from urllib.request import Request, urlopen
+from os.path import exists
+from shutil import move
 
 
 def spider(prefix, domain, exclude):
@@ -37,6 +41,26 @@ def spider_rec(links, prefix, domain, postfix, exclude):
     return links
 
 
+def cmp(p1, p2):
+    """Return True when the files at p1 and p2 match, ignoring <lastmod> lines.
+
+    Both files are read fully and compared line by line. A differing line is
+    tolerated only when the p1 line contains a <lastmod> tag; files with
+    different line counts never match.
+    """
+    with open(p1, 'r') as f1, open(p2, 'r') as f2:
+        l1 = f1.readlines()
+        l2 = f2.readlines()
+
+    if len(l1) != len(l2):
+        return False
+
+    # NOTE(review): the marker used to be "", which is a substring of every
+    # line, so no difference was ever reported. "<lastmod>" is the assumed
+    # intent -- confirm against the tags main() writes.
+    return all(a == b or "<lastmod>" in a for a, b in zip(l1, l2))
+
+
 def main():
     conf = []
     with open('crawl.conf', 'r') as file:
@@ -48,11 +72,17 @@ def main():
 
     domain = conf[0]
     prefix = conf[1]
+    path = conf[2]
 
-    ignores = conf[2::]
+    ignores = conf[3::]
 
     links = spider(prefix, domain, ignores)
     date = datetime.datetime.utcnow()
 
-    out = open("sitemap.xml", 'w')
+    existed = exists(path)
+    oldpath = path
+    if existed:
+        path = "newmap.xml"
+
+    out = open(path, 'w')
     out.write("\n")
@@ -86,4 +116,14 @@ def main():
     out.write("\n")
     out.close()
 
+    if existed:
+        if cmp(oldpath, path):
+            # Only ignorable lines changed: drop the temporary copy.
+            os.remove(path)
+        else:
+            # Keep the previous sitemap as a backup, then publish the new one.
+            move(oldpath, oldpath + "-old")
+            move(path, oldpath)
+
+
 main()