path option in conf

This commit is contained in:
bMorgan01 2022-03-29 19:54:23 -06:00
parent f03eefe2f6
commit 7d8e7b9065
2 changed files with 38 additions and 2 deletions

View file

@ -2,6 +2,8 @@
benrmorgan.com benrmorgan.com
# Prefix Ex: http://www. # Prefix Ex: http://www.
http://www. http://www.
# Target path Ex: /var/www/html/sitemap.xml or ./sitemaps/sitemap.xml
/var/www/html/sitemap.xml
# Ignore urls containing Ex: /files/ # Ignore urls containing Ex: /files/
/files/ /files/
/images/ /images/

38
main.py
View file

@ -1,6 +1,10 @@
import datetime import datetime
import os
import bs4 import bs4
from urllib.request import Request, urlopen from urllib.request import Request, urlopen
from os.path import exists
from shutil import move
def spider(prefix, domain, exclude): def spider(prefix, domain, exclude):
@ -37,6 +41,24 @@ def spider_rec(links, prefix, domain, postfix, exclude):
return links return links
def cmp(p1, p2):
    """Return True when two text files match apart from ``<lastmod>`` lines.

    Both files are read fully and compared line by line. Files with a
    different number of lines never match. A pair of differing lines is
    tolerated only when both lines contain ``<lastmod>``; any other
    difference makes the files unequal.

    Args:
        p1: Path of the first file.
        p2: Path of the second file.

    Returns:
        True if the files are equal ignoring ``<lastmod>`` lines,
        otherwise False.

    Raises:
        OSError: If either file cannot be opened for reading.
    """
    with open(p1, 'r') as f1, open(p2, 'r') as f2:
        l1 = f1.readlines()
        l2 = f2.readlines()

    if len(l1) != len(l2):
        return False

    for line1, line2 in zip(l1, l2):
        if line1 == line2:
            continue
        # Require the tag on BOTH sides: a <lastmod> line that was replaced
        # by some other element is a real content change, not a date bump.
        if "<lastmod>" not in line1 or "<lastmod>" not in line2:
            return False
    return True
def main(): def main():
conf = [] conf = []
with open('crawl.conf', 'r') as file: with open('crawl.conf', 'r') as file:
@ -48,13 +70,19 @@ def main():
domain = conf[0] domain = conf[0]
prefix = conf[1] prefix = conf[1]
path = conf[2]
ignores = conf[2::] ignores = conf[3::]
links = spider(prefix, domain, ignores) links = spider(prefix, domain, ignores)
date = datetime.datetime.utcnow() date = datetime.datetime.utcnow()
out = open("sitemap.xml", 'w') existed = exists(path)
oldpath = path
if existed:
path = "newmap.xml"
out = open(path, 'w')
out.write("<!--\n") out.write("<!--\n")
out.write("\tSitemap generator by Ben Morgan - www.benrmorgan.com\n") out.write("\tSitemap generator by Ben Morgan - www.benrmorgan.com\n")
out.write("-->\n") out.write("-->\n")
@ -86,5 +114,11 @@ def main():
out.write("</urlset>\n") out.write("</urlset>\n")
out.close() out.close()
if existed and not cmp(oldpath, path):
move(oldpath, oldpath + "-old")
move(path, oldpath)
else:
os.remove(path)
main() main()