path option in conf

This commit is contained in:
bMorgan01 2022-03-29 19:54:23 -06:00
parent f03eefe2f6
commit 7d8e7b9065
2 changed files with 38 additions and 2 deletions

View file

@ -2,6 +2,8 @@
benrmorgan.com
# Prefix Ex: http://www.
http://www.
# Target path Ex: /var/www/html/sitemap.xml or ./sitemaps/sitemap.xml
/var/www/html/sitemap.xml
# Ignore urls containing Ex: /files/
/files/
/images/

38
main.py
View file

@ -1,6 +1,10 @@
import datetime
import os
import bs4
from urllib.request import Request, urlopen
from os.path import exists
from shutil import move
def spider(prefix, domain, exclude):
@ -37,6 +41,24 @@ def spider_rec(links, prefix, domain, postfix, exclude):
return links
def cmp(p1, p2):
    """Return True when two sitemap files match, ignoring ``<lastmod>`` lines.

    The files are compared line by line. A pair of differing lines is
    tolerated only when both lines contain ``<lastmod>``, so a changed
    timestamp alone does not make the files count as different.

    Args:
        p1: Path of the first file to compare.
        p2: Path of the second file to compare.

    Returns:
        True if the files have the same number of lines and every differing
        line pair is a ``<lastmod>`` line on both sides; False otherwise.

    Raises:
        OSError: If either file cannot be opened for reading.
    """
    with open(p1, 'r') as f1, open(p2, 'r') as f2:
        first_lines = f1.readlines()
        second_lines = f2.readlines()

    # A different line count means entries were added or removed.
    if len(first_lines) != len(second_lines):
        return False

    for first, second in zip(first_lines, second_lines):
        if first == second:
            continue
        # Only a timestamp-vs-timestamp difference is ignorable; checking a
        # single side would let a <lastmod> line "match" unrelated content.
        if "<lastmod>" not in first or "<lastmod>" not in second:
            return False

    return True
def main():
conf = []
with open('crawl.conf', 'r') as file:
@ -48,13 +70,19 @@ def main():
domain = conf[0]
prefix = conf[1]
path = conf[2]
ignores = conf[2::]
ignores = conf[3::]
links = spider(prefix, domain, ignores)
date = datetime.datetime.utcnow()
out = open("sitemap.xml", 'w')
existed = exists(path)
oldpath = path
if existed:
path = "newmap.xml"
out = open(path, 'w')
out.write("<!--\n")
out.write("\tSitemap generator by Ben Morgan - www.benrmorgan.com\n")
out.write("-->\n")
@ -86,5 +114,11 @@ def main():
out.write("</urlset>\n")
out.close()
if existed and not cmp(oldpath, path):
move(oldpath, oldpath + "-old")
move(path, oldpath)
else:
os.remove(path)
# Run the crawler only when executed as a script, not when imported.
if __name__ == "__main__":
    main()