path option in conf
This commit is contained in:
parent
f03eefe2f6
commit
7d8e7b9065
2 changed files with 38 additions and 2 deletions
|
|
@ -2,6 +2,8 @@
|
||||||
benrmorgan.com
|
benrmorgan.com
|
||||||
# Prefix Ex: http://www.
|
# Prefix Ex: http://www.
|
||||||
http://www.
|
http://www.
|
||||||
|
# Target path Ex: /var/www/html/sitemap.xml or ./sitemaps/sitemap.xml
|
||||||
|
/var/www/html/sitemap.xml
|
||||||
# Ignore urls containing Ex: /files/
|
# Ignore urls containing Ex: /files/
|
||||||
/files/
|
/files/
|
||||||
/images/
|
/images/
|
||||||
38
main.py
38
main.py
|
|
@ -1,6 +1,10 @@
|
||||||
import datetime
|
import datetime
|
||||||
|
import os
|
||||||
|
|
||||||
import bs4
|
import bs4
|
||||||
from urllib.request import Request, urlopen
|
from urllib.request import Request, urlopen
|
||||||
|
from os.path import exists
|
||||||
|
from shutil import move
|
||||||
|
|
||||||
|
|
||||||
def spider(prefix, domain, exclude):
|
def spider(prefix, domain, exclude):
|
||||||
|
|
@ -37,6 +41,24 @@ def spider_rec(links, prefix, domain, postfix, exclude):
|
||||||
return links
|
return links
|
||||||
|
|
||||||
|
|
||||||
|
def cmp(p1, p2):
    """Return True when two text files match apart from ``<lastmod>`` lines.

    The files are compared line by line. They are considered equal when
    they have the same number of lines and every pair of lines is either
    identical or differs only in a line that carries a ``<lastmod>`` tag
    in BOTH files.

    Args:
        p1: Path of the first file to compare.
        p2: Path of the second file to compare.

    Returns:
        True if the files are equivalent under the rule above, False
        otherwise.

    Raises:
        OSError: If either file cannot be opened for reading.
    """
    with open(p1, 'r') as f1, open(p2, 'r') as f2:
        l1 = f1.readlines()
        l2 = f2.readlines()

    # A different line count means the files cannot match line by line.
    if len(l1) != len(l2):
        return False

    for first_line, second_line in zip(l1, l2):
        if first_line == second_line:
            continue
        # A differing pair is tolerated only when both sides are <lastmod>
        # lines; checking just one side would let a <lastmod> line that was
        # replaced by a different tag pass as "unchanged".
        if "<lastmod>" not in first_line or "<lastmod>" not in second_line:
            return False

    return True
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
conf = []
|
conf = []
|
||||||
with open('crawl.conf', 'r') as file:
|
with open('crawl.conf', 'r') as file:
|
||||||
|
|
@ -48,13 +70,19 @@ def main():
|
||||||
|
|
||||||
domain = conf[0]
|
domain = conf[0]
|
||||||
prefix = conf[1]
|
prefix = conf[1]
|
||||||
|
path = conf[2]
|
||||||
|
|
||||||
ignores = conf[2::]
|
ignores = conf[3::]
|
||||||
|
|
||||||
links = spider(prefix, domain, ignores)
|
links = spider(prefix, domain, ignores)
|
||||||
date = datetime.datetime.utcnow()
|
date = datetime.datetime.utcnow()
|
||||||
|
|
||||||
out = open("sitemap.xml", 'w')
|
existed = exists(path)
|
||||||
|
oldpath = path
|
||||||
|
if existed:
|
||||||
|
path = "newmap.xml"
|
||||||
|
|
||||||
|
out = open(path, 'w')
|
||||||
out.write("<!--\n")
|
out.write("<!--\n")
|
||||||
out.write("\tSitemap generator by Ben Morgan - www.benrmorgan.com\n")
|
out.write("\tSitemap generator by Ben Morgan - www.benrmorgan.com\n")
|
||||||
out.write("-->\n")
|
out.write("-->\n")
|
||||||
|
|
@ -86,5 +114,11 @@ def main():
|
||||||
out.write("</urlset>\n")
|
out.write("</urlset>\n")
|
||||||
out.close()
|
out.close()
|
||||||
|
|
||||||
|
if existed and not cmp(oldpath, path):
|
||||||
|
move(oldpath, oldpath + "-old")
|
||||||
|
move(path, oldpath)
|
||||||
|
else:
|
||||||
|
os.remove(path)
|
||||||
|
|
||||||
|
|
||||||
main()
|
main()
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue