"""Crawl a web site and write an XML sitemap for it.

Configuration is read from ``crawl.conf`` in the current directory.  Lines
starting with ``#`` are comments and blank lines are skipped; the remaining
lines are, in order:

1. the domain (e.g. ``example.com``),
2. the URL prefix put in front of the domain (e.g. ``https://``),
3. the path of the sitemap file to write,
4. any number of substrings; a link containing one of them is not crawled.
"""

import datetime
import os
from os.path import exists
from shutil import move
from urllib.request import Request, urlopen
from xml.sax.saxutils import escape


def spider(prefix, domain, exclude):
    """Crawl the site starting at its root page.

    Args:
        prefix: Text placed in front of ``domain`` to build a URL.
        domain: Domain of the site; also used to recognise absolute links
            that belong to the site.
        exclude: Substrings; a link containing any of them is skipped.

    Returns:
        A dict mapping each link that was followed (as written in the page,
        with spaces percent-encoded) to the number of times it was seen.
    """
    return spider_rec(dict(), prefix, domain, "/", exclude)


def _page_hrefs(url):
    """Fetch ``url`` and return the non-empty ``href`` of every anchor."""
    # Imported here rather than at module level so the helpers that do not
    # parse HTML stay importable when the third-party parser is missing.
    import bs4

    # The context manager closes the HTTP response as soon as it is parsed
    # instead of keeping one open connection per page still being crawled.
    with urlopen(Request(url)) as response:
        soup = bs4.BeautifulSoup(response, "lxml")
    # Anchors without an href yield None; they and empty hrefs are dropped.
    return [href for href in (a.get("href") for a in soup.find_all("a")) if href]


def _target_url(prefix, domain, href):
    """Return the URL to request for ``href`` (site-relative or absolute)."""
    if href.startswith("/"):
        return prefix + domain + href
    return href


def spider_rec(links, prefix, domain, postfix, exclude):
    """Crawl depth-first from ``postfix`` and count every link occurrence.

    Args:
        links: Dict updated in place; maps link -> occurrence count.
        prefix: Text placed in front of ``domain`` to build a URL.
        domain: Domain used to decide whether a link belongs to the site.
        postfix: Link to start from; a site-relative path or a full URL.
        exclude: Substrings; a link containing any of them is skipped.

    Returns:
        The same ``links`` dict that was passed in.
    """
    links[postfix] = 1
    # Explicit stack of per-page link iterators: same visiting order as a
    # recursive depth-first walk, without one interpreter frame per page.
    pending = [iter(_page_hrefs(_target_url(prefix, domain, postfix)))]
    while pending:
        for href in pending[-1]:
            if "mailto:" in href:
                continue
            # Only site-relative links and links mentioning the domain are
            # followed.  prefix/domain stay fixed for the whole crawl so this
            # filter keeps working after an absolute link has been followed.
            if domain not in href and not href.startswith("/"):
                continue
            # Normalise before the "already seen" lookup so a link containing
            # spaces is counted instead of being crawled again.
            normalized = href.replace(" ", "%20")
            if normalized in links:
                links[normalized] += 1
                continue
            if any(pattern in href for pattern in exclude):
                continue
            links[normalized] = 1
            pending.append(
                iter(_page_hrefs(_target_url(prefix, domain, normalized)))
            )
            break  # descend into the page that was just pushed
        else:
            pending.pop()  # current page exhausted
    return links


def cmp(p1, p2):
    """Return True when two sitemap files differ at most in their dates.

    Args:
        p1: Path of the existing sitemap.
        p2: Path of the newly generated sitemap.

    Returns:
        False when the files have a different number of lines or when a
        differing line of ``p1`` is not a ``<lastmod>`` line; True otherwise.
    """
    with open(p1, "r", encoding="utf-8") as f1, \
            open(p2, "r", encoding="utf-8") as f2:
        old_lines = f1.readlines()
        new_lines = f2.readlines()
    if len(old_lines) != len(new_lines):
        return False
    for old, new in zip(old_lines, new_lines):
        # The timestamp changes on every run, so it must not count as a
        # difference; any other differing line does.
        if old != new and "<lastmod>" not in old:
            return False
    return True


def _read_conf(conf_path):
    """Return the meaningful lines of the config file, in order."""
    entries = []
    with open(conf_path, "r") as file:
        for line in file:
            if line.startswith("#"):
                continue
            line = line.replace("\n", "").replace("\r", "")
            # A blank line would otherwise become an empty exclude pattern,
            # and the empty string is contained in every link.
            if line.strip():
                entries.append(line)
    return entries


def _ranked_entries(links, prefix, domain):
    """Turn link counts into ``(url, priority)`` pairs, most linked first.

    Links sharing a count share a priority; each lower distinct count
    multiplies the priority by 0.8 (rounded to two decimals).
    """
    by_count = sorted(links.items(), key=lambda item: item[1], reverse=True)
    entries = []
    level = 0
    previous = by_count[0][1] if by_count else None
    for link, count in by_count:
        if count != previous:
            level += 1
            previous = count
        if link.startswith("/"):
            link = prefix + domain + link
        entries.append((link, str(round(0.8 ** level, 2))))
    return entries


def _write_sitemap(path, entries, stamp):
    """Write ``entries`` to ``path`` as a sitemap, all dated ``stamp``."""
    # NOTE(review): the element names follow the sitemaps.org protocol; the
    # markup was missing from these literals in the reviewed source, so
    # confirm the result against a known-good generated sitemap.
    xml_entities = {"'": "&apos;", '"': "&quot;"}
    with open(path, "w", encoding="utf-8") as out:
        out.write('<?xml version="1.0" encoding="UTF-8"?>\n')
        out.write(
            '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n'
        )
        for url, priority in entries:
            out.write("\t<url>\n")
            # URLs may contain '&' and quotes, which must be entity-escaped.
            out.write("\t\t<loc>" + escape(url, xml_entities) + "</loc>\n")
            out.write("\t\t<lastmod>" + stamp + "</lastmod>\n")
            out.write("\t\t<priority>" + priority + "</priority>\n")
            out.write("\t</url>\n")
        out.write("</urlset>\n")


def main():
    """Read ``crawl.conf``, crawl the site and (re)write the sitemap.

    When the target file already exists the new sitemap is first written to
    ``newmap.xml``; the old file is replaced (and kept as ``<path>-old``)
    only if the two differ in more than their dates.

    Raises:
        ValueError: If the config has fewer than three value lines.
    """
    print("Reading conf...")
    conf = _read_conf("crawl.conf")
    if len(conf) < 3:
        raise ValueError(
            "crawl.conf must contain at least domain, prefix and output path"
        )
    domain, prefix, path = conf[:3]
    ignores = conf[3:]

    print("Crawling site...")
    links = spider(prefix, domain, ignores)
    date = datetime.datetime.now(datetime.timezone.utc)

    existed = exists(path)
    oldpath = path
    if existed:
        print("Sitemap already exists, creating temp...")
        path = "newmap.xml"

    print("Writing to target file...")
    _write_sitemap(
        path,
        _ranked_entries(links, prefix, domain),
        date.strftime("%Y-%m-%dT%H:%M:%S+00:00"),
    )

    if existed:
        if cmp(oldpath, path):
            print("Sitemaps are the same, removing temp...")
            os.remove(path)
        else:
            print("Creating old sitemap backup...")
            move(oldpath, oldpath + "-old")
            print("Overwriting old sitemap with new one...")
            move(path, oldpath)
    print("Done.")


if __name__ == "__main__":
    main()