#!/usr/bin/python
"""Cross-check the index.html files under Documentation against the tree.

Two lists are printed:

* "404 errors" - link targets found in an index.html that do not exist
  on disk.
* "Unlinked documents" - files and directories on disk that no
  index.html links to.

The HTML is scanned with plain string splitting rather than a real
parser, so only simple ``<a href=...>`` tags whose attribute value
contains no whitespace are recognised.
"""

import os
import sys

# Directory (relative to the current working directory) that is checked.
DOC_ROOT = "Documentation"

# Every file and directory under DOC_ROOT, excluding the index.html files
# themselves.  Directory entries carry a trailing slash.
dirlist = []

# Every link target collected by handletag(), each prefixed with the
# directory of the index.html it was found in.
taglist = []


def handletag(path, tag, data):
    """Record the href of an anchor tag in ``taglist``.

    path -- directory holding the index.html the tag came from.
    tag  -- text between "<" and ">", e.g. 'a href="foo.html"'.
    data -- text following the tag; accepted for symmetry with the
            caller but not used.

    Tags other than "a" are ignored.  The link is stored as
    "<path>/<href>" with one pair of surrounding quotes (single or
    double) removed from the href value.
    """
    words = tag.split()
    # A stray "<" or "<>" in the page produces an empty tag; skip it
    # instead of failing on words[0].
    if not words or words[0] != "a":
        return
    for word in words:
        if not word.startswith("href="):
            continue
        target = word[5:]
        # Slices instead of indexing so an empty value ("href=") is
        # handled like href="" rather than raising IndexError.
        quote = target[:1]
        if quote in ('"', "'") and target[-1:] == quote:
            target = target[1:-1]
        taglist.append("%s/%s" % (path, target))


def _scan_tree(root):
    """Return a sorted list of everything under root except index.html."""
    found = []
    for base, subdirs, files in os.walk(root):
        for name in subdirs:
            found.append("%s/%s/" % (base, name))
        for name in files:
            if name != "index.html":
                found.append("%s/%s" % (base, name))
    found.sort()
    return found


def _scan_indexes(root):
    """Feed every tag of every index.html under root to handletag()."""
    for base, _subdirs, files in os.walk(root):
        if "index.html" not in files:
            continue
        with open("%s/index.html" % base) as page:
            html = page.read()
        # Everything before the first "<" is plain text, hence [1:].
        for chunk in html.split("<")[1:]:
            # partition() copes with a chunk that has no ">" (a literal
            # "<" in the text), which split(">")[1] would crash on.
            tag, _sep, text = chunk.partition(">")
            handletag(base, tag, text)


def main():
    """Scan DOC_ROOT and print dangling links and unlinked documents."""
    dirlist[:] = _scan_tree(DOC_ROOT)
    _scan_indexes(DOC_ROOT)

    # Sets make the membership tests O(1); the lists are still what gets
    # iterated so output order and duplicates are unchanged.
    on_disk = set(dirlist)
    linked = set(taglist)

    # Single-argument print() behaves the same on Python 2 and 3.
    print("404 errors:")
    for target in taglist:
        if target not in on_disk:
            print(target)

    print("Unlinked documents:")
    for entry in dirlist:
        if entry not in linked:
            print(entry)


if __name__ == "__main__":
    main()