#!/usr/bin/python

import os,sys

# Collect every path found beneath the Documentation directory.
# Subdirectories are recorded with a trailing slash, files without one,
# and files named index.html are left out.

dirlist = []
for root, subdirs, files in os.walk("Documentation"):
  dirlist.extend("%s/%s/" % (root, name) for name in subdirs)
  dirlist.extend("%s/%s" % (root, name)
                 for name in files if name != "index.html")
dirlist.sort()

# Links collected from anchor tags, each prefixed with the path that was
# passed to handletag() alongside the tag.
taglist = []
def handletag(path, tag, data):
  """Append the href target of an anchor tag to the global taglist.

  path: prefix joined to the link with a "/".
  tag:  the text found between "<" and ">", e.g. 'a href="foo.html"'.
  data: the text that followed the tag; accepted but not used.

  Blank tags and tags other than "a" are ignored.  The tag name and the
  "href=" attribute name are matched without regard to case.  Attributes
  are found by splitting the tag on whitespace, so an href value that
  itself contains whitespace is not handled.
  """
  words = tag.split()
  # A stray "<" followed by whitespace or ">" produces a tag with no words.
  if not words or words[0].lower() != "a": return
  for attr in words[1:]:
    if attr[:5].lower() != "href=": continue
    link = attr[5:]
    # Slicing (rather than indexing) keeps an empty value from raising;
    # strip one surrounding pair of matching single or double quotes.
    if link[:1] in ('"', "'") and link[-1:] == link[:1]:
      link = link[1:-1]
    taglist.append("%s/%s" % (path, link))

# Find all the index.html files under Documentation, read each one,
# split it into html tags and call handletag() for each.

def splittags(html):
  """Split html into a list of (tag, text) pairs, one per "<" in the input.

  tag is whatever lies between a "<" and the next ">"; text is whatever
  follows that ">" up to the next ">" or "<".  Anything before the first
  "<" is dropped.  A "<" with no ">" after it yields the rest of its chunk
  as the tag and "" as the text.
  """
  pairs = []
  for chunk in html.split("<")[1:]:
    parts = chunk.split(">")
    # With no ">" in the chunk, parts has one element; don't index past it.
    pairs.append((parts[0], parts[1] if len(parts) > 1 else ""))
  return pairs

for root, subdirs, files in os.walk("Documentation"):
  if "index.html" not in files: continue
  # Close the file as soon as it has been read.
  with open("%s/index.html" % root) as page:
    html = page.read()
  for tag, text in splittags(html):
    handletag(root, tag, text)

# Display the links with no files, and the files nothing linked to.
# Membership is tested against sets so that each lookup does not rescan a
# whole list; the lists themselves are still walked in their own order, so
# the output order (including any repeated links) is unchanged.
# print(x) with a single argument behaves the same under Python 2 and 3.
known = set(dirlist)
linked = set(taglist)
print("404 errors:")
for link in taglist:
  if link not in known: print(link)
print("Unlinked documents:")
for doc in dirlist:
  if doc not in linked: print(doc)