"""\
This tool regenerates and replaces the ToC in an HTML file from the actual
structure of <div>s and <h[2345]>s present in the body of the document.
The section to be overwritten is identified as the XML subtree
rooted at <ol id="toc">.
Usage: ./toctool.py filename...
"""
import sys
import os
import xml.parsers.expat
class Index:
    """Table of contents: a document title plus a nested tree of sections.

    ``tree`` is a list of nodes.  Every node is a list whose first item is
    an ``(id, title)`` tuple and whose remaining items are child nodes of
    the same shape.
    """

    def __init__(self):
        self.title = None
        self.tree = []
        # Stack of the lists currently being filled; new nodes are appended
        # to the list on top of the stack.
        self._ptr_stack = [self.tree]

    def addLevel(self, id, title):
        """Append a node for (id, title) at the current level and enter it."""
        node = [(id, title)]
        parent = self._ptr_stack[-1]
        parent.append(node)
        self._ptr_stack.append(node)

    def upLevel(self):
        """Leave the level entered by the most recent addLevel()."""
        self._ptr_stack.pop()

    def prettyString(self):
        """Return the tree as plain text, one ``(id, title)`` tuple per line.

        Each nesting level is indented by one more space than its parent;
        top-level entries are not indented.
        """
        lines = []

        def walk(depth, item):
            if type(item) is not list:
                # A leaf is the (id, title) tuple heading a node.
                lines.append("%s%s" % (" " * depth, item))
                return
            for child in item:
                walk(depth + 1, child)

        # Starting at -2 puts the heading tuples of top-level nodes at depth 0.
        walk(-2, self.tree)
        return "\n".join(lines)

    def renderXML(self):
        """Return the tree rendered as the ``<ol id="toc">`` XML subtree."""
        lines = ['<ol id="toc">']

        def emit(depth, node):
            pad = ' ' * depth
            anchor, caption = node[0]
            children = node[1:]
            opening = '%s<li><a href="#%s">%s</a>' % (pad, anchor, caption)
            if not children:
                # Leaf entry: open and close the <li> on a single line.
                lines.append(opening + '</li>')
                return
            lines.append(opening)
            lines.append('%s<ol>' % pad)
            for child in children:
                emit(depth + 1, child)
            lines.append('%s</ol>' % pad)
            # The trailing comment repeats the id of the entry being closed.
            lines.append('%s</li> <!-- %s -->' % (pad, anchor))

        for top in self.tree:
            emit(1, top)
        lines.append('</ol>')
        return "\n".join(lines)
class ExpatParseJob:
    """Base class for one expat pass over a document.

    Subclasses define expat callbacks as methods whose names end in
    ``Handler`` (e.g. ``StartElementHandler``) and must set
    ``self._ordered_attributes`` before ``parse()`` is called.
    """

    def parse(self, file):
        """Parse *file* with expat, dispatching events to this object.

        file -- object handed to expat's ``ParseFile()``.
        """
        parser = xml.parsers.expat.ParserCreate()
        parser.ordered_attributes = self._ordered_attributes
        try:
            # Python 2 parsers expose this switch for byte-string output.
            parser.returns_unicode = False
        except AttributeError:
            # Parsers without the attribute (Python 3) reject the
            # assignment; there is nothing to switch off, so carry on.
            pass
        parser.specified_attributes = True
        # Install every method named '...Handler' as the expat callback of
        # the same name.
        for name in dir(self):
            if name.endswith('Handler'):
                setattr(parser, name, getattr(self, name))
        parser.ParseFile(file)
class IndexBuildParse(ExpatParseJob):
    """First pass: build an Index from the document's section structure.

    A ``<div>`` whose ``class`` is one of ``keys`` opens a section; the next
    element whose name equals that class supplies the section title, and the
    div's ``id`` becomes the section id.  The text of ``<title>`` is stored
    as the index title.
    """

    # Used only for membership tests; the values are ignored.
    keys = {'h2': None, 'h3': None, 'h4': None, 'h5': None}

    def __init__(self):
        self.index = Index()
        self.keyptr = 0  # not referenced anywhere else in this class
        # False, or the name of the element whose text is being gathered.
        self.collecting_text = False
        self.text = ''
        # Heading element name expected next (the class of the last section
        # div seen), or None.
        self.waiting_for_elt = None
        self.saved_id = None
        # One (element name, opened-a-section) tuple per open element.
        self.elt_stack = []
        # Ask expat for attributes as a dict.
        self._ordered_attributes = False

    def StartElementHandler(self, name, attrs):
        """Track section divs and start collecting title/heading text."""
        if name == 'div':
            cl = attrs.get('class')
            if cl in self.keys:
                self.waiting_for_elt = cl
                self.saved_id = attrs.get('id')
                self.elt_stack.append((name, True))
                return
        elif name == 'title':
            self.collecting_text = name
            self.text = ''
        elif name == self.waiting_for_elt:
            self.waiting_for_elt = None
            self.collecting_text = name
            self.text = ''
        self.elt_stack.append((name, False))

    def EndElementHandler(self, name):
        """Record finished titles/headings and close finished sections.

        Raises RuntimeError if another element ends while title or heading
        text is being collected.
        """
        if self.collecting_text:
            if name == self.collecting_text:
                if name == 'title':
                    self.index.title = self.text
                else:
                    self.index.addLevel(self.saved_id, self.text)
                    self.saved_id = None
                self.collecting_text = False
            else:
                raise RuntimeError(
                    'unexpected </%s> while collecting the text of <%s>'
                    % (name, self.collecting_text))
        eltinfo = self.elt_stack.pop(-1)
        assert eltinfo[0] == name
        if eltinfo[1]:
            # NOTE(review): this runs for every section div, even one in
            # which no heading was found (so addLevel was never called).
            self.index.upLevel()

    def DefaultHandler(self, data):
        """Accumulate *data* while a title or heading is being collected."""
        if self.collecting_text:
            self.text += data
def attrlist_to_dict(l):
    """Convert a flat ``[name, value, name, value, ...]`` list to a dict.

    If a name occurs more than once, its last value wins.
    """
    return {l[pos]: l[pos + 1] for pos in range(0, len(l), 2)}
def escape_entities(s):
    """Return *s* with XML-special characters turned into entity references.

    The ampersand, less-than, greater-than and double-quote characters are
    replaced by the amp, lt, gt and quot entity references, so the result
    can be placed inside a double-quoted attribute value.
    """
    # The ampersand must be handled first; otherwise the ampersands that
    # the later replacements introduce would themselves be escaped.
    for char, entity in (('&', 'amp'), ('<', 'lt'), ('>', 'gt'),
                         ('"', 'quot')):
        s = s.replace(char, '&%s;' % entity)
    return s
class IndexInsertParse(ExpatParseJob):
    """Second pass: copy the document to ``outfp`` with a regenerated ToC.

    Output is assembled one line at a time as (token, tag name) pairs and
    written by done_line().  The subtree rooted at ``<ol id="toc">`` is
    replaced by ``index.renderXML()``.
    """

    def __init__(self, index, outfp):
        self.index = index
        self.outfp = outfp
        # Ask expat for attributes as a flat [name, value, ...] list.
        self._ordered_attributes = True
        # One (element name, is-the-toc-root) tuple per open element.
        self.elt_stack = []
        self.skipping_toc = False
        self._line_in_progress = []
        # Name of a start tag whose closing '>' or ' />' is still unwritten.
        self._element_open = None
        # linepos and indentpos are not read by any method of this class.
        self.linepos = 0
        self.indentpos = 0
        # The dicts below serve as sets; only their keys matter.
        self.do_not_minimize = dict.fromkeys(['script'])
        self.do_not_indent = dict.fromkeys(['div', 'a', 'strong', 'em'])
        self.do_not_wrap = dict.fromkeys(['div', 'strong', 'em', 'li'])
        if self.index.title == 'Subversion Design':
            self.do_not_wrap['a'] = None

    def put_token(self, token, tag_name):
        """Queue *token* for the current line; tag_name is None for text."""
        self._line_in_progress.append((token, tag_name))

    def done_line(self):
        """Write the queued line, wrapping it where allowed, then reset."""
        pieces = []
        column = 0
        prev_wrappable = False
        for text, tag in self._line_in_progress:
            wrappable = not (tag is None or tag in self.do_not_wrap)
            column += len(text)
            # Break only between two consecutive wrappable tag tokens, once
            # the line has grown past 79 characters.
            if prev_wrappable and wrappable and column > 79:
                text = text.lstrip(' ')
                if tag in self.do_not_indent:
                    lead = '\n'
                    column = len(text)
                else:
                    # NOTE(review): the column is advanced by 2 while the
                    # continuation indent written here is one space;
                    # confirm which of the two is intended.
                    lead = '\n '
                    column = len(text) + 2
                pieces.append(lead)
            pieces.append(text)
            prev_wrappable = wrappable
        pieces.append('\n')
        write = self.outfp.write
        for piece in pieces:
            write(piece)
        del self._line_in_progress[:]

    def _finish_pending(self, minimized_form):
        """Terminate the start tag left open, if there is one.

        Returns True only when the tag was closed in minimized form
        (' />'), in which case no separate end tag must be written.
        """
        pending = self._element_open
        if pending is None:
            return False
        self._element_open = None
        if minimized_form:
            self.put_token(' />', pending)
            return True
        self.put_token('>', pending)
        return False

    def StartElementHandler(self, name, attrs):
        """Queue a start tag, or substitute the ToC for ``<ol id="toc">``."""
        self._finish_pending(False)
        is_toc = name == 'ol' and attrlist_to_dict(attrs).get('id') == 'toc'
        if is_toc:
            # Written straight to outfp, ahead of any tokens still queued
            # for the current line.
            self.outfp.write(self.index.renderXML())
            self.skipping_toc = True
        elif not self.skipping_toc:
            self.put_token("<%s" % name, name)
            while attrs:
                aname, aval = attrs[0], attrs[1]
                del attrs[:2]
                self.put_token(' %s="%s"' % (aname, escape_entities(aval)),
                               name)
            # Leave the tag unterminated until the next event shows whether
            # it can be minimized.
            self._element_open = name
        self.elt_stack.append((name, is_toc))

    def EndElementHandler(self, name):
        """Queue an end tag (or minimize the open tag); leave the old ToC."""
        if not self.skipping_toc:
            may_minimize = name not in self.do_not_minimize
            minimized = self._finish_pending(may_minimize)
            if not minimized:
                self.put_token("</%s>" % name, name)
        popped_name, ends_toc = self.elt_stack.pop()
        assert popped_name == name
        if ends_toc:
            self.skipping_toc = False

    def DefaultHandler(self, data):
        """Queue *data* verbatim; a lone newline flushes the current line."""
        if self.skipping_toc:
            return
        self._finish_pending(False)
        if data != '\n':
            self.put_token(data, None)
        else:
            self.done_line()
def process(fn):
    """Regenerate the ToC of the file named *fn*, in place.

    The file is parsed twice: once to build the index and once to write a
    copy with the new ToC to ``fn + '.new'``.  The original is then renamed
    to ``fn + '.toctool-backup~'`` and the new file takes its place.

    Both files are closed even if parsing raises; in that case no rename
    happens and the original file is left as it was.
    """
    new_fn = fn + '.new'
    with open(fn, 'r') as infp:
        builder = IndexBuildParse()
        builder.parse(infp)
        # Rewind so the second pass reads the document from the start.
        infp.seek(0)
        with open(new_fn, 'w') as outfp:
            inserter = IndexInsertParse(builder.index, outfp)
            inserter.parse(infp)
    os.rename(fn, fn + '.toctool-backup~')
    os.rename(new_fn, fn)
def main():
    """Run process() on every filename given on the command line."""
    filenames = sys.argv[1:]
    for filename in filenames:
        process(filename)


if __name__ == '__main__':
    main()