#!/usr/bin/env python """\ This tool regenerates and replaces the ToC in an HTML file from the actual structure of
s and s present in the body of the document. The section to be overwritten is identified as the XML subtree rooted at
    . Usage: ./toctool.py filename... """ import sys import os import xml.parsers.expat class Index: def __init__(self): self.title = None self.tree = [] self._ptr_stack = [self.tree] def addLevel(self, id, title): newlevel = [(id, title)] self._ptr_stack[-1].append(newlevel) self._ptr_stack.append(newlevel) def upLevel(self): self._ptr_stack.pop(-1) def prettyString(self): out = [] def step(ilevel, node): if type(node) == list: for subnode in node: step(ilevel+1, subnode) else: out.append("%s%s" % (" "*ilevel, node)) step(-2, self.tree) return "\n".join(out) def renderXML(self): out = [] def step(ilevel, node): if len(node) == 1: out.append('%s
  1. %s
  2. ' % (' '*ilevel, node[0][0], node[0][1])) else: out.append('%s
  3. %s' % (' '*ilevel, node[0][0], node[0][1])) out.append('%s
      ' % (' '*ilevel)) for subnode in node[1:]: step(ilevel+1, subnode) out.append('%s
    ' % (' '*ilevel)) out.append('%s
  4. ' % (' '*ilevel, node[0][0])) out.append('
      ') for node in self.tree: step(1, node) out.append('
    ') return "\n".join(out) class ExpatParseJob: def parse(self, file): p = xml.parsers.expat.ParserCreate() p.ordered_attributes = self._ordered_attributes p.returns_unicode = False p.specified_attributes = True for name in dir(self): if name.endswith('Handler'): setattr(p, name, getattr(self, name)) p.ParseFile(file) class IndexBuildParse(ExpatParseJob): keys = {'h2':None, 'h3':None, 'h4':None, 'h5':None} def __init__(self): self.index = Index() self.keyptr = 0 self.collecting_text = False self.text = '' self.waiting_for_elt = None self.saved_id = None self.elt_stack = [] self._ordered_attributes = False def StartElementHandler(self, name, attrs): if name == 'div': cl = attrs.get('class') if cl in self.keys: self.waiting_for_elt = cl self.saved_id = attrs.get('id') self.elt_stack.append((name, True)) return elif name == 'title': self.collecting_text = name self.text = '' elif name == self.waiting_for_elt: self.waiting_for_elt = None self.collecting_text = name self.text = '' self.elt_stack.append((name, False)) def EndElementHandler(self, name): if self.collecting_text: if name == self.collecting_text: if name == 'title': self.index.title = self.text else: self.index.addLevel(self.saved_id, self.text) self.saved_id = None self.collecting_text = False else: raise RuntimeError('foo') eltinfo = self.elt_stack.pop(-1) assert eltinfo[0] == name if eltinfo[1]: self.index.upLevel() def DefaultHandler(self, data) : if self.collecting_text: self.text += data def attrlist_to_dict(l): d = {} for i in range(0, len(l), 2): d[l[i]] = l[i+1] return d def escape_entities(s): return s.replace('&', '&').replace('<', '<').replace('>', '>') class IndexInsertParse(ExpatParseJob): def __init__(self, index, outfp): self._ordered_attributes = True self.index = index self.outfp = outfp self.elt_stack = [] self.skipping_toc = False self._line_in_progress = [] self._element_open = None self.linepos = 0 self.indentpos = 0 self.do_not_minimize = {'script':None} self.do_not_indent = {'div':None, 'a':None, 'strong':None, 'em':None} self.do_not_wrap = {'div':None, 'strong':None, 'em':None, 'li':None} if self.index.title == 'Subversion Design': self.do_not_wrap['a'] = None def put_token(self, token, tag_name): self._line_in_progress.append((token, tag_name)) def done_line(self): linepos = 0 last_was_tag = False outq = [] for token, tag_name in self._line_in_progress: is_tag = tag_name is not None and tag_name not in self.do_not_wrap no_indent_if_wrap = tag_name in self.do_not_indent linepos += len(token) if linepos > 79 and is_tag and last_was_tag: token = token.lstrip(' ') if no_indent_if_wrap: linepos = len(token) outq.append('\n') else: linepos = len(token) + 2 outq.append('\n ') outq.append(token) last_was_tag = is_tag outq.append('\n') for i in outq: self.outfp.write(i) del self._line_in_progress[:] def _finish_pending(self, minimized_form): if self._element_open is not None: name = self._element_open self._element_open = None if minimized_form: self.put_token(' />', name) return True else: self.put_token('>', name) return False def StartElementHandler(self, name, attrs): self._finish_pending(False) if name == 'ol' and attrlist_to_dict(attrs).get('id') == 'toc': self.outfp.write(self.index.renderXML()) self.skipping_toc = True self.elt_stack.append((name, True)) return if not self.skipping_toc: self.put_token("<%s" % name, name) while attrs: aname = attrs.pop(0) aval = escape_entities(attrs.pop(0)) self.put_token(' %s="%s"' % (aname, aval), name) self._element_open = name self.elt_stack.append((name, False)) def EndElementHandler(self, name): if not self.skipping_toc: if not self._finish_pending(name not in self.do_not_minimize): self.put_token("" % name, name) eltinfo = self.elt_stack.pop(-1) assert eltinfo[0] == name if eltinfo[1]: self.skipping_toc = False def DefaultHandler(self, data): if self.skipping_toc: return self._finish_pending(False) # This makes an unsafe assumption that expat will pass '\n' as individual # characters to this function. Seems to work at the moment. # Will almost certainly break later. if data == '\n': self.done_line() else: self.put_token(data, None) def process(fn): infp = open(fn, 'r') builder = IndexBuildParse() builder.parse(infp) infp.seek(0) outfp = open(fn + '.new', 'w') inserter = IndexInsertParse(builder.index, outfp) inserter.parse(infp) infp.close() outfp.close() os.rename(fn, fn + '.toctool-backup~') os.rename(fn + '.new', fn) def main(): for fn in sys.argv[1:]: process(fn) if __name__ == '__main__': main()