# normalize-dump.py [plain text]

#!/usr/bin/python

import sys
import re
import string

# Matches one "Name: value" header line.  The name (group 1) stops at the
# first colon, so a value that itself contains colons -- for example a
# Node-path such as "trunk/a:b" -- is kept whole in group 2 instead of
# being split at its last colon.
header_re = re.compile(r'^([^:]*): ?(.*)$')

class NodePath:
    """A path together with the header fields that were read for it."""

    def __init__(self, path, headers):
        """Store the path string and its header dictionary.

        path    -- string printed as the heading of this entry
        headers -- dict mapping header names to string values
        """
        self.path = path
        self.headers = headers

    def dump(self):
        """Write the path, then each header sorted by name, to stdout.

        The path is indented by three spaces and every 'name: value'
        header line by six.  sys.stdout.write() is used rather than the
        print statement so the method runs on both Python 2 and 3.
        """
        write = sys.stdout.write
        write((' ' * 3) + self.path + '\n')
        for header in sorted(self.headers):
            write((' ' * 6) + header + ': ' + self.headers[header] + '\n')


def dump_revision(rev, nodepaths):
    """Print one revision and all of its node paths in sorted order.

    rev       -- revision identifier as a string
    nodepaths -- dict mapping path -> object with a dump() method

    A progress message goes to stderr; the 'Revision N' heading goes to
    stdout, followed by whatever each entry's dump() emits, visited in
    sorted path order.
    """
    sys.stderr.write('* Normalizing revision ' + rev + '...')
    sys.stdout.write('Revision ' + rev + '\n')
    # sorted() works on both Python 2 lists and Python 3 dict views.
    for path in sorted(nodepaths):
        nodepaths[path].dump()
    sys.stderr.write('done\n')
    
        

def parse_header_block(fp):
    """Read one block of 'Name: value' header lines from fp.

    Lines are consumed until a blank line or the end of the file.

    Returns a (headers, eof) pair.  headers is a dict mapping each header
    name to its value for the lines read; eof is true when the end of the
    file was reached and false when a blank line ended the block.

    Raises Exception if a non-blank line does not match header_re.
    """
    headers = {}
    while True:
        line = fp.readline()
        # readline() returns an empty str/bytes object only at end of file.
        if not line:
            return headers, True
        # A binary-mode file on Python 3 yields bytes; decode so the str
        # pattern and the str header names used by callers keep working.
        # On Python 2 bytes is str, so this branch is never taken.
        if isinstance(line, bytes) and not isinstance(line, str):
            line = line.decode('utf-8')
        line = line.strip()
        # A blank line terminates the header block.
        if not line:
            return headers, False
        matches = header_re.match(line)
        if matches is None:
            raise Exception('Malformed header block')
        headers[matches.group(1)] = matches.group(2)

        
def parse_file(fp):
    """Read header blocks from fp and dump each revision's sorted summary.

    fp -- file object positioned at the start of the data; it must support
          readline() and read(n).

    Header blocks containing 'Revision-number' start a new revision;
    blocks containing 'Node-path' add a NodePath to the current revision.
    The content that follows a block is skipped using the lengths given
    in its headers.  A revision is dumped when the next revision block is
    seen or when the end of the file is reached.  Within one revision a
    later block with the same Node-path replaces the earlier one.

    Raises Exception if, after the first revision, a non-empty header
    block is neither a revision nor a node block.
    """
    nodepaths = {}
    current_rev = None

    while True:
        # Parse a block of headers
        headers, eof = parse_header_block(fp)

        # This is a revision header block
        if 'Revision-number' in headers:

            # If there was a previous revision, dump it
            if current_rev:
                dump_revision(current_rev, nodepaths)

            # Reset the data for this revision
            current_rev = headers['Revision-number']
            nodepaths = {}

            # Skip the contents
            prop_len = headers.get('Prop-content-length', 0)
            fp.read(int(prop_len))

        # This is a node header block
        elif 'Node-path' in headers:

            # Make a new NodePath object, and add it to the
            # dictionary thereof
            path = headers['Node-path']
            nodepaths[path] = NodePath(path, headers)

            # Skip the content
            text_len = headers.get('Text-content-length', 0)
            prop_len = headers.get('Prop-content-length', 0)
            fp.read(int(text_len) + int(prop_len))

        # Not a revision, not a node -- if we've already seen at least
        # one revision block, we are in an errorful state.
        elif current_rev and headers:
            raise Exception('Header block from outta nowhere')

        if eof:
            # Flush the final revision before stopping.
            if current_rev:
                dump_revision(current_rev, nodepaths)
            break

def usage():
    """Write the help text to stdout and exit the process with status 0."""
    lines = [
        'Usage: ' + sys.argv[0] + ' [DUMPFILE]',
        '',
        'Reads a Subversion dumpfile from DUMPFILE (or, if not provided,',
        'from stdin) and normalizes the metadata contained therein,',
        'printing summarized and sorted information.  This is useful for',
        'generating data about dumpfiles in a diffable fashion.',
    ]
    # One write() instead of print statements, so this runs on Python 2 and 3.
    sys.stdout.write('\n'.join(lines) + '\n')
    sys.exit(0)
    
def main():
    """Normalize the dumpfile named by argv[1], or stdin when none is given.

    With '--help' as the first argument the usage text is printed and the
    process exits.
    """
    if len(sys.argv) > 1:
        if sys.argv[1] == '--help':
            usage()
        fp = open(sys.argv[1], 'rb')
        # Close the dumpfile even when parsing raises.
        try:
            parse_file(fp)
        finally:
            fp.close()
    else:
        # On Python 3 use the underlying binary stream so that read(n)
        # counts bytes, matching the file opened in 'rb' mode above;
        # Python 2's sys.stdin has no 'buffer' attribute and is used as is.
        parse_file(getattr(sys.stdin, 'buffer', sys.stdin))

    
# Run the normalizer only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()