check_cvs.in [plain text]

#! @PERL@ -w
########################################################################
#  Copyright (c) 2000, 2001 by Donald Sharp <sharpd@cisco.com>
#  All Rights Reserved
#
#  Permission is granted to copy and/or distribute this file, with or
#  without modifications, provided this notice is preserved.
#
########################################################################

=head1 check_cvs.pl

    Script to check the integrity of the Repository

=head1 SYNOPSIS

    check_cvs.pl

=head1 DESCRIPTION

    This script will search through a repository and determine if
    any of the files in it are corrupted.

    Please do not run this script inside of the repository itself,
    it will cause it too fail.

    Also it currently can only be run over the entire repository,
    so only point your CVSROOT at the actual CVSROOT.

=head1 OPTIONS

    There are no options.

=head1 EXAMPLES

    setenv CVSROOT /release/111/cvs

    # To see more verbose output
    setenv CVSDEBUGEDIT 1

    check_cvs.pl

=head1 SEE ALSO

    None

=cut

######################################################################
#                    MODULES                                         #
######################################################################
use strict;

use File::Find;
use File::Basename;
use File::Path;
use Cwd;

######################################################################
#                    GLOBALS                                         #
######################################################################

my @list_of_broken_files;
my @extra_files;
my $verbose = 0;

my $total_revisions;
my $total_interesting_revisions;
my $total_files;
my @ignore_files;

######################################################################
#                    SUBROUTINES                                     #
######################################################################

######################################################################
#
#    NAME :
#      main
#
#    PURPOSE :
#      To search the repository for broken files
#
#    PARAMETERS :
#      NONE
#
#    GLOBALS :
#      $ENV{ CVSROOT }       - The CVS repository to search through
#      $ENV{ CVSDEBUGEDIT }  - Turn on Debugging.
#      @list_of_broken_files - The list of files that need to
#                              be fixed.
#      $verbose              - is verbose mode on?
#      $total_revisions      - The number of revisions considered
#      $total_interesting_revisions - The number of revisions used
#      $total_files          - The total number of files looked at.
#
#    RETURNS :
#      A list of broken files
#
#    COMMENTS :
#      Do not run this script inside the repository.  Choose
#      a nice safe spot( like /tmp ) outside of the repository.
#
######################################################################
my $directory_to_look_at;

select (STDOUT); $| = 1;    # make unbuffered

$total_revisions = 0;
$total_interesting_revisions = 0;
$total_files = 0;

if( !exists( $ENV{ CVSROOT } ) )
{
    die( "The script should be run with the CVSROOT environment variable set" );
}

if( exists( $ENV{ CVSDEBUGEDIT } ) )
{
    $verbose = 1;
    print( "Verbose Mode Turned On\n" );
}

$directory_to_look_at = $ENV{ CVSROOT };
my $sym_count = 0;
while( -l $directory_to_look_at )
{
    $directory_to_look_at = readlink( $directory_to_look_at );
    $sym_count += 1;
    if( $sym_count > 5 )
    {
        die( "Encountered too many symlinks for $ENV{ CVSROOT }\n" );
    }
}

print( "Processing: $directory_to_look_at\n" ) if( $verbose );
@ignore_files = &get_ignore_files_from_cvsroot( $directory_to_look_at );
find( \&process_file, $directory_to_look_at );

my $num_files = @list_of_broken_files;
print( "List of corrupted files\n" ) if( $num_files > 0 );
foreach my $broken ( @list_of_broken_files )
{
    print( "**** File: $broken\n" );
}

$num_files = @extra_files;
print( "List of Files That Don't belong in Repository:\n" ) if( $num_files > 0 );
foreach my $extra ( @extra_files )
{
    print( "**** File: $extra\n" );
}
print( "Total Files: $total_files\n" );
print( "Total Revisions: $total_revisions Interesting Revisions: $total_interesting_revisions\n" );

######################################################################
#
#    NAME :
#      process_file
#
#    PURPOSE :
#      This function is called by the find function, it's purpose
#      is to decide if it is important to look at a file or not.
#      We only care about files that have the ,v at the end.
#
#    PARAMETERS :
#      NONE
#
#    GLOBALS :
#      $ENV{ CVSROOT } - The CVS repository to search through
#
#    RETURNS :
#      NONE
#
#    COMMENTS :
#      NONE
#
######################################################################
sub process_file
{
    my $path = $File::Find::name;

    $total_files += 1;
    $path =~ s/^$directory_to_look_at\///;

    print( "\tProcessing File: $path\n" ) if( $verbose );
    if( $path =~ /,v$/ )
    {
        $path =~ s/,v$//;
        look_at_cvs_file( $path );
    }
    elsif( ! -d $File::Find::name )
    {
        my $save = 0;

        foreach my $ignore ( @ignore_files )
        {
            if( $path =~ /$ignore/ )
            {
                $save = 1;
                last;
            }
        }

        if( !$save )
        {
            push( @extra_files, $path );
        }
    }
}

######################################################################
#
#    NAME :
#      look_at_cvs_file
#
#    PURPOSE :
#      To decide if a file is broken or not.  The algorithm is:
#      a)  Get the revision history for the file.
#              - If that fails the file is broken, save the fact
#                and continue processing other files.
#              - If that succeeds we have a list of revisions.
#      b)  For Each revision try to retrieve that version
#              - If that fails the file is broken, save the fact
#                and continue processing other files.
#      c)  Continue on 
#
#    PARAMETERS :
#      $file - The file to look at.
#
#    GLOBALS :
#      NONE
#
#    RETURNS :
#      NONE
#
#    COMMENTS :
#      We have to handle Attic files in a special manner.
#      Basically remove the Attic from the string if it
#      exists at the end of the $path variable.
#
######################################################################
sub look_at_cvs_file
{
    my( $file ) = @_;
    my( $name, $path, $suffix ) = fileparse( $file );

    if( $path =~ s/Attic\/$// )
    {
        $file = $path . $name;
    }

    my $revisions = get_history( $name );

    if( !defined( $revisions ) )
    {
        print( "\t$file is corrupted, this was determined via a cvs log command\n" ) if( $verbose );
        push( @list_of_broken_files, $file );
        return();
    }

    my @int_revisions = find_interesting_revisions( @$revisions );

    foreach my $revision ( @int_revisions )
    {
        print( "\t\tLooking at Revision: $revision\n" ) if( $verbose );
        if( !check_revision( $file, $revision ) )
        {
            print( "\t$file is corrupted in revision: $revision\n" ) if( $verbose );
            push( @list_of_broken_files, $file );
            return();
        }
    }

}

######################################################################
#
#    NAME :
#      get_history
#
#    PURPOSE :
#      To retrieve a array of revision numbers.
#
#    PARAMETERS :
#      $file - The file to retrieve the revision numbers for
#
#    GLOBALS :
#      NONE
#
#    RETURNS :
#      On Success - Reference to the list of revision numbers
#      On Failure - undef.
#
#    COMMENTS :
#      The $_ is saved off because The File::find functionality
#      expects the $_ to not have been changed.
#      The -N option for the rlog command means to spit out 
#      tags or branch names.
#
######################################################################
sub get_history
{
    my( $file ) = @_;
    $file =~ s/(["\$`\\])/\\$1/g;
    my @revisions;
    my $revision;
    my $ignore = 1;
    my $save_ = $_;

    open( FILE, "rlog -N \"$file\" 2>&1 |" ) or die( "unable to run rlog, help" );

    while( <FILE> )
    {
        #rlog outputs a "----" line before the actual revision
        #without this we'll pick up peoples comments if they 
        #happen to start with revision
	if( /^----------------------------$/ )
	{
	    $ignore = 0;
            next;
        }

	if( ( !$ignore ) && ( ( $revision ) = m/^revision (\S+)/ ) )
	{
            push( @revisions, $revision );
            $ignore = 1;
        }
    }

    $_ = $save_;

    if( !close( FILE ) )
    {
        return( undef );
    }

    return( \@revisions );
}

######################################################################
#
#    NAME :
#      check_revision
#
#    PURPOSE :
#      Given a file and a revision number ensure that we can 
#      check out that file
#
#    PARAMETERS :
#      $file     - The file to look at.
#      $revision - The revision to look at.
#
#    GLOBALS :
#      NONE
#
#    RETURNS :
#      If we can get the File - 1
#      If we can not get the File - 0
#
#    COMMENTS :
#      cvs command line options are as followed:
#        -n - Do not run any checkout program as specified by the -o
#             option in the modules file
#        -p - Put all output to standard out.
#        -r - The revision of the file that we would like to look at.
#      Please note that cvs will return 0 for being able to successfully
#      read the file and 1 for failure to read the file.
#
######################################################################
sub check_revision
{
    my( $file, $revision ) = @_;
    $file =~ s/(["\$`\\])/\\$1/g;

    my $cwd = getcwd();
    chdir( "/tmp" );

    my $ret_code = 0xffff & system( "cvs co -n -p -r $revision \"$file\" > /dev/null 2>&1" );

    chdir( $cwd );
    return( 1 ) if ( $ret_code == 0 );
    return( 0 );

    return( $ret_code );
}

######################################################################
#
#    NAME :
#      find_interesting_revisions
#
#    PURPOSE :
#      CVS stores information in a logical manner.  We only really
#      need to look at some interestin revisions.  These are:
#      The first version
#      And the last version on every branch.
#      This is because cvs stores changes descending from 
#      main line. ie suppose the last version on mainline is 1.6
#      version 1.6 of the file is stored in toto.  version 1.5
#      is stored as a diff between 1.5 and 1.6.  1.4 is stored 
#      as a diff between 1.5 and 1.4.
#      branches are stored a little differently.  They are 
#      stored in ascending order.  Suppose there is a branch
#      on 1.4 of the file.  The first branches revision number
#      would be 1.4.1.1.  This is stored as a diff between 
#      version 1.4 and 1.4.1.1.  The 1.4.1.2 version is stored
#      as a diff between 1.4.1.1 and 1.4.1.2.  Therefore
#      we are only interested in the earliest revision number
#      and the highest revision number on a branch.
#
#    PARAMETERS :
#      @revisions - The list of revisions to find interesting ones
#
#    GLOBALS :
#      NONE
#
#    RETURNS :
#      @new_revisions - The list of revisions that we find interesting
#
#    COMMENTS :
#
######################################################################
sub find_interesting_revisions
{
    my( @revisions ) = @_;
    my @new_revisions;
    my %branch_revision;
    my $branch_number;
    my $branch_rev;
    my $key;
    my $value;

    START_OVER:
    foreach my $revision( @revisions )
    {
        my $start_over = 0;
        ( $branch_number, $branch_rev ) = branch_split( $revision );

        #if the number of elements in the branch is 1
        #and the new branch is less than the old branch
        if( elements_in_branch( $branch_number ) == 1 )
        {
            ( $start_over,
              %branch_revision ) = find_int_mainline_revision( $branch_number,
                                                               $branch_rev,
                                                               %branch_revision );
            next START_OVER if( $start_over );
        }

        %branch_revision = find_int_branch_revision( $branch_number,
                                                     $branch_rev,
                                                     %branch_revision );

    }

    %branch_revision = remove_duplicate_branches( %branch_revision );

    while( ( $key, $value ) = each ( %branch_revision ) )
    {
        push( @new_revisions, $key . "." . $value );
    }

    my $nrc;
    my $rc;

    $rc = @revisions;
    $nrc = @new_revisions;

    $total_revisions += $rc;
    $total_interesting_revisions += $nrc;

    print( "\t\tTotal Revisions: $rc Interesting Revisions: $nrc\n" ) if( $verbose );

    return( @new_revisions );
}

########################################################################
#
#    NAME :
#      remove_duplicate_branches
#
#    PURPOSE :
#      To remove from the list of branches that we are interested
#      in duplication that will cause cvs to check a revision multiple
#      times.  For Instance revision 1.1.1.1 should be prefered
#      to be checked over revision 1.1, as that v1.1.1.1 can
#      only be retrieved by going through v1.1.  Therefore
#      we should remove v1.1 from the list of branches that
#      are interesting.
#
#    PARAMETERS :
#      %branch_revisions - The hash of the interesting revisions
#
#    GLOBALS :
#      NONE
#
#    RETURNS :
#      %branch_revisions - The hash of the modified interesting revisions
#
#    COMMENTS :
#      NONE
#
########################################################################
sub remove_duplicate_branches
{
    my( %branch_revisions ) = @_;
    my $key;
    my $value;
    my $branch_comp;
    my $branch;


  RESTART:
    {
        my @keys = keys( %branch_revisions );
        while( ( $key, $value ) = each ( %branch_revisions ) )
        {
            $branch_comp = $key . "." . $value;
            foreach $branch ( @keys )
            {
                if( $branch eq $key )
                {
                    next;
                }
                if( elements_in_branch( $branch_comp ) ==
                    elements_in_branch( $branch ) - 1 )
                {
                    if( $branch =~ /^$branch_comp/ )
                    {
                        delete( $branch_revisions{ $key } );
                        goto RESTART;
                    }
                }
            }
        }
    }

    return( %branch_revisions );
}

######################################################################
#
#    NAME :
#      find_int_branch_revision
#
#    PURPOSE :
#      To Find a interesting branch revision.
#      Algorithm:
#        If the $branch_revision exists in the interesting branch
#        hash and the new $branch_rev is less than currently saved
#        one replace it with the new $branch_rev.
#        else if the $branch_revision doesn't exist in the interesting
#        branch hash, then just store the $branch_number and $branch_rev
#
#    PARAMETERS :
#      $branch_number - The branch that we are looking at
#      $branch_rev    - The particular revision we are looking
#                       at on the $branch_number.
#      %branch_revision - The hash storing the interesting branches
#                         and the revisions on them.
#
#    GLOBALS :
#      NONE
#
#    RETURNS :
#      %branch_revision - The modified hash that stores interesting
#                         branches.
#
#    COMMENTS :
#      NONE
#
######################################################################
sub find_int_branch_revision
{
    my( $branch_number, $branch_rev, %branch_revision ) = @_;

    if( exists( $branch_revision{ $branch_number } ) )
    {
        if( $branch_rev > $branch_revision{ $branch_number } )
        {
            $branch_revision{ $branch_number } = $branch_rev;
        }
    }
    else
    {
        $branch_revision{ $branch_number } = $branch_rev;
    }

    return( %branch_revision );
}

######################################################################
#
#    NAME :
#      find_int_mainline_revision
#
#    PURPOSE :
#      To Find a interesting mainline revision.
#      Algorithm:
#        if the $branch_number is less then a branch number
#        with one element in it, then delete the old branch_number
#        and return.
#        if the $branch_number is greater than a branch number
#        then return, and tell the calling function that we
#        should skip this element, as that it's not important.
#        if the $branch_number is the same as a branch number
#        with one element in it, then check to see if the
#        $branch_rev is less than the stored branch rev if
#        it is replace with new $branch_rev.  Else ignore revision
#
#    PARAMETERS :
#      $branch_number - The branch that we are looking at
#      $branch_rev    - The particular revision we are looking
#                       at on the $branch_number.
#      %branch_revision - The hash storing the interesting branches
#                         and the revisions on them.
#
#    GLOBALS :
#      NONE
#
#    RETURNS :
#      ( $skip, %branch_revision ) -
#      $skip - 1 if we need to ignore this particular $branch_number
#              $branch_rev combo.  Else 0.
#      %branch_revision - The modified hash that stores interesting
#                         branches.
#
#    COMMENTS :
#      NONE
#
######################################################################
sub find_int_mainline_revision
{
    my( $branch_number, $branch_rev, %branch_revision ) = @_;

    foreach my $key ( keys %branch_revision )
    {
        if( elements_in_branch( $key ) == 1 )
        {
            if( $branch_number < $key )
            {
                delete( $branch_revision{ $key } );
                next;
            }

            if( $branch_number > $key )
            {
                return( 1, %branch_revision );
            }
            if( ( exists( $branch_revision{ $branch_number } ) ) &&
                ( $branch_rev < $branch_revision{ $branch_number } ) )
            {
                $branch_revision{ $branch_number } = $branch_rev;
                return( 1, %branch_revision );
            }
        }
    }

    return( 0, %branch_revision );
}

######################################################################
#
#    NAME :
#      elements_in_branch
#
#    PURPOSE :
#      Determine the number of elements in a revision number
#      Elements are defined by numbers seperated by ".".
#      the revision 1.2.3.4 would have 4 elements
#      the revision 1.2.4.5.6.7 would have 6 elements
#
#    PARAMETERS :
#      $branch - The revision to look at.
#
#    GLOBALS :
#      NONE
#
#    RETURNS :
#      $count - The number of elements
#
#    COMMENTS :
#      NONE
#
######################################################################
sub elements_in_branch
{
    my( $branch ) = @_;
    my @split_rev;

    @split_rev = split /\./, $branch;

    my $count = @split_rev;
    return( $count );
}

######################################################################
#
#    NAME :
#      branch_split
#
#    PURPOSE :
#      To split up a revision number up into the branch part and
#      the number part.  For Instance:
#      1.1.1.1 - is split 1.1.1 and 1
#      2.1     - is split 2 and 1
#      1.3.4.5.7.8 - is split 1.3.4.5.7 and 8
#
#    PARAMETERS :
#      $revision - The revision to look at.
#
#    GLOBALS :
#      NONE
#
#    RETURNS :
#      ( $branch, $revision ) - 
#      $branch - The branch part of the revision number 
#      $revision - The revision part of the revision number
#
#    COMMENTS :
#      NONE
#
######################################################################
sub branch_split
{
    my( $revision ) = @_;
    my $branch;
    my $version;
    my @split_rev;
    my $count;

    @split_rev = split /\./, $revision;

    my $numbers = @split_rev;
    @split_rev = reverse( @split_rev );
    $branch = pop( @split_rev );
    for( $count = 0; $count < $numbers - 2 ; $count++ )
    {
        $branch .= "." . pop( @split_rev );
    }

    return( $branch, pop( @split_rev ) );
}

######################################################################
#
#    NAME :
#      get_ignore_files_from_cvsroot
#
#    PURPOSE :
#      Retrieve the list of files from the CVSROOT/ directory
#      that should be ignored. 
#      These are the regular files (e.g., commitinfo, loginfo)
#      and those specified in the checkoutlist file.
#
#    PARAMETERS :
#      The CVSROOT
#
#    GLOBALS :
#      NONE
#
#    RETURNS :
#      @ignore - the list of files to ignore
#
#    COMMENTS :
#      NONE
#
######################################################################
sub get_ignore_files_from_cvsroot {
    my( $cvsroot ) = @_;
    my @ignore = ( 'CVS\/fileattr$',
                   '^CVSROOT\/loginfo',
                   '^CVSROOT\/.#loginfo',
                   '^CVSROOT\/rcsinfo',
                   '^CVSROOT\/.#rcsinfo',
                   '^CVSROOT\/editinfo',
                   '^CVSROOT\/.#editinfo',
                   '^CVSROOT\/verifymsg',
                   '^CVSROOT\/.#verifymsg',
                   '^CVSROOT\/commitinfo',
                   '^CVSROOT\/.#commitinfo',
                   '^CVSROOT\/taginfo',
                   '^CVSROOT\/.#taginfo',
                   '^CVSROOT\/cvsignore',
                   '^CVSROOT\/.#cvsignore',
                   '^CVSROOT\/checkoutlist',
                   '^CVSROOT\/.#checkoutlist',
                   '^CVSROOT\/cvswrappers',
                   '^CVSROOT\/.#cvswrappers',
                   '^CVSROOT\/notify',
                   '^CVSROOT\/.#notify',
                   '^CVSROOT\/modules',
                   '^CVSROOT\/.#modules',
                   '^CVSROOT\/readers',
                   '^CVSROOT\/.#readers',
                   '^CVSROOT\/writers',
                   '^CVSROOT\/.#writers',
                   '^CVSROOT\/passwd',
                   '^CVSROOT\/config',
                   '^CVSROOT\/.#config',
                   '^CVSROOT\/val-tags',
                   '^CVSROOT\/.#val-tags',
                   '^CVSROOT\/history' );
    my $checkoutlist_file = "$cvsroot\/CVSROOT\/checkoutlist";
    open( CHECKOUTLIST, "<$cvsroot\/CVSROOT\/checkoutlist" )
        or die( "Unable to read checkoutlist file: $!\n" );

    my @list = <CHECKOUTLIST>;
    chomp( @list );
    close( CHECKOUTLIST )
        or die( "Unable to close checkoutlist file: $!\n" );

    foreach my $line( @list )
    {
        next if( $line =~ /^#/ || $line =~ /^$/ );
        if( $line =~ /^\s*(\S*)\s*/ ) { $line = $1 };
        push( @ignore, "^CVSROOT\/$line", "^CVSROOT\/\.#$line" );
    }	

    return @ignore;
}