# cmpCSUM.rb

#!/usr/local/bin/ruby

# Operational Options
#   -doPrefix=(Y|N)
#       Find maximal common prefix
#   -csum=(md5|sha1)
#       Set the checksum to use
# Printing/Display Options
#   -sCOL=pattern
#       Search criteria -- i.e. which file data lines to print. "COL"
#       is the name of a column, or the keyword 'ALL'. When a column
#       other than NAME is given, then it will be used to match the
#       first part of the printed output for that column.  When NAME
#       is the column, it will be used as a regexp against the NAME
#       col.  When appearing multiple times, these options combine
#       with "OR".  The ALL keyword directs the script to display a
#       line for Every file -- even the ones that are the same on both
#       sides. The default is to print if anything is different.  Any
#       option of this type will destroy the default setting.
#   -pColTitles=(Y|N)
#       Print column titles.  Default is 'Y'
#   -pDups=(Y|N)
#       Under each file line printed, also print the names of files
#       with same content, but with different names.
#   -pPrefix=(Y|N)
#       If -doPrefix=Y, then print the maximal common prefix before
#       the file data and column titles are printed.
#   -pCols=col1,col2,...
#       What cols to print. Note that "NAME" must be last if it is
#       listed because it has a variable column width.
#   -backup
#       Set options so that the output is useful for backup programs:
#          -pPrefix=N -pColTitles=N -pCols=NAME -sH=\| -sH=\>
#       This is useful when one wishes to have a list of "new" files
#       or files that have "changed" in the case when the first
#       checksum file is assumed to be for the backup directory while
#       the second is assumed to be the current working directory from
#       which the backup was generated.
#
# Output:
#
# One file-name per line, with the following columns (named as in -pCols and -sCOL):
#   * NL: Number of copies on left with same check-sum
#   * NR: Number of copies on right with same check-sum
#   * H: Checksum (content) difference between current file and file on other side of same name:
#     = same
#     | different
#     < name on left only (but if NR>0, then a copy exists on right with different name)
#     > name on right only (but if NL>0, then a copy exists on left with different name)
#   * CT: Create time (ctime)
#     .. N/A -- file is missing on one side
#     == same
#     <U left older
#     >U left newer
#        Where U is one of: (s)econds, (h)ours, (d)ays, (w)eeks, (m)onths, (y)ears
#   * MT: Modify time (mtime)
#     same notation as CT
#   * SZ: File size
#     .. N/A -- file is missing on one side
#     == same
#     <U left smaller
#     >U left bigger
#        Where U is one of: (B)ytes, (K)ilobytes, (M)egabytes, (G)igabytes
#   * HASH: Checksum of the file
#   * NAME: File name (without the common prefix when -doPrefix=Y)

################################################################################################################################
# Process command line arguments.
# Option defaults.  Every one of these may be overridden from the command line below.
doPrefix    = true                                                            # Compute the maximal common prefix
pDups       = false                                                           # Print names of same-content files
pPrefix     = true                                                            # Print the prefixes before the report
pColTitles  = true                                                            # Print the column title line
pCols       = ['NL', 'NR', 'H', 'CT', 'MT', 'SZ', 'HASH', 'NAME']             # Columns to print, in order
# Width part of the "%...s" format used for each column (negative => left justified).
# NOTE(review): the default HASH width is 36 while -csum=md5 sets 31; both differ from the
# 32 characters of an md5 hex digest -- confirm the intended width.
pColsFmt    = { 'NL'   => "-3",
                'NR'   => "-3",
                'H'    => "-1",
                'CT'   => "-2",
                'MT'   => "-2",
                'SZ'   => "-2",
                'HASH' => "-36",
                'NAME' => 0  }
# Default search: print a line when any of these columns does not start with '='
searchArg   = [[ 'H'  , '!=' ],
               [ 'CT' , '!=' ],
               [ 'MT' , '!=' ],
               [ 'SZ' , '!=' ]]
searchArgD  = true                                                            # True until a -sCOL= option replaces searchArg
csumToRep   = [ 0, 1 ]                                                        # Indexes of the checksum files to report on
csumType    = 'md5'
csumFiles   = Array.new
# All option patterns are anchored at the start of the argument (\A).  Without the anchor a
# checksum file called e.g. "nightly-backup.md5" would be taken for the -backup option.
srchArgPat  = Regexp.new('\A-+s(' + pCols.join('|') + ')=(.+)$')
# Process one command line argument: update the matching option, or record it as a checksum file.
handleArg = lambda do |curArg|
  if (curArg.match(/\A-+backup/i)) then
    pPrefix    = false
    pColTitles = false
    searchArg  = [ ['H', '|'],  ['H', '>'] ]
    pCols      = [ 'NAME' ]
  elsif (curArg.match(/\A-+sALL/)) then
    searchArg = nil                                                           # nil => print every file
    searchArgD = true
  elsif (tmp=curArg.match(/\A-+pCols=(.+)/i)) then
    pCols    = tmp[1].split(/\s*,\s*/)
  elsif (tmp=curArg.match(srchArgPat)) then
    if (searchArgD) then                                                      # First -sCOL= option discards the current setting
      searchArg = Array.new
      searchArgD = false
    end
    searchArg.push([ tmp[1], tmp[2] ])
  elsif (tmp=curArg.match(/\A-+pColTitles=(Y|N)/i)) then
    pColTitles    = (tmp[1].upcase=='Y')
  elsif (tmp=curArg.match(/\A-+pDups=(Y|N)/i)) then
    pDups    = (tmp[1].upcase=='Y')
  elsif (tmp=curArg.match(/\A-+pPrefix=(Y|N)/i)) then
    pPrefix = (tmp[1].upcase=='Y')
  elsif (tmp=curArg.match(/\A-+doPrefix=(Y|N)/i)) then
    doPrefix = (tmp[1].upcase=='Y')
  elsif (tmp=curArg.match(/\A-+csum=(md5|sha1)/i)) then
    csumType   = tmp[1].downcase
    pColsFmt['HASH'] = ( csumType == 'md5' ? "-31" : "-40" )
  else
    csumFiles.push(curArg)
    if (curArg.match(/^-/)) then
      STDERR.puts("WARNING: Argument assumed to be file-name: #{curArg}!")
    end
  end
end
ARGV.each(&handleArg)
# Make sure the arguments are OK
# Exactly two checksum files (left and right) are required.  Exit with a non-zero status on
# error so that calling scripts can tell the comparison did not run.
if (csumFiles.length > 2) then
  STDERR.puts("ERROR: too many checksum files to process: #{csumFiles.inspect}!")
  exit(1)
elsif (csumFiles.length < 2) then
  STDERR.puts("ERROR: too few checksum files to process: #{csumFiles.inspect}!")
  exit(1)
end

################################################################################################################################
# Compute units on a delta
# Describe how two numbers differ as a short code.
#   a, b  -- the left and right values; either may be nil
#   utype -- 'time' or 'size'; selects the table of unit thresholds and labels
# Returns:
#   '..' when either value is nil
#   '==' when the values are equal
#   '<' or '>' (a smaller or bigger than b) followed by the label of the first, i.e. largest,
#        unit whose threshold the absolute difference reaches
#   'ER' when the difference reaches none of the thresholds
def deltaUnits(a, b, utype)
  return '..' if (a.nil? || b.nil?)
  return '==' if (a == b)
  unitTable = {
    'time' => [ [ 31536000,   'y' ], [ 2592000, 'm' ], [ 604800, 'w' ], [ 86400, 'd' ], [3600, 'h' ], [ 1, 's' ] ],
    'size' => [ [ 1073741824, 'G' ], [ 1048576, 'M' ], [ 1024,   'K' ], [ 1,     'B' ] ]
  }
  # Tables are ordered largest unit first, so the first hit is the biggest unit that fits
  biggestFit = unitTable[utype].find { |threshold, _label| (a-b).abs >= threshold }
  return 'ER' if (biggestFit.nil?)
  (a < b ? '<' : '>') + biggestFit[1]
end

################################################################################################################################
# Read in the data files...
# fileInfo holds one table per key.  Each table is an array indexed by checksum file
# (0 = first file given, 1 = second file given):
#   N2SN  file name => short name (the full name until prefixes are stripped)
#   N2H   file name => checksum          N2CT  file name => ctime
#   N2MT  file name => mtime             N2SZ  file name => size (character count field)
#   H2N   checksum  => array of file names having that checksum
#   PFIX  longest leading string shared by all file names of that checksum file
#         (only maintained when doPrefix is set; stays nil for a file without data lines)
fileInfo  = Hash.new
['N2SN', 'N2H', 'N2CT', 'N2MT', 'N2SZ', 'H2N', 'PFIX'].each do |key|          # Create all the arrays
  fileInfo[key]  = Array.new
end
csumFiles.each_with_index do |fileName, i|
  ['N2SN', 'N2H', 'N2CT', 'N2MT', 'N2SZ', 'H2N'].each do |key|                # Init all the fileInfo members
    fileInfo[key][i]  = Hash.new
  end
  # File.open, not Kernel#open: the latter runs a command when the name starts with '|', and
  # the name comes straight from the command line.
  File.open(fileName, 'r') do |file|
    file.each_line do |line|
      # Ten space separated fields; the split limit keeps spaces inside the trailing file name
      _timeStamp, _atime, ctime, mtime, md5, sha1, _prtCharCnt, _lineCnt, charCnt, fname = line.chomp.split(/ /, 10)
      if (fname.nil?) then                                                    # Blank or short line: nothing usable
        STDERR.puts("WARNING: Skipping malformed line in #{fileName}: #{line.chomp.inspect}!")
        next
      end
      if (doPrefix) then                                                      # Find the maximal path-name prefix
        prefix = fileInfo['PFIX'][i]
        if (prefix.nil?) then
          fileInfo['PFIX'][i] = fname                                         # First name seen is the initial candidate
        else
          # Shrink the candidate to the leading characters it shares with this name
          sameLen = 0
          sameLen += 1 while (sameLen < prefix.length && prefix[sameLen] == fname[sameLen])
          fileInfo['PFIX'][i] = prefix[0, sameLen]
        end
      end
      csum = ( csumType=='md5' ? md5 : sha1 )
      fileInfo['N2H'][i][fname]  = csum                                       # Store away the data...
      fileInfo['N2SN'][i][fname] = fname
      fileInfo['N2CT'][i][fname] = ctime.to_i
      fileInfo['N2MT'][i][fname] = mtime.to_i
      fileInfo['N2SZ'][i][fname] = charCnt.to_i
      (fileInfo['H2N'][i][csum] ||= Array.new).push(fname)
    end
  end
end

################################################################################################################################
# Post-process the data
#Compute short names if we found a prefix...
# Replace each short name by the file name with its side's common prefix removed.
if (doPrefix) then
  fileInfo['N2H'].each_index do |i|
    prefix = fileInfo['PFIX'][i]
    # No prefix was recorded for a checksum file without data lines; an empty prefix strips nothing
    next if (prefix.nil? || prefix.empty?)
    # The prefix is literal text, not a pattern: escape it so that path characters such as
    # '+', '.', '(' or '[' neither change the match nor raise a RegexpError.
    prefixRe = Regexp.new('\A' + Regexp.escape(prefix))
    fileInfo['N2H'][i].keys.each do |fname|
      fileInfo['N2SN'][i][fname] = fname.sub(prefixRe, '')
    end
  end
end

################################################################################################################################
# Build up the search patterns from the inputs before we need them in the output stage below
# searchCriteria stays nil when every file is to be printed.  Otherwise it is a list of
# [column, matcher] pairs where matcher is either
#   * a Regexp (NAME column only), or
#   * [negate, text]: text is compared with the leading characters of the column value and
#     a leading '!' in the argument sets negate.
searchCriteria = nil
if ( !(searchArg.nil?)) then
  searchCriteria = Array.new
  searchArg.each do |tag, pat|
    if    (tag.upcase == 'NAME') then
      begin
        searchCriteria.push( [ tag, Regexp.new(pat) ] )
      rescue RegexpError => err                                               # Report a bad pattern like any other bad argument
        STDERR.puts("ERROR: Bad search argument: #{tag.inspect} => #{pat.inspect} (#{err.message})!")
        exit(1)
      end
    elsif (tmp=pat.match(/^(!{0,1})(.+)$/i)) then
      searchCriteria.push([tag, [ (tmp[1] == '!'), tmp[2]] ])
    else
      STDERR.puts("ERROR: Bad search argument: #{tag.inspect} => #{pat.inspect}!")
      exit(1)                                                                 # Non-zero status: the report was not produced
    end
  end
end

################################################################################################################################
# Print out report
# Report preamble: the prefix found for each side (only when prefixes were computed and are
# wanted), followed by the column title line.
if (doPrefix && pPrefix) then
  leftPrefix, rightPrefix = fileInfo['PFIX'].values_at(0, 1)
  puts("< Prefix: #{leftPrefix.inspect}")
  puts("> Prefix: #{rightPrefix.inspect}")
end
if (pColTitles) then
  # Each title is padded with the same width format that is used for the column's data
  titleCells = pCols.map { |title| sprintf("%#{pColsFmt[title]}s ", title) }
  puts(titleCells.join)
end
# Main report loop.  Walks the names of each checksum file listed in csumToRep, in sorted
# order, and handles every short name once: a name already handled while walking an earlier
# file is skipped.  For each name the column values are computed, the search criteria are
# evaluated, and matching names are printed (optionally followed by same-content names).
fnameSeenInBigList = Hash.new   # Short names already handled as a main report line
fnameSeenInDupList = Hash.new   # Short names already printed in a same-content list (each is printed at most once)
csumToRep.each do |i|
  j = (i-1).abs # Index of the other file (0=>1, 1=>0)
  fileInfo['N2H'][i].keys.sort.each do |fnamei|
    shortName = fileInfo['N2SN'][i][fnamei]

    if (! (fnameSeenInBigList.member?(shortName))) then
      theCols = Hash.new        # Column name => printable value for this file
      fnameSeenInBigList[shortName] = 1;

      # Full name of the file on each side.  The other side's name is its own prefix plus the
      # short name; when no prefix is recorded for that side the same full name is used.
      fnames = Array.new
      fnames[i] = fnamei
      fnames[j] = ( fileInfo['PFIX'][j] ? fileInfo['PFIX'][j] + shortName : fnamei )

      inFile  = [0, 1].map { |k| fileInfo['N2H'][k].member?(fnames[k]) }             # What sides is file name on
      curHash = fileInfo['N2H'][i][fnames[i]]                                        # Hash of current file
      numFnd  = [0, 1].map { |k| (fileInfo['H2N'][k][curHash] || Array.new).length } # Number of times hash appears on each side
      theCols['NL'] = sprintf('%03d', numFnd[0])
      theCols['NR'] = sprintf('%03d', numFnd[1])

      # H column: '=' same checksum on both sides, '|' different checksum,
      # '<' name exists on left only, '>' name exists on right only
      theCols['H']  = ''
      if(inFile[0] && inFile[1]) then
        if(curHash == fileInfo['N2H'][j][fnames[j]]) then
          theCols['H'] = '='
        else
          theCols['H'] = '|'
        end
      else
        if(inFile[0]) then
          theCols['H'] = '<'
        else
          theCols['H'] = '>'
        end
      end

      # Always compare left (0) against right (1); a name missing on one side looks up as nil
      theCols['CT'] = deltaUnits(fileInfo['N2CT'][0][fnames[0]], fileInfo['N2CT'][1][fnames[1]], 'time')
      theCols['MT'] = deltaUnits(fileInfo['N2MT'][0][fnames[0]], fileInfo['N2MT'][1][fnames[1]], 'time')
      theCols['SZ']  = deltaUnits(fileInfo['N2SZ'][0][fnames[0]], fileInfo['N2SZ'][1][fnames[1]], 'size')

      theCols['HASH'] = fileInfo['N2H'][i][fnames[i]]
      theCols['NAME'] = shortName

      # Evaluate search criteria
      # The criteria combine with OR: evaluation stops at the first one that matches.  With
      # no criteria at all (nil) every file is printed.
      printThisOne = false
      if (searchCriteria.nil?) then
        printThisOne = true
      else
        searchCriteria.each do |tag, pat|
          if (printThisOne) then
            break
          end
          if (pat.class == Regexp) then
            # Regexp criterion: matched against the column value
            if (theCols[tag].match(pat)) then
              printThisOne = true
            end
          else
            # [negate, text] criterion: compare text with the start of the column value
            tmp = (theCols[tag].slice(0, pat[1].length) == pat[1])
            printThisOne = ( pat[0] ? !tmp : tmp )
          end
        end
      end

      if (printThisOne) then
        pCols.each { |ct| printf("%#{pColsFmt[ct]}s ", theCols[ct]) }
        printf("\n")
        if(pDups) then
          # Collect the short names of all files with the same checksum as the current file:
          # '<' found on left only, '>' found on right only, '=' found on both sides
          sameContentFileList = Hash.new
          [0, 1].each do |k|
            if(fileInfo['H2N'][k].member?(curHash)) then
              fileInfo['H2N'][k][curHash].sort.each do |dfname|
                dshortName = fileInfo['N2SN'][k][dfname]
                if (sameContentFileList.member?(dshortName)) then
                  sameContentFileList[dshortName] = '='
                else
                  if (k == 0)
                    sameContentFileList[dshortName] = '<'
                  else
                    sameContentFileList[dshortName] = '>'
                  end
                end
              end
            end
          end
          # Only list when more than one distinct short name shares the content
          if (sameContentFileList.keys.length > 1) then
            sameContentFileList.each do |dshortName, locs|
              if ( !(fnameSeenInDupList.member?(dshortName))) then
                # NOTE(review): the 53 space indent presumably lines the name up with the NAME
                # column of the default column set -- confirm
                printf("%s %s %s\n", ' '*53, locs, dshortName)
                fnameSeenInDupList[dshortName] = 1
              end
            end
          end
        end
      end
    end
  end
end

# Generated by GNU Enscript 1.6.5.2.