#!/usr/bin/env perl

=head1 NAME

dupes - Report on files with duplicate contents, via SHA1 hash.

=cut

=head1 SYNOPSIS

  dupes [options] directory

  General Options:

   --help        Show the help information for this script.
   --verbose     Show useful debugging information.

=cut

=head1 ABOUT

dupes is a simple script to report upon files that are identical, recursively.

The process involves calculating the SHA1 hash of the file contents and
reporting on any collisions we see.

Symbolic links are skipped, but a collision might still be caused by
hardlinks to the same file, so blindly deleting duplicates without
investigation is almost certainly a mistake.

=cut

=head1 AUTHOR

 Steve
 --
 http://www.steve.org.uk/

=cut

=head1 LICENSE

Copyright (c) 2013 by Steve Kemp.  All rights reserved.

This script is free software; you can redistribute it and/or modify it under
the same terms as Perl itself.

The LICENSE file contains the full text of the license.

=cut

use strict;
use warnings;

use File::Find;
use Getopt::Long;
use Pod::Usage;


#
#  Parse the arguments
#
my %config = parsedOptions();

#
#  The path to examine.
#
#  Test for definedness/length rather than truth, so that a directory
# which is literally named "0" is not silently replaced by ".".
#
my $path = ( defined $ARGV[0] && length $ARGV[0] ) ? $ARGV[0] : '.';

#
#  Get the hashing object, dynamically.
#
my $ctx = getHashObject();

#
#  Map of hex digest => array of the files which have that digest, in
# the order in which they were discovered.
#
my %files_for;

#
#  Find files and store the hash of their contents.
#
find( {
       'wanted' => sub {

           # With no_chdir set $_ holds the complete path to the entry.
           my $file = $_;

           # lstat, not stat: a symbolic link must be seen as a link so
           # that it is neither hashed nor reported as a directory.
           # A failure here means the entry vanished mid-scan.
           lstat($file) or return;

           if ( -d _ ) {
               print "Entering $file\n" if ( $config{ 'verbose' } );
               return;
           }

           # Only readable, regular files are hashed.  After lstat a
           # symbolic link is never -f, so links are skipped here too.
           return unless ( ( -f _ ) && ( -r _ ) );

           # Open the file ourselves and hand over the handle:  both
           # Digest::SHA and Digest::SHA1 accept a filehandle, whereas
           # only Digest::SHA accepts a filename.
           my $fh;
           if ( !open( $fh, '<', $file ) ) {
               warn "Skipping $file: $!\n";
               return;
           }
           binmode($fh);

           # A read error must skip this file, not abort the whole scan.
           $ctx->reset;
           my $sha = eval { $ctx->addfile($fh); $ctx->hexdigest };
           my $error = $@;
           close($fh);

           if ( !defined($sha) ) {
               warn "Skipping $file: $error";
               return;
           }

           push @{ $files_for{ $sha } }, $file;
       },
       'no_chdir' => 1
     },
     $path );


#
#  Report upon collisions.
#
#  The groups are sorted by their first member so that repeated runs
# over the same tree produce the same report.
#
foreach my $sha (
     sort { $files_for{ $a }->[0] cmp $files_for{ $b }->[0] } keys %files_for )
{
    my ( $src, @dupes ) = @{ $files_for{ $sha } };

    # A digest seen only once is not a collision.
    next unless (@dupes);

    print $src . "\n";

    foreach my $dupe (@dupes) {
        print "\t$dupe\n";
    }
}

#
#  All done.
#
exit(0);


=begin doc

Load one of Digest::SHA and Digest::SHA1, depending on what is available.

=end doc

=cut

#
#  Return a new SHA-1 hashing object.
#
#  Digest::SHA is tried first, then Digest::SHA1.  The first module
# which can be loaded is instantiated and returned.
#
#  If neither module is available an error is written to STDERR and
# the script exits with status 1.
#
sub getHashObject
{
    foreach my $module (qw! Digest::SHA Digest::SHA1 !) {

        # Translate Foo::Bar into Foo/Bar.pm so that require can be
        # given a path, which avoids evaluating generated source code.
        ( my $file = "$module.pm" ) =~ s{::}{/}g;

        # Test the value of the eval, rather than $@, to detect failure.
        next unless ( eval { require $file; 1 } );

        # The first module which loads wins.
        return ( $module->new );
    }

    print STDERR "Failed to load either Digest::SHA or Digest::SHA1\n";
    exit(1);
}


=begin doc

Parse the options and return suitable values.

The return value is a hash with the keys "help" and "verbose".  Arguments
which are not options are left in @ARGV.

Invalid options terminate the script with status 1, and --help shows the
usage information and terminates with status 0.

=end doc

=cut

sub parsedOptions
{
    my %vars;

    # GetOptions has already reported the problem on STDERR, so all that
    # remains is to fail with a non-zero status.
    GetOptions( "help"    => \$vars{ 'help' },
                "verbose" => \$vars{ 'verbose' } ) or
      exit(1);

    # Asking for help is not an error, so exit with status 0.
    pod2usage(0) if ( $vars{ 'help' } );

    return (%vars);
}