#!/usr/bin/perl
# Copyright (c) 2000 Flavio Glock <fglock@pucrs.br>. All rights reserved.
# This program is free software; you can redistribute it and/or
# modify it under the same terms as Perl itself.
#
# repetidos.pl
# finds duplicate directories and files
#
# USAGE:
#	repetidos.pl ([directory|file] ...)
#
# TODO:
#	--verbose
#	--outfile
#	use compare_text for text files
#
# 0.008
#	always group compares
#
# 0.007
#	prints filenames using "\" in Windows
#
# 0.006
#	more uniform compare
#	doesn't miss compares
#	Directories can match even if filenames are different
#
# 0.005
#	nicer output
#	Uses more pointers
#
# 0.004
#   Corrected Subdir size
#
# 0.003
#	Sorts numerically by directory size
#
# 0.002
#	Subdir compare is exact
#
# 0.001
# 	Subdir compare is uncertain - will show "?"
# 	File compare is exact
#

use File::Compare;

$dots = 20;
$join = '/';
$separator = '#';
$nFields = 2;

$a  = shift;
$a	= '.' unless $a;

@b = ();
@subdir = ();
%visited = ();
print "Reading:";
while ($a) {
	# $a =~ s|\\|\/|g;
	$a =~ s/([^:])\/$/$1/;
	if ($visited{$a}) {
		print "\n\t(", &print_filename($a), " done)";
	}
	else {
		$size = &readmydir ($a);
		if (-d $a) {
			push @subdir, "$size$separator$a";
			$visited{$a} = 1;
		}
	}
	$a  = shift;
}
undef %visited;
print "\n\n";

# print "Sorting: ";
# sort files in reverse order of filesize
@b = sort  {$b <=> $a} @b;
foreach(0 .. $#b) {
	($filesize[$_], $filename[$_]) = split($separator, $b[$_], $nFields);
}
undef @b;
# sort dirs in reverse order of the sum of filesizes
@subdir = sort {$b <=> $a} @subdir;
foreach(0 .. $#subdir) {
	($dirsize[$_], $dirname[$_]) = split($separator, $subdir[$_], $nFields);
}
undef @subdir;
# print "done.\n";

print "Duplicate Subdirs:\n";
foreach (0 .. $#dirname) {
	$last = $_ + 1;
	$match = 0;
	while (($last <= $#dirname) and	($dirsize[$_] == $dirsize[$last])) {
		if ( &compare_dir ($_, $last) == 0 ) {
			$report_size = $dirsize[$_];
			$report_size =~ s/(.*)\.(.*)/$1 bytes in $2 files/;
			print "\n\t", &print_filename($dirname[$_]), " \t$report_size\n" unless $match;
			print "\t", &print_filename($dirname[$last]), "\n";
			splice(@dirname, $last, 1);
			splice(@dirsize, $last, 1);
			$match = 1;
		}
		else {
			$last++;
		}
	}
}
print "\n";

print "Duplicate Files:\n";
foreach(0 .. $#filename) {
	$last = $_ + 1;
	$match = 0;
nextfile:
	while (($last < $#filename) and	($filesize[$_] == $filesize[$last])) {
		if ( &compare_file ($_, $last) == 0 ) {
			print "\n\t", &print_filename($filename[$_]), " \t$filesize[$_] bytes\n" unless $match;
			print "\t", &print_filename($filename[$last]), "\n";			splice(@filename, $last, 1);
			splice(@filesize, $last, 1);
			$match = 1;
		}
		else {
			$last++;
		}
	}
}

sub readmydir {
	# globals: @subdir, @b
	my ($a) = @_;
	my $size;

	if (-d $a) {
		# is-subdir
		my @dir;
		$size = 0;

		if ($visited{$a}) {
			print "\n\t(", &print_filename($a), " done)";
		}
		else {
			print "\n\tread:\t", &print_filename($a), " \t";
			opendir(DIR,$a);
			@dir = readdir(DIR);
			closedir DIR;
			foreach(@dir) {
				if (($_ ne ".") and ($_ ne "..")) {
					$item = "$a$join$_";
					$size += readmydir ($item);
				}
			}
			push @subdir, "$size.$#dir$separator$a$join";
			$visited{$a} = 1;
			return $size;
		}
	} 
	else {
		# is-file
		$size = -s $a;
		if ($k++ > $dots) { print "."; $k = 0; }
		push @b, "$size$separator$a";
		return $size;
	}
}

sub compare_file {
	# globals: @filesize, @filename
	my ($this, $other) = @_;
	return -2 if ($this == $other);
	return  1 if ($filesize[$this] != $filesize[$other]);
	return -3 if ($filesize[$this] < 1);
	return -4 if ($filename[$this] eq $filename[$other]);
	return compare ($filename[$this], $filename[$other]);
}

sub compare_file_zero {
	# globals: @filesize, @filename
	my ($this, $other) = @_;
	return -2 if ($this == $other);
	return  1 if ($filesize[$this] != $filesize[$other]);
	return  0 if ($filesize[$this] < 1);
	return -4 if ($filename[$this] eq $filename[$other]);
	return compare ($filename[$this], $filename[$other]);
}

sub compare_dir {
	# globals: @dirname, @dirsize, @filename
	my ($this, $other) = @_;
	my (@files_this, @files_other, $tmp, $tmp_this, $tmp_other);
	return -2 if ($this == $other);
	return  1 if ($dirsize[$this] != $dirsize[$other]);
	return -3 if ($dirsize[$this] < 1);
	return -4 if ($dirname[$this] eq $dirname[$other]);
	return -5 if
		( substr($dirname[$other], 0, length($dirname[$this]))  eq $dirname[$this]) or
		( substr($dirname[$this],  0, length($dirname[$other])) eq $dirname[$other]);
	# get list of files in subdirs
		$tmp_this = $dirname[$this];
		$tmp_other = $dirname[$other];
		@files_this = ();
		@files_other = ();
		foreach(0 .. $#filename) {
			$tmp = substr($filename[$_], 0, length($tmp_this));
			if ($tmp eq $tmp_this) {
				push @files_this, $_;
			}
			$tmp = substr($filename[$_], 0, length($tmp_other));
			if ($tmp eq $tmp_other) {
				push @files_other, $_;
			}
		}
	# compare files
		CMPFILE: foreach $this (0 .. $#files_this) {
			foreach $other (0 .. $#files_other) {
				if ( &compare_file_zero ($files_this[$this], $files_other[$other]) == 0 ) {
					# match
					splice(@files_other, $other, 1);
					next CMPFILE;
				}
			}
			# no match
			return 1;
		}
	# report
		return 0;
}

sub print_filename {
    if ($^O =~ /win32/i) { 
        my $a = $_[0];
		$a =~ tr|\/|\\|;
		return $a;
    }
	return $_[0];
}

1;
