#! /usr/bin/perl
# find duplicate words on one line
# also finds repeated last word on lineN and first word of lineN+1
#
# For source files (vs. Docs), drop a leading " *" in case it might be
# kernel-doc notation. This would facilitate catching repeated words
# at the end of one line and the beginning of the next line, after the " *".

# TBD: print entire offending line(s) when a repeated word is found.
# can this use an environment variable instead of ARGV?

# Version string; usage() prints it inside braces.
our $VERSION = "v0.2";

# File-scoped working variables for the main loop at the bottom of the script.
my (
	$infile,	# name of the input file being scanned
	$line,		# text of the input line being examined
	$line_num,	# line counter, restarted for every input file
	$last_word,	# word carried over for the cross-line comparison
	$ix,		# index into the current line's word list
	$last_ix,	# index most recently visited by the word loop
);

# Print the one-line usage banner (program name, expected arguments and
# version) on standard output, then terminate with exit status 1.
sub usage()
{
	my $banner = "find_dup_words <filenames>  {$VERSION}\n";

	print $banner;
	exit(1);
}

# Test whether a token is an integer or a hexadecimal number.
#   argument: the token to test
#   returns:  1 for an optionally signed decimal integer ("42", "-7", "+3")
#             or a "0x"-prefixed hex number (hex digits in either case),
#             0 for anything else.
# The token is held in a lexical so no package global is modified, and the
# patterns are anchored with \A and \z so that a trailing newline is not
# silently accepted the way it is with '$'.
sub is_numeric($)
{
	my ($token) = @_;

	return 1 if ($token =~ /\A[+-]?\d+\z/);
	return 1 if ($token =~ /\A0x[0-9A-F]+\z/i);
	return 0;
}

# Test whether a token contains any "special" character, i.e. any character
# other than an ASCII letter, an ASCII digit or a space.
#   argument: the token to test
#   returns:  1 if at least one such character is present, otherwise 0
#             (so the empty string yields 0).
# The character class is written as a plain negated set.  Nesting brackets
# inside it ("[^[a-z...]]*") makes '[' a member of the excluded set, so a
# token such as "[" is wrongly treated as an ordinary word.
sub is_special_chars($)
{
	my ($token) = @_;

	return 1 if ($token =~ /[^a-zA-Z0-9 ]/);
	return 0;
}


# Print one report line for a pair of repeated words, in the form
#   <file>:<line>: '<word1>' <op> '<word2>'
# Arguments, in order:
#   file name
#   line number
#   cross-line flag: true when the pair spans two lines
#   first word of the pair
#   second word of the pair
# <op> is "/=" when the cross-line flag is true and "==" otherwise.
# All working values are lexical: assigning to a bare $line here would
# overwrite the script's file-scoped $line, which holds the input line.
sub report_words($$$$$)
{
	my ($file, $line, $crossline, $word1, $word2) = @_;
	my $crossing = $crossline ? "/=" : "==";

	print "$file:$line: '$word1' $crossing '$word2'\n";
}

# Debugging aid: print the line number, the word count and the words of one
# input line, in the form
#   ## <line number>: #wrds=<count>: <words separated by spaces>
# Arguments, in order:
#   line number to display
#   number of words
#   the words themselves (a list)
# The prototype ends in '@' so that an array passed as the last argument
# stays in list context; a '$' in that position would pass its element
# count instead of its contents.  The line number printed is the one passed
# in, and all working values are lexical so that no outer variable is
# read or overwritten.
sub dump_line_words($$@)
{
	my ($num, $count, @wrds) = @_;

	print "## $num: #wrds=$count: ";
	print "@wrds\n";
}

# main:
#
# Scan every file named on the command line and report adjacent repeated
# words, compared case-insensitively: pairs within one line, and the last
# word of one line followed by the first word of the next.  A pair is not
# reported when is_numeric() or is_special_chars() flags the word.  A row
# of tildes is printed after each file.

if (@ARGV == 0 || $ARGV[0] eq "-h" || $ARGV[0] eq "--help") {
	usage();
}

foreach my $infile (@ARGV)
{
	# Three-argument open on a lexical handle: the name is always taken
	# as a plain file to read, even if it begins or ends with a
	# character such as '>' or '|' that two-argument open would
	# interpret as a mode or a command.
	open(my $in_fh, '<', $infile) or die "cannot open '$infile': $!\n";
	$line_num = 0;
	$last_word = "";

LINE:
	while (my $line = <$in_fh>) {
		$line_num++;
		chomp $line;
		# A completely empty line is skipped without touching
		# $last_word, so a repeat across it is still detected.
		next LINE if $line eq "";

		# drop common punctuation: period, comma, qmark, semi-colon, colon
		$line =~ tr/.,;:?//d;

		my @words = split(/\s+/, $line);

		# For a line that begins with " * foobar() does soandso.",
		# words[0] is "" and words[1] eq "*", so ignore both of them.
		shift @words if (@words && $words[0] eq "");
		shift @words if (@words && $words[0] eq "*");

		# Nothing left (only whitespace, punctuation or a lone "*"):
		# there is no word to compare, and the cross-line chain ends.
		if (!@words) {
			$last_word = "";
			next LINE;
		}

		##dump_line_words($line_num, scalar @words, @words);
		##print "@words\n";

		# Last word of the previous line against first word of this one.
		if (lc($last_word) eq lc($words[0])
		    && !is_numeric($last_word)
		    && !is_special_chars($last_word)) {
			report_words($infile, $line_num, 1, $last_word, $words[0]);
		}

		# note: using /m/ matches succeed on subsets,
		# e.g., "this" matches "is". Not good.
		# So whole words are compared with lc(word1) eq lc(word2).
		for my $ix (1 .. $#words) {
			my ($prev, $curr) = @words[$ix - 1, $ix];

			next if lc($prev) ne lc($curr);
			next if (is_numeric($curr) || is_special_chars($curr));
			report_words($infile, $line_num, 0, $prev, $curr);
		}

		# Take the final word straight from the list.  An index saved
		# by the loop above would be stale on a one-word line, where
		# the loop body never runs.
		$last_word = $words[-1];
	} # end one infile

	close $in_fh;
	print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n";

} # end for all infiles

# end find_dup_words.pl;
