#! /usr/bin/perl
# find duplicate words on one line
# also finds repeated last word on lineN and first word of lineN+1
#
# For source files (vs. Docs), drop a leading " *" in case it might be
# kernel-doc notation.  This would facilitate catching repeated words
# at the end of one line and the beginning of the next line, after the " *".

# TBD: print entire offending line(s) when a repeated word is found.
# can this use an environment variable instead of ARGV?

use strict;
use warnings;

our $VERSION = "v0.2";

# Print the program name and version, then exit with status 1.
sub usage {
    print "find_dup_words {$VERSION}\n";
    exit 1;
}

# Test for integer number or hex number (0x0-9a-f).
# Returns 1 when $var is an optionally signed decimal integer or a
# 0x-prefixed hex number (case-insensitive), otherwise 0.
sub is_numeric {
    my ($var) = @_;

    return 1 if ($var =~ /^[+-]?\d+$/);
    return 1 if ($var =~ /^0x[0-9A-F]+$/i);
    return 0;
}

# Returns 1 when $var contains any character other than an ASCII letter,
# an ASCII digit or a space, otherwise 0.
sub is_special_chars {
    my ($var) = @_;

    # The previous pattern, /[^[a-zA-Z0-9 ]]*/, put a stray '[' inside the
    # negated class, so '[' was not counted as a special character.
    return 1 if ($var =~ /[^a-zA-Z0-9 ]/);
    return 0;
}

# Print one finding as "file:line: 'word1' <op> 'word2'".
#   $file      - name of the file being scanned
#   $line      - line number on which $word2 was found
#   $crossline - true when $word1 is the last word of the previous line;
#                printed as "/=", while a same-line repeat prints "=="
#   $word1     - first word of the repeated pair
#   $word2     - second word of the repeated pair
sub report_words {
    my ($file, $line, $crossline, $word1, $word2) = @_;

    my $crossing = $crossline ? "/=" : "==";
    print "$file:$line: '$word1' $crossing '$word2'\n";
    return;
}

# Debug helper: print a line number, its word count and the words.
#   $line - line number
#   $mx   - number of words
#   @wrds - the words themselves
sub dump_line_words {
    my ($line, $mx, @wrds) = @_;

    print "## $line: #wrds=$mx: ";
    print "@wrds\n";
    return;
}

# Returns true when the two words are equal ignoring case and the word is
# neither numeric nor contains special characters.
#
# note: using /m/ matches succeed on subsets,
# e.g., "this" matches "is". Not good.
# So lc(word1) eq lc(word2) is used instead.
sub _is_reportable_dup {
    my ($word1, $word2) = @_;

    return 0 if lc($word1) ne lc($word2);
    return 0 if is_numeric($word2) || is_special_chars($word2);
    return 1;
}

# Scan every line readable from $fh and report repeated words, both within
# a line and across the end of one line and the start of the next.
#   $fh     - open input filehandle
#   $infile - file name used as the prefix of each report
sub _scan_handle {
    my ($fh, $infile) = @_;

    my $line_num  = 0;
    my $last_word = "";

  LINE:
    while (my $line = <$fh>) {
        $line_num++;
        chomp $line;
        # A completely empty line is skipped and keeps $last_word, so a
        # repeat across a blank line is still reported.
        next LINE if $line eq "";

        # drop common punctuation: period, comma, qmark, semi-colon, colon
        $line =~ tr/.,;:?//d;

        my @words = split(/\s+/, $line);

        # For a line that begins with " * foobar() does soandso.",
        # words[0] is "" and words[1] eq "*", so ignore both of them.
        shift @words if @words && $words[0] eq "";
        shift @words if @words && $words[0] eq "*";

        # Nothing left (whitespace, punctuation or a bare " *"): there is
        # no word to carry over to the next line.
        if (!@words) {
            $last_word = "";
            next LINE;
        }

        ##dump_line_words($line_num, scalar @words, @words);

        # last word of the previous line vs. first word of this line
        if (_is_reportable_dup($last_word, $words[0])) {
            report_words($infile, $line_num, 1, $last_word, $words[0]);
        }

        # adjacent words within this line
        for my $ix (1 .. $#words) {
            if (_is_reportable_dup($words[$ix - 1], $words[$ix])) {
                report_words($infile, $line_num, 0,
                    $words[$ix - 1], $words[$ix]);
            }
        }

        # Always take the real last word.  The old code indexed with a
        # loop counter that was never updated for one-word lines, so it
        # picked up a stale index from an earlier line.
        $last_word = $words[-1];
    }
    return;
}

# main: scan each file named on the command line and print a separator
# line after each one.  With no arguments, or with -h / --help as the
# first argument, print the version and exit 1.
sub main {
    my @args = @_;

    if (!@args || $args[0] eq "-h" || $args[0] eq "--help") {
        usage();
    }

    foreach my $infile (@args) {
        # Three-argument open: the name is always treated as a plain
        # file to read, never as a mode or pipe specification.
        open(my $fh, '<', $infile)
            or die "cannot open '$infile': $!\n";
        _scan_handle($fh, $infile);
        close $fh;
        print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n";
    } # end for all infiles
    return 0;
}

# Run only when executed as a script, so the subs above can be loaded
# (require/do) without triggering the command-line handling.
main(@ARGV) unless caller;

1;
# end find_dup_words.pl;