Re: [RFC] globmatch() helper function

From: Casey Dahlin
Date: Thu Dec 18 2008 - 14:53:40 EST


George Spelvin wrote:
Eureka!

Never mind all of this angst; I've figured out a non-recursive way
to do it. Thanks for pushing me to think a bit harder and find it.
(I was actually looking for an example of the exponential pathology I
kept referring to when it dawned on me that it's actually impossible
without the additional expressive power of regexps.)

Somewhat abusing TeX's line-breaking terminology, consider a shell glob to
consist of a series of "boxes" and "glue" A box is a run of character
classes (of which literal characters and ? are degenerate cases). The
point is, a box has a well-defined width, the number of characters in the
string that it must match.
Yours is better, but I may as well post my solution:

--code starts--
#include <stdbool.h>
#include <string.h> /* For strchr */

#ifndef __pure
/* Kernel compatibility #define */
#define __pure __attribute__((pure))
#endif

/**
* is_in_class - globmatch() helper, returns true if character is in class.
* @c: Character to be tested.
* @pat: Character class to test for inclusion in, terminated by ']'.
*
* Evaluate the "body" of a character class. Given the class "[!a-z]",
* this expects @pat to point to the a, and proceeds until the closing ].
* The caller must skip the opening bracket and the complement character.
*
* The caller must verify that a terminating ] exists; this will NOT
* stop at a trailing nul byte. Also, the first character may be ];
* that is not taken as a terminator.
*
* Comparison is done using unsigned bytes, 0...255.
*/
static bool __pure
is_in_class(unsigned char c, char const *pat)
{
/*
* Iterate over each span in the character class.
* a span is either a single character x, or a
* range x-y.
*/
do {
unsigned char c1 = *pat++, c2 = c1;

if (pat[0] == '-' && pat[1] != ']') {
c2 = pat[1];
pat += 2;
}
/* Any special action if c1 > c2? */
if (c1 <= c && c <= c2)
return true;
} while (*pat != ']');
return false;
}

/**
* given a pattern, find the minimum string length that would match it
**/
static int __pure
patsize(char const *pat)
{
int retval = 0;
int grouplen = 0;

for(;*pat;pat++) {
if (*pat == '*') {
continue;
} else if ((grouplen > 2 || (grouplen == 2 && *(pat-1) != '^' &&
*(pat-1) != '!')) && *pat == ']') {
grouplen = 0;
} else if (grouplen) {
grouplen++;
} else {
if (*pat == '[') {
grouplen++;
}
retval++;
}
}
return retval;
}

/**
* Generates the next balls-and-boxes sequence of integer pattern widths
*/
static bool
next_arrangement(int *buckets, int found, int total)
{
int i;
bool retval = false;

if (!found || total == 1)
return false;

for (i = found; i < total-1; i++) {
buckets[total-1] += buckets[i];
buckets[i] = 0;
}

for (i = total - 2; i >= 0; i--) {
if (!buckets[i])
continue;
retval = true;
buckets[i]--;
buckets[i+1]++;
if ((i < total-2) && total > 2) {
buckets[i+1] += buckets[total-1];
buckets[total-1] = 0;
}
break;
}

return retval;
}

/**
* globmatch - Shell-style pattern matching, like !fnmatch(pat, str, 0)
* @pat: Pattern to match. Metacharacters are ?, *, [ and \.
* @str: String to match. the pattern must match the entire string.
*
* Perform shell-style glob matching, returning true if the match succeeds.
* Like !fnmatch(@pat, @str, 0) and unlike the shell, this does NOT treat / or
* leading . specially; it isn't actually used for pathnames.
*
* This is small and simple implementation intended for device blacklists
* where the pattern is part of the kernel. It recurses on each * in
* the pattern (although the stack frame is small), so shouldn't be
* fed pathological patterns with many *s.
*/
static bool __pure
globmatch(char const *in_pat, char const *in_str)
{
int star_width[10] = { 0,0,0,0,0,0,0,0,0,0 };
int stars_found = 0;
int stars_total = 0;
char const *str = in_str;
char const *pat = in_pat;
int i;

char const *loc;
for (loc = pat; *loc; stars_total += (*(loc++) == '*'));
if (stars_total > 10)
return false;

for (loc = str; *loc; loc++, star_width[0]++);
star_width[0] -= patsize(pat);

if (star_width[stars_total-1] < 0)
return false;

/*
* Loop over each token (character or class) in pat, matching
* it against the remaining unmatched tail of str. Return false
* on mismatch, or true after matching the trailing nul bytes.
*
*/
repeat: do {
char const *end;
bool inv;
int i;

/*
* (To handle * properly, which may match zero bytes, each case is
* required to increment the str pointer itself.
*/
switch (pat[0]) {
case '?':
if (!*str++)
goto rearrange;
break;
case '*':
if (pat[1] == '\0') /* Optimize trailing * case */
return true;
for (i = 0; *str && i < star_width[stars_found];
i++, str++);
stars_found++;
break;
case '[':
/* Character class */
/* Is it an inverted character class? */
inv = (pat[1] == '^' || pat[1] == '!');
/* If no terminating ], interpret as plain text. */
end = strchr(pat+2+inv, ']');
if (!end)
goto def;
pat += 1 + inv;
if (is_in_class(*str++, pat) == inv)
goto rearrange;
pat = end;
break;
case '\\':
pat++;
/* FALLLTHROUGH */
default:
def:
if (*pat != *str++)
goto rearrange;
break;
}
} while (*pat++);
return true;

rearrange:
if (next_arrangement(star_width, stars_found, stars_total)) {
stars_found = 0;
str = in_str;
pat = in_pat;
goto repeat;
}

return false;
}

/* Test code */
#include <stdio.h>
#include <stdlib.h>

static void
test(char const *pat, char const *str, bool expected)
{
bool match = globmatch(pat, str);
printf("\"%s\" vs. \"%s\": %s %s\n", pat, str, match ? "match" : "mismatch",
match == expected ? "OK" : "*** ERROR ***");
/*if (match != expected)
exit(1);*/
}

int
main(void)
{
test("a", "a", true);
test("a", "b", false);
test("a", "aa", false);
test("a", "", false);
test("", "", true);
test("", "a", false);
test("?", "a", true);
test("?", "aa", false);
test("??", "a", false);
test("?x?", "axb", true);
test("?x?", "abx", false);
test("?x?", "xab", false);
test("*??", "a", false);
test("*??", "ab", true);
test("*??", "abc", true);
test("??*", "a", false);
test("??*", "ab", true);
test("??*", "abc", true);
test("?*?", "a", false);
test("?*?", "ab", true);
test("?*?", "abc", true);
test("*b", "b", true);
test("*b", "ab", true);
test("*b", "aab", true);
test("*bc", "abbc", true);
test("*bc", "bc", true);
test("*bc", "bbc", true);
test("*abcd*", "abcabcabcabcdefg", true);
test("*abcd*", "abcabcabcabcefg", false);
test("*abcd*q*r*", "abcabcabcabcdefgqabrcd", true);
test("[a]", "a", true);
test("[^a]", "a", false);
test("[!a]", "a", false);
test("[!a]", "b", true);
test("[ab]", "a", true);
test("[ab]", "b", true);
test("[ab]", "c", false);
test("[^ab]", "c", true);
test("[a-c]", "b", true);
test("[a-c]", "d", false);
test("[]a-ceg-ik[]", "a", true);
test("[]a-ceg-ik[]", "]", true);
test("[]a-ceg-ik[]", "[", true);
test("[]a-ceg-ik[]", "h", true);
test("[]a-ceg-ik[]", "f", false);
test("[!]a-ceg-ik[]", "h", false);
test("[!]a-ceg-ik[]", "]", false);
test("[!]a-ceg-ik[]", "f", true);

return 0;
}

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/