Post by Robert PrinsHi all,
http://www.algorytm.org/przetwarzanie-tekstu/algorytm-bm-boyer-moorea/bm-d.html
but neither of them seems to work correctly, the masm32 version is
unable to find one particular string, the Pascal version only finds 9
occurrences of one particular (other) string.
Does anyone have the mixed source/assembler output of a working 32-bit
version written in C?
Robert
Here's some random code I wrote in 1998, its main calim to fame is that
it should allow you to search in a case-independent way inside unicode
strings, while still using very little setup/&table memory.
It works by generating two copies of the search string, forcing one to
UPPERCASE_ONLY and the other to lowercase_only, then when verifying a
match is is OK if at least one of the forms fits at any given position.
This means that it works OK even if the function convert to/from
upper/lower case is complicated & slow.
Terje
#include <stddef.h> // wchar_t definition
#include <windows.h> // To get the monocasing functions!
#define _memuprw(buf, len) CharUpperBuffW(buf, len)
#define _memlwrw(buf, len) CharLowerBuffW(buf, len)
#define _hashw(h) (h & 255) // _very_ simplistic hash from Unicode to
[0..255]
#define _memupra(buf, len) CharUpperBuffA((char *) buf, len)
#define _memlwra(buf, len) CharLowerBuffA((char *) buf, len)
typedef unsigned char achar_t;
class wboyer_moore {
private:
wchar_t *lPattern;
wchar_t *uPattern;
wchar_t *lPatEnd;
wchar_t *uPatEnd;
unsigned plen;
bool ignorecase;
unsigned char skiptable[256];
bool init(wchar_t *pattern, unsigned len, bool icase);
public:
wboyer_moore(wchar_t *pattern, unsigned len, bool icase);
~wboyer_moore(void);
wchar_t *search(wchar_t *buf, unsigned buflen);
};
bool wboyer_moore::init(wchar_t *pattern, unsigned len, bool icase)
{
plen = len;
ignorecase = icase;
lPattern = uPattern = NULL;
if (len == 0) return true;
lPattern = new wchar_t [len];
if (!lPattern) return false;
memcpy(lPattern, pattern, len * sizeof(wchar_t));
uPattern = lPattern;
if (icase) {
uPattern = new wchar_t [len];
if (!uPattern) return false;
memcpy(lPattern, pattern, len * sizeof(wchar_t));
_memlwrw(lPattern, len);
_memuprw(uPattern, len);
}
if (len > 1) {
unsigned l, skip, p;
lPatEnd = lPattern + len - 1;
uPatEnd = uPattern + len - 1;
l = min(len, 255);
memset(skiptable, l, sizeof(skiptable));
for (skip = l - 1, p = plen - len; skip--; p++) {
skiptable[_hashw(lPattern[p])] = skiptable[_hashw(uPattern[p])] = skip;
}
}
return true;
}
wboyer_moore::wboyer_moore(wchar_t *pattern, unsigned len, bool icase)
{
init(pattern, len, icase);
}
wboyer_moore::~wboyer_moore(void)
{
if (lPattern) delete lPattern;
if (ignorecase && uPattern) delete uPattern;
}
wchar_t *wboyer_moore::search(wchar_t *buf, unsigned buflen)
{
if (buflen < plen) return NULL; // Buffer too shart, cannot match
wchar_t *bufend = buf + buflen;
if (plen <= 1) {
if (plen == 0) return buf; // An empty search pattern will always match
wchar_t l = lPattern[0];
if (ignorecase) {
wchar_t u = uPattern[0];
do {
wchar_t w;
if (((w = *buf) == l) || (w == u)) return buf;
} while (++buf < bufend);
return NULL;
}
// case-sensitive search for a single char, could use _wmemchr()!
do {
if (*buf == l) return buf;
} while (++buf < bufend);
return NULL;
}
// The search pattern is at least two characters long, so use the skip
table:
wchar_t *b = buf + plen - 1; // Starting position for end of pattern
do { // Repeat until the end of the search buffer
unsigned skip;
do { // repeat until we find a possible matching endpoint
skip = skiptable[_hashw(*b)];
b += skip; // Guaranteed safe skip length
if (b >= bufend) return NULL;
} while (skip);
// We have a possible match on the endpoint, check the full pattern:
wchar_t *pl = lPatEnd, *pu = uPatEnd, *pb = b, w;
for (unsigned l = plen; ( (w = *pb) == *pl) || (w == *pu); pb--, pl--,
pu--) {
if (--l == 0) return pb;
}
} while (++b < bufend); // Increment end pointer and try again
return NULL;
}
class aboyer_moore {
private:
achar_t *lPattern;
achar_t *uPattern;
achar_t *lPatEnd;
achar_t *uPatEnd;
unsigned plen;
bool ignorecase;
unsigned char skiptable[256];
bool init(achar_t *pattern, unsigned len, bool icase);
public:
aboyer_moore(achar_t *pattern, unsigned len, bool icase);
~aboyer_moore(void);
achar_t *search(achar_t *buf, unsigned buflen);
};
bool aboyer_moore::init(achar_t *pattern, unsigned len, bool icase)
{
plen = len;
ignorecase = icase;
lPattern = uPattern = NULL;
if (len == 0) return true;
lPattern = new achar_t [len];
if (!lPattern) return false;
memcpy(lPattern, pattern, len * sizeof(achar_t));
uPattern = lPattern;
if (icase) {
uPattern = new achar_t [len];
if (!uPattern) return false;
memcpy(lPattern, pattern, len * sizeof(achar_t));
_memlwra(lPattern, len);
_memupra(uPattern, len);
}
if (len > 1) {
unsigned l, skip, p;
lPatEnd = lPattern + len - 2;
uPatEnd = uPattern + len - 2;
l = min(len, 255);
memset(skiptable, l, sizeof(skiptable));
for (skip = l - 1, p = plen - len; skip--; p++) {
skiptable[lPattern[p]] = skiptable[uPattern[p]] = skip;
}
}
return true;
}
aboyer_moore::aboyer_moore(achar_t *pattern, unsigned len, bool icase)
{
init(pattern, len, icase);
}
aboyer_moore::~aboyer_moore(void)
{
if (lPattern) delete lPattern;
if (ignorecase && uPattern) delete uPattern;
}
achar_t *aboyer_moore::search(achar_t *buf, unsigned buflen)
{
if (buflen < plen) return NULL; // Buffer too shart, cannot match
achar_t *bufend = buf + buflen;
if (plen <= 1) {
if (plen == 0) return buf; // An empty search pattern will allways match
achar_t l = lPattern[0];
if (ignorecase) {
achar_t u = uPattern[0];
do {
achar_t a;
if (((a = *buf) == l) || (a == u)) return buf;
} while (++buf < bufend);
return NULL;
}
// case-sensitive search for a single char, could use _memchr()!
do {
if (*buf == l) return buf;
} while (++buf < bufend);
return NULL;
}
// The search pattern is at least two characters long, so use the skip
table:
achar_t *b = buf + plen - 1; // Starting position for end of pattern
do { // Repeat until the end of the search buffer
unsigned skip;
do { // repeat until we find a matching endpoint
skip = skiptable[*b];
b += skip; // Guaranteed safe skip length
if (b >= bufend) return NULL;
} while (skip);
// We have a match on the endpoint, check the rest of the pattern:
achar_t *pl = lPatEnd, *pu = uPatEnd, *pb = b - 1, a;
for (unsigned l = plen; ( (a = *pb) == *pl) || (a == *pu); pb--, pl--,
pu--) {
if (--l == 0) return pb;
}
} while (++b < bufend); // Increment end pointer and try again
return NULL;
}
int main(int, char **)
{
return 0;
}
--
- <Terje.Mathisen at tmsw.no>
"almost all programming can be viewed as an exercise in caching"