/****************************************************************************************************/
/* */
/* stringSearch.command.c: */
/* */
/****************************************************************************************************/
/****************************************************************************************************/
/* */
/* Copyright (C) 2004 Joerg Kunze */
/* */
/* This file is part of siliconBrain. */
/* */
/* siliconBrain is free software; you can redistribute it and/or modify */
/* it under the terms of the GNU General Public License as published by */
/* the Free Software Foundation; either version 2 of the License, or */
/* (at your option) any later version. */
/* */
/* siliconBrain is distributed in the hope that it will be useful, */
/* but WITHOUT ANY WARRANTY; without even the implied warranty of */
/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
/* GNU General Public License for more details. */
/* */
/* You should have received a copy of the GNU General Public License */
/* along with this program; if not, write to the Free Software */
/* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
/* */
/****************************************************************************************************/
#include <stdio.h>
#include <limits.h>
#include "siliconBrainSpecification"
#include "stringSearch"
const char *stringSearchSiliconBrainRelease = "$siliconBrainRelease: 0.2.3 $";
const char *stringSearchSiliconBrainRcsIdentifier = "$Id: stringSearch.command.c,v 1.8 2004/12/14 23:31:26 joerg Exp $";
const char *stringSearchSiliconBrainSaveStamp = "$siliconBrainSaveStamp: 2004/12/14 20:37:11, Joerg Kunze$";
// try to beat grep -F --mmap
// time grep -F --mmap "Joerg liebt Ute" ~/big.data real 0m0.091s, user 0m0.090s, sys 0m0.000s
// time grep -F --mmap "Joerg liebt Ute" ~/jk.test real 0m0.001s, user 0m0.000s, sys 0m0.000s
// stringSearch --complete --expression="Joerg liebt Ute" ~/big.data real 0m0.102s, user 0m0.120s, sys 0m0.020s
// stringSearch --complete --expression="Joerg liebt Ute" ~/jk.test real 0m0.001s, user 0m0.000s, sys 0m0.000s
/* Boyer-Moore-Horspool Algorithm */
typedef struct {
String expression;
char *lastExpression;
bool expressionMatches;
ssize_t remainingLength;
size_t jump[ UCHAR_MAX ];
} Context;
extern void *stringSearchOpen( const stringSearchOptions *options ) {
Context *context = malloc( sizeof( Context ) );
context->expression = string( options->expression );
context->lastExpression = context->expression.begin + context->expression.length - 1;
context->expressionMatches = 0;
context->remainingLength = 0;
unsigned char *expression = (unsigned char *)context->expression.begin;
size_t jumpLength = context->expression.length;
unsigned int i;
for( i = 0; i <= UCHAR_MAX; ++i )
context->jump[ i ] = jumpLength;
while( expression < (unsigned char *)context->lastExpression )
context->jump[ *expression++ ] = --jumpLength;
return context;
}
extern CommandReturnCode stringSearchClose( const stringSearchOptions *options, void *context ) {
CommandReturnCode returnCode = ((Context *)context)->expressionMatches ? commandOk : commandError;
free( context );
return returnCode;
}
/* TODO
- Extensively comment the source and beuatify it
- if needle is larger than source, but can span several sources:
{ echo 'ab'; sync; echo 'ab'; sync; echo 'ab'; sync; echo 'ab'; sync; echo 'ab'; sync; echo 'ab'; sync; echo 'cdyabcdyy'; } | stringSearch --expression=$'ab\nab\nab\ncd'
*/
extern CommandReturnCode stringSearch( const stringSearchOptions *options, void *commandData, String source ) {
Context *context = commandData;
char *lastExpression = context->lastExpression;
register char lastCharacter = *lastExpression;
register char *lastSource = source.begin + source.length - 1;
register char *lastTrial = (source.begin + context->expression.length) - 1;
if( context->remainingLength ) {
ssize_t remainingLength = context->remainingLength;
char *remainingFirst = context->expression.begin;
lastTrial -= remainingLength;
while( (remainingLength > 0 ) && (lastTrial <= lastSource) ) {
if( *lastTrial == *lastExpression ) {
if(
!memcmp( context->expression.begin, remainingFirst, remainingLength ) &&
!memcmp( context->expression.begin + remainingLength, lastTrial - (context->expression.length - remainingLength) + 1, (context->expression.length - remainingLength) )
) {
context->expressionMatches = 1;
return commandStop;
}
}
lastTrial += context->jump[ (unsigned char)*lastTrial ];
remainingFirst += context->jump[ (unsigned char)*lastTrial ];
remainingLength -= context->jump[ (unsigned char)*lastTrial ];
}
}
while( lastTrial <= lastSource ) {
if( *lastTrial == lastCharacter ) {
if( !memcmp( context->expression.begin, lastTrial - context->expression.length + 1, context->expression.length ) ) {
context->expressionMatches = 1;
return commandStop;
}
}
lastTrial += context->jump[ (unsigned char)*lastTrial ];
}
/*-----------------------------------------------------------------------------------------------------------------------*/
/* if we are here, it could be that a starting part of the needle matches the rest of the haystack. This occurrs only at */
/* the end of the buffer, so we allow for a brute force method. */
/* We use a last time the jump table information, to know where to start. In the line buffered case, where the last */
/* cahr is a '\n' always, this is a good optimazation for when there is no \n in the needle, which in turn is normal. */
/* In the fully buffered case we are seldom here and thus need no optimazation. */
/*-----------------------------------------------------------------------------------------------------------------------*/
char * firstTrial2 = lastSource + context->jump[ (unsigned char)*lastSource ];
char * firstTrial = firstTrial2 > lastTrial ? firstTrial2 : lastTrial;
firstTrial -= context->expression.length + 1;
if( firstTrial > lastSource ) return commandOk;
size_t remainingLength;
for( remainingLength = lastSource - firstTrial + 1; remainingLength; --remainingLength, ++firstTrial ) {
if( !memcmp( context->expression.begin, firstTrial, remainingLength ) ) {
context->remainingLength = remainingLength;
return commandOk;
}
}
return commandOk;
}
/*
$Log: stringSearch.command.c,v $
Revision 1.8 2004/12/14 23:31:26 joerg
published for new release 0.2.3
Revision 1.7 2004/12/14 23:17:05 joerg
published for new release 0.2.2
Revision 1.6 2004/12/14 22:20:09 joerg
pipe output of make command into stringSearch c-function
without creating a process for stringSearch. The later is used as a C-function command, not as a main.
Revision 1.5 2004/12/12 11:37:48 joerg
allow for multiple buffer spanning needles.
Revision 1.4 2004/12/02 22:45:23 joerg
Correct and test error when last character of needel apears somewhere else in needle additionally.
And prepare (already test) for buffer spanning expresions.
Revision 1.3 2004/11/30 22:39:58 joerg
Handle return code (exit status) of commands, and stop file processing if indicated by command (if not a pipe, which would block eventually).
Revision 1.2 2004/11/30 00:17:59 joerg
First implementation of Boyer-Moore-Horspool, with no second buffer
Revision 1.1 2004/11/29 00:11:57 joerg
first not really running version of a string search algorithm
*/