stringSearch.command.c
/****************************************************************************************************/
/*                                                                                                  */
/* stringSearch.command.c:                                                                          */
/*                                                                                                  */
/****************************************************************************************************/

/****************************************************************************************************/
/*                                                                                                  */
/*     Copyright (C) 2004 Joerg Kunze                                                               */
/*                                                                                                  */
/*     This file is part of siliconBrain.                                                           */
/*                                                                                                  */
/*     siliconBrain is free software; you can redistribute it and/or modify                         */
/*     it under the terms of the GNU General Public License as published by                         */
/*     the Free Software Foundation; either version 2 of the License, or                            */
/*     (at your option) any later version.                                                          */
/*                                                                                                  */
/*     siliconBrain is distributed in the hope that it will be useful,                              */
/*     but WITHOUT ANY WARRANTY; without even the implied warranty of                               */
/*     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the                                */
/*     GNU General Public License for more details.                                                 */
/*                                                                                                  */
/*     You should have received a copy of the GNU General Public License                            */
/*     along with this program; if not, write to the Free Software                                  */
/*     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA                    */
/*                                                                                                  */
/****************************************************************************************************/
#include <stdio.h>
#include <limits.h>

#include "siliconBrainSpecification"
#include "stringSearch"

const char *stringSearchSiliconBrainRelease       = "$siliconBrainRelease: 0.2.3 $";
const char *stringSearchSiliconBrainRcsIdentifier = "$Id: stringSearch.command.c,v 1.8 2004/12/14 23:31:26 joerg Exp $";
const char *stringSearchSiliconBrainSaveStamp     = "$siliconBrainSaveStamp: 2004/12/14 20:37:11, Joerg Kunze$";

// try to beat grep -F --mmap
// time grep -F --mmap "Joerg liebt Ute" ~/big.data                   real    0m0.091s, user    0m0.090s, sys     0m0.000s
// time grep -F --mmap "Joerg liebt Ute" ~/jk.test                    real    0m0.001s, user    0m0.000s, sys     0m0.000s

// stringSearch --complete --expression="Joerg liebt Ute" ~/big.data  real    0m0.102s, user    0m0.120s, sys     0m0.020s
// stringSearch --complete --expression="Joerg liebt Ute" ~/jk.test   real    0m0.001s, user    0m0.000s, sys     0m0.000s

/* Boyer-Moore-Horspool Algorithm */

typedef struct {
   String  expression;
   char   *lastExpression;
   bool    expressionMatches;
   ssize_t remainingLength;
   size_t  jump[ UCHAR_MAX ];
} Context;

extern void *stringSearchOpen( const stringSearchOptions *options ) {
    Context *context = malloc( sizeof( Context ) );

    context->expression        = string( options->expression );
    context->lastExpression    = context->expression.begin + context->expression.length - 1;
    context->expressionMatches = 0;
    context->remainingLength   = 0;

    unsigned char *expression = (unsigned char *)context->expression.begin;
    size_t         jumpLength = context->expression.length;

    unsigned int i;

    for( i = 0; i <= UCHAR_MAX; ++i )
       context->jump[ i ] = jumpLength;

    while( expression < (unsigned char *)context->lastExpression )
       context->jump[ *expression++ ] = --jumpLength;

    return context;
}

extern CommandReturnCode stringSearchClose( const stringSearchOptions *options, void *context ) {
   CommandReturnCode returnCode = ((Context *)context)->expressionMatches ? commandOk : commandError;
   free( context );
   return returnCode;
}

/* TODO
- Extensively comment the source and beuatify it
- if needle is larger than source, but can span several sources:
     { echo 'ab'; sync; echo 'ab'; sync; echo 'ab'; sync; echo 'ab'; sync; echo 'ab'; sync; echo 'ab'; sync; echo 'cdyabcdyy'; } | stringSearch --expression=$'ab\nab\nab\ncd'
*/

extern CommandReturnCode stringSearch( const stringSearchOptions *options, void *commandData, String source ) {

   Context *context = commandData;

            char *lastExpression = context->lastExpression;
   register char  lastCharacter  = *lastExpression;
   register char *lastSource     = source.begin + source.length - 1;

   register char *lastTrial      = (source.begin + context->expression.length) - 1;

   if( context->remainingLength ) {

      ssize_t remainingLength = context->remainingLength;

      char *remainingFirst = context->expression.begin;

      lastTrial -= remainingLength;

      while( (remainingLength > 0 ) && (lastTrial <= lastSource) ) {
         if( *lastTrial == *lastExpression ) {
            if(
               !memcmp( context->expression.begin, remainingFirst, remainingLength ) &&
               !memcmp( context->expression.begin + remainingLength, lastTrial - (context->expression.length - remainingLength) + 1, (context->expression.length - remainingLength) )
            ) {
               context->expressionMatches = 1;
               return commandStop;
            }
         }
         lastTrial       += context->jump[ (unsigned char)*lastTrial ];
         remainingFirst  += context->jump[ (unsigned char)*lastTrial ];
         remainingLength -= context->jump[ (unsigned char)*lastTrial ];
      }
   }

   while( lastTrial <= lastSource ) {
      if( *lastTrial == lastCharacter ) {
         if( !memcmp( context->expression.begin, lastTrial - context->expression.length + 1, context->expression.length ) ) {
            context->expressionMatches = 1;
            return commandStop;
         }
      }

      lastTrial += context->jump[ (unsigned char)*lastTrial ];
   }

   /*-----------------------------------------------------------------------------------------------------------------------*/
   /* if we are here, it could be that a starting part of the needle matches the rest of the haystack. This occurrs only at */
   /* the end of the buffer, so we allow for a brute force method.                                                          */
   /* We use a last time the jump table information, to know where to start. In the line buffered case, where the last      */
   /* cahr is a '\n' always, this is a good optimazation for when there is no \n in the needle, which in turn is normal.    */
   /* In the fully buffered case we are seldom here and thus need no optimazation.                                          */
   /*-----------------------------------------------------------------------------------------------------------------------*/
   char * firstTrial2 = lastSource + context->jump[ (unsigned char)*lastSource ];
   char * firstTrial  = firstTrial2 > lastTrial ? firstTrial2 : lastTrial;

   firstTrial -= context->expression.length + 1;

   if( firstTrial > lastSource ) return commandOk;

   size_t remainingLength;

   for( remainingLength = lastSource - firstTrial + 1; remainingLength; --remainingLength, ++firstTrial ) {
      if( !memcmp( context->expression.begin, firstTrial, remainingLength ) ) {
         context->remainingLength = remainingLength;
         return commandOk;
      }
   }

   return commandOk;
}

/*
$Log: stringSearch.command.c,v $
Revision 1.8  2004/12/14 23:31:26  joerg
published for new release 0.2.3

Revision 1.7  2004/12/14 23:17:05  joerg
published for new release 0.2.2

Revision 1.6  2004/12/14 22:20:09  joerg
pipe output of make command into stringSearch c-function
without creating a process for stringSearch. The later is used as a C-function command, not as a main.

Revision 1.5  2004/12/12 11:37:48  joerg
allow for multiple buffer spanning needles.

Revision 1.4  2004/12/02 22:45:23  joerg
Correct and test error when last character of needel apears somewhere else in needle additionally.
And prepare (already test) for buffer spanning expresions.

Revision 1.3  2004/11/30 22:39:58  joerg
Handle return code (exit status) of commands, and stop file processing if indicated by command (if not a pipe, which would block eventually).

Revision 1.2  2004/11/30 00:17:59  joerg
First implementation of Boyer-Moore-Horspool, with no second buffer

Revision 1.1  2004/11/29 00:11:57  joerg
first not really running version of a string search algorithm

*/