Main Page   Namespace List   Class Hierarchy   Compound List   File List   Namespace Members   Compound Members  

matcher.cc

00001 /*
00002 *  Name:      matcher.cc
00003 *  Author:    Rafael Jesus Alcantara Perez
00004 *  Summary:   Regular expression matcher         
00005 *  Date:      $Date: 2003/04/14 00:18:35 $
00006 *  Revision:  $Revision: 1.1 $
00007 *
00008 *  Copyright (C) 1994-2002  Rafael Jesus Alcantara Perez <rafa@dedalo-ing.com>
00009 *
00010 *  This program is free software; you can redistribute it and/or modify
00011 *  it under the terms of the GNU General Public License as published by
00012 *  the Free Software Foundation; either version 2 of the License, or
00013 *  (at your option) any later version.
00014 *
00015 *  This program is distributed in the hope that it will be useful,
00016 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
00017 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00018 *  GNU General Public License for more details.
00019 *
00020 *  You should have received a copy of the GNU General Public License
00021 *  along with this program; if not, write to the Free Software
00022 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston,
00023 *  MA 02111-1307, USA.
00024 */
00025 
00026 #include <cctype>
00027 #include <cstdarg>
00028 #include <cstdlib>
00029 #include <cstring>
00030 #include <mpcl/text/string.hh>
00031 #include <mpcl/text/regex/matcher.hh>
00032 
00033 
00034 //
00035 //  L O C A L   M A C R O S
00036 //
00037 
//
//  Pattern tokens understood by TMatcher::scan().  They are macros (and
//  not plain constants) only so that the composed tokens below can be
//  built by string-literal concatenation and measured with sizeof.
//
#define REM_SEPARATOR           "[[:space:]]"
#define REM_SEPARATOR_PLUS      REM_SEPARATOR "+"
#define REM_SEPARATOR_ASTERISK  REM_SEPARATOR "*"
#define REM_BLANK               "[[:blank:]]"
#define REM_BLANK_PLUS          REM_BLANK "+"
#define REM_BLANK_ASTERISK      REM_BLANK "*"
#define REM_ANY_ASTERISK        "[.]*"
#define REM_STRING              "%s"
#define REM_INTEGER             "%d"
#define REM_QUOTABLE_STRING     "%q"
#define REM_TEXT                "%t"
#define REM_EOF                 "<<EOF>>"


//
//  L O C A L   V A R I A B L E S
//

namespace mpcl
{

  namespace text
  {

    namespace regex
    {

      //
      //  Token spellings.  Both the pointer and the pointee are const:
      //  nothing in this file is meant to retarget them.
      //
      static const char* const   _pkcAnyAsteriskPattern       = REM_ANY_ASTERISK;
      static const char* const   _pkcBlankAsteriskPattern     = REM_BLANK_ASTERISK;
      static const char* const   _pkcBlankPattern             = REM_BLANK;
      static const char* const   _pkcBlankPlusPattern         = REM_BLANK_PLUS;
      static const char* const   _pkcEofPattern               = REM_EOF;
      static const char* const   _pkcIntegerPattern           = REM_INTEGER;
      static const char* const   _pkcQuotableStringPattern    = REM_QUOTABLE_STRING;
      static const char* const   _pkcSeparatorAsteriskPattern = REM_SEPARATOR_ASTERISK;
      static const char* const   _pkcSeparatorPattern         = REM_SEPARATOR;
      static const char* const   _pkcSeparatorPlusPattern     = REM_SEPARATOR_PLUS;
      static const char* const   _pkcStringPattern            = REM_STRING;
      static const char* const   _pkcTextPattern              = REM_TEXT;

      //
      //  Token lengths (terminating NUL excluded).  They are taken with
      //  sizeof on the literal, so they are compile-time constants and
      //  are already valid during static initialization; a run-time
      //  strlen() would leave them at zero until this translation unit
      //  is dynamically initialized, and a zero length makes every
      //  strncmp() test against the token succeed.
      //
      static const std::size_t   _zAnyAsteriskPatternLength       = sizeof (REM_ANY_ASTERISK) - 1;
      static const std::size_t   _zBlankAsteriskPatternLength     = sizeof (REM_BLANK_ASTERISK) - 1;
      static const std::size_t   _zBlankPatternLength             = sizeof (REM_BLANK) - 1;
      static const std::size_t   _zBlankPlusPatternLength         = sizeof (REM_BLANK_PLUS) - 1;
      static const std::size_t   _zEofPatternLength               = sizeof (REM_EOF) - 1;
      static const std::size_t   _zIntegerPatternLength           = sizeof (REM_INTEGER) - 1;
      static const std::size_t   _zQuotableStringPatternLength    = sizeof (REM_QUOTABLE_STRING) - 1;
      static const std::size_t   _zSeparatorAsteriskPatternLength = sizeof (REM_SEPARATOR_ASTERISK) - 1;
      static const std::size_t   _zSeparatorPatternLength         = sizeof (REM_SEPARATOR) - 1;
      static const std::size_t   _zSeparatorPlusPatternLength     = sizeof (REM_SEPARATOR_PLUS) - 1;
      static const std::size_t   _zStringPatternLength            = sizeof (REM_STRING) - 1;
      static const std::size_t   _zTextPatternLength              = sizeof (REM_TEXT) - 1;

    }  // namespace regex

  }  // namespace text

}  // namespace mpcl

#undef REM_SEPARATOR
#undef REM_SEPARATOR_PLUS
#undef REM_SEPARATOR_ASTERISK
#undef REM_BLANK
#undef REM_BLANK_PLUS
#undef REM_BLANK_ASTERISK
#undef REM_ANY_ASTERISK
#undef REM_STRING
#undef REM_INTEGER
#undef REM_QUOTABLE_STRING
#undef REM_TEXT
#undef REM_EOF
00112 
00113 
00114 //
00115 //  C O N S T R U C T O R S
00116 //
00117 
00118 void mpcl::text::regex::TMatcher::
00119 clearDefinitions (void)
00120 {
00121   
00122   tTerminalMap.clear();
00123   
00124 }  // clearDefinitions()
00125 
00126 
00127 void mpcl::text::regex::TMatcher::
00128 clearStream (void)
00129 {
00130 
00131   if ( gLocalStream )
00132   {
00133     if ( !ptSourceIstream )
00134     {
00135       throw TNoStreamException ("bad stream", __FILE__, __LINE__);
00136     }
00137     else
00138     {
00139       delete ptSourceIstream;
00140     }
00141     ptSourceIstream = NULL;
00142     
00143     gLocalStream = false;
00144   }
00145 
00146 }  // clearStream()
00147 
00148 
00149 void mpcl::text::regex::TMatcher::
00150 define ( const char* pkcTERMINAL   ,
00151          const char* pkcDEFINITION )
00152 {
00153   
00154   tTerminalMap.bind (pkcTERMINAL, pkcDEFINITION);
00155   
00156 }  // define()
00157 
00158 
00159 bool mpcl::text::regex::TMatcher::
00160 match (const char* pkcPATTERN_STRING) const
00161 {
00162   
00163   bool   gSuccess = false;
00164   
00165   if ( !ptSourceIstream )
00166   {
00167     throw TNoStreamException ("bad stream", __FILE__, __LINE__);
00168   }
00169   else
00170   {
00171     std::ios::iostate   tInitialIostate   = ptSourceIstream->rdstate();
00172     std::streampos      tInitialStreampos = ptSourceIstream->tellg();
00173     
00174     gSuccess = ( scan (pkcPATTERN_STRING, NULL) > 0 );
00175     if ( gSuccess )
00176     {
00177       //
00178       //  Only if there are matched chars (the pointer has moved),
00179       //  then it recovers the old stream state.
00180       //
00181       ptSourceIstream->seekg (tInitialStreampos);
00182       ptSourceIstream->clear (tInitialIostate);
00183     }
00184   }
00185   return gSuccess;
00186   
00187 }  // match()
00188 
00189 
00190 void mpcl::text::regex::TMatcher::
00191 redefine ( const char* pkcTERMINAL   , 
00192            const char* pkcDEFINITION )
00193 {
00194   
00195   tTerminalMap [pkcTERMINAL] = pkcDEFINITION;
00196   
00197 }  // redefine()
00198 
00199 
00200 std::size_t mpcl::text::regex::TMatcher::
00201 scan (const char* pkcPATTERN_STRING...) const
00202 {
00203 
00204   using std::size_t;
00205   using std::streampos;
00206   using std::string;
00207   
00208   size_t   zTotalMatchedChars = 0;
00209   
00210   if ( !ptSourceIstream )
00211   {
00212     throw TNoStreamException ("bad stream", __FILE__, __LINE__);
00213   }
00214   else
00215   {
00216     //
00217     //  pyArgument:      Pointer to a string (std::string) argument.
00218     //  piArgument:      Pointer to a integer (int) argument.
00219     //  gMoreArguments:  Boolean with the successfulness of the last
00220     //                   call to va_arg().
00221     //
00222     
00223     va_list             tVa_list;
00224     int                 iNextCharacter;
00225     bool                gMoreArguments    = true;
00226     TString             yPatternInstance  = instantiate (pkcPATTERN_STRING);
00227     char*               pcPatternIterator = (char*) yPatternInstance.c_str();
00228     std::ios::iostate   tInitialIostate   = ptSourceIstream->rdstate();
00229     std::streampos      tInitialStreampos = ptSourceIstream->tellg();
00230     string*             pyArgument        = NULL;
00231     int*                piArgument        = NULL;
00232     
00233     va_start (tVa_list, pkcPATTERN_STRING);
00234     while ( *pcPatternIterator && ptSourceIstream->good() )
00235     {
00236       iNextCharacter = ptSourceIstream->peek();
00237       if ( iNextCharacter == EOF )
00238       {
00239         break;
00240       }
00241       
00242       //
00243       //  _pkcSeparatorPlusPattern
00244       //
00245       if ( !strncmp ( _pkcSeparatorPlusPattern     ,
00246                       pcPatternIterator            ,
00247                       _zSeparatorPlusPatternLength ) )
00248       {
00249         if ( !isspace (iNextCharacter) )
00250         {
00251           break;
00252         }
00253         else
00254         {
00255           ptSourceIstream->get();
00256           ++zTotalMatchedChars;
00257           while ( isspace (ptSourceIstream->peek()) )
00258           {
00259             ptSourceIstream->get();
00260             ++zTotalMatchedChars;
00261           }      
00262           pcPatternIterator += _zSeparatorPlusPatternLength;
00263           continue;
00264         }
00265       }
00266       //
00267       //  _pkcSeparatorAsteriskPattern
00268       //
00269       if ( !strncmp ( _pkcSeparatorAsteriskPattern     ,
00270                       pcPatternIterator                ,
00271                       _zSeparatorAsteriskPatternLength ) )
00272       {
00273         while ( isspace (ptSourceIstream->peek()) )
00274         { 
00275           ptSourceIstream->get();
00276           ++zTotalMatchedChars;
00277         }
00278         pcPatternIterator += _zSeparatorAsteriskPatternLength;
00279         continue;      
00280       }
00281       //
00282       //  _pkcSeparatorPattern
00283       //
00284       if ( !strncmp ( _pkcSeparatorPattern     ,
00285                       pcPatternIterator        ,
00286                       _zSeparatorPatternLength ) )
00287       {
00288         if ( !isspace (ptSourceIstream->peek()) )
00289         {
00290           break;
00291         }
00292         else
00293         {
00294           ptSourceIstream->get();
00295           ++zTotalMatchedChars;
00296           pcPatternIterator += _zSeparatorPatternLength;
00297           continue;
00298         }
00299       }
00300       //
00301       //  _pkcBlankPlusPattern
00302       //
00303       if ( !strncmp (_pkcBlankPlusPattern, pcPatternIterator, _zBlankPlusPatternLength) )
00304       {
00305         if ( ( iNextCharacter != ' ' ) && ( iNextCharacter != '\t' ) )
00306         {
00307           break;
00308         }
00309         else
00310         {
00311           ptSourceIstream->get();
00312           ++zTotalMatchedChars;
00313           iNextCharacter = ptSourceIstream->peek();
00314           while ( ( iNextCharacter == ' ' ) || ( iNextCharacter == '\t') )
00315           {
00316             ptSourceIstream->get();
00317             iNextCharacter = ptSourceIstream->peek();
00318             ++zTotalMatchedChars;
00319           }
00320           pcPatternIterator += _zBlankPlusPatternLength;
00321           continue;
00322         }
00323       }
00324       //
00325       //  _pkcBlankAsteriskPattern
00326       //
00327       if ( !strncmp ( _pkcBlankAsteriskPattern     ,
00328                       pcPatternIterator            ,
00329                       _zBlankAsteriskPatternLength ) )
00330       {
00331         while ( ( iNextCharacter == ' ' ) || ( iNextCharacter == '\t') )
00332         {
00333           ptSourceIstream->get();
00334           iNextCharacter = ptSourceIstream->peek();
00335           ++zTotalMatchedChars;
00336         }
00337         pcPatternIterator += _zBlankAsteriskPatternLength;
00338         continue;      
00339       }
00340       //
00341       //  _pkcAnyAsteriskPattern
00342       //
00343       if ( !strncmp (_pkcAnyAsteriskPattern, pcPatternIterator, _zAnyAsteriskPatternLength) )
00344       {
00345         const char*   pkcNextPattern = pcPatternIterator + _zAnyAsteriskPatternLength;
00346         
00347         if ( *pkcNextPattern )
00348         {
00349           while ( ( iNextCharacter != EOF ) && !match (pkcNextPattern) )
00350           { 
00351             ptSourceIstream->get();
00352             iNextCharacter = ptSourceIstream->peek();
00353             ++zTotalMatchedChars;
00354           }
00355         }
00356         else
00357         {
00358           //
00359           //  If this is the last pattern, then it matches all
00360           //  characters till the end of stream.
00361           //
00362           while ( iNextCharacter != EOF )
00363           { 
00364             ptSourceIstream->get();
00365             iNextCharacter = ptSourceIstream->peek();
00366             ++zTotalMatchedChars;
00367           }
00368         }
00369         pcPatternIterator = (char*) pkcNextPattern;
00370         continue;      
00371       }
00372       //
00373       //  _pkcBlankPattern
00374       //
00375       if ( !strncmp (_pkcBlankPattern, pcPatternIterator, _zBlankPatternLength) )
00376       {
00377         if ( ( iNextCharacter != ' ' ) && ( iNextCharacter != '\t' ) )
00378         {
00379           break;
00380         }
00381         else
00382         {
00383           ptSourceIstream->get();
00384           ++zTotalMatchedChars;
00385           pcPatternIterator += _zBlankPatternLength;
00386           continue;
00387         }
00388       }
00389       //
00390       //  _pkcEofPattern
00391       //
00392       if ( !strncmp (_pkcEofPattern, pcPatternIterator, _zEofPatternLength) )
00393       {
00394         //
00395         //  If matcher reaches this point then input is not matching.
00396         //
00397         break;
00398       }
00399       //
00400       //  _pkcStringPattern
00401       //
00402       if ( !strncmp (_pkcStringPattern, pcPatternIterator, _zStringPatternLength) )
00403       {
00404         string   yMatched;
00405         
00406         while ( ( isalnum (iNextCharacter) ) ||
00407                 ( iNextCharacter == '_' )    ||
00408                 ( iNextCharacter == '-' )    ||
00409                 ( iNextCharacter == '.' )     )
00410         {
00411           yMatched       += ptSourceIstream->get();
00412           iNextCharacter  = ptSourceIstream->peek();
00413           ++zTotalMatchedChars;
00414         }
00415         pcPatternIterator += _zStringPatternLength;
00416         if ( gMoreArguments )
00417         {
00418           pyArgument     = va_arg (tVa_list, string*);
00419           gMoreArguments = ( pyArgument != NULL );
00420           if ( gMoreArguments )
00421           { 
00422             *pyArgument = yMatched;
00423           }
00424         }
00425         continue;
00426       }
00427       //
00428       //  _pkcIntegerPattern
00429       //
00430       if ( !strncmp (_pkcIntegerPattern, pcPatternIterator, _zIntegerPatternLength) )
00431       {
00432         TString   yMatched;
00433         
00434         while ( isdigit (iNextCharacter) )
00435         {
00436           yMatched       += ptSourceIstream->get();
00437           iNextCharacter  = ptSourceIstream->peek();
00438           ++zTotalMatchedChars;
00439         }
00440         pcPatternIterator += _zStringPatternLength;
00441         if ( gMoreArguments )
00442         {
00443           piArgument     = va_arg (tVa_list, int*);
00444           gMoreArguments = ( piArgument != NULL );
00445           if ( gMoreArguments )
00446           { 
00447             *piArgument = atoi (yMatched.c_str());
00448           }
00449         }
00450         continue;
00451       }
00452       //
00453       //  _pkcQuotableStringPattern
00454       //
00455       if ( !strncmp ( _pkcQuotableStringPattern     , 
00456                       pcPatternIterator             , 
00457                       _zQuotableStringPatternLength ) )
00458       {
00459         TString   yMatched;
00460         
00461         iNextCharacter = ptSourceIstream->peek();
00462         if ( iNextCharacter == EOF )
00463         {
00464           continue;
00465         }
00466         else
00467         {
00468           if ( iNextCharacter != '"' )
00469           {
00470             //
00471             //  Not-quoted string case.
00472             //
00473             while ( ( isalnum (iNextCharacter) ) ||
00474                     ( iNextCharacter == '_' )    ||
00475                     ( iNextCharacter == '-' )    ||
00476                     ( iNextCharacter == '.' )     )
00477             { 
00478               yMatched       += ptSourceIstream->get();
00479               iNextCharacter  = ptSourceIstream->peek();
00480               ++zTotalMatchedChars;
00481             }
00482           }
00483           else
00484           {
00485             //
00486             //  Quoted string case.
00487             //
00488             ptSourceIstream->get();
00489             ++zTotalMatchedChars;
00490             iNextCharacter = ptSourceIstream->peek();
00491             while ( ( iNextCharacter != '"' ) && ( iNextCharacter != EOF ) )
00492             { 
00493               yMatched       += ptSourceIstream->get();
00494               iNextCharacter  = ptSourceIstream->peek();
00495               ++zTotalMatchedChars;
00496             }
00497             if ( iNextCharacter != '"' )
00498             {
00499               break;
00500             }
00501             else
00502             {
00503               ptSourceIstream->get();
00504               ++zTotalMatchedChars;
00505             }
00506           }
00507           pcPatternIterator += _zQuotableStringPatternLength;
00508           if ( gMoreArguments )
00509           {
00510             pyArgument     = va_arg (tVa_list, string*);
00511             gMoreArguments = ( pyArgument != NULL );
00512             if ( gMoreArguments )
00513             { 
00514               *pyArgument = yMatched;
00515             }
00516           }
00517           continue;
00518         }
00519       }
00520       //
00521       //  _pkcTextPattern
00522       //
00523       if ( !strncmp (_pkcTextPattern, pcPatternIterator, _zTextPatternLength) )
00524       {
00525         TString       yMatched;
00526         const char*   pkcNextPattern = pcPatternIterator + _zTextPatternLength;
00527         
00528         if ( *pkcNextPattern )
00529         {
00530           while ( ( iNextCharacter != EOF ) && !match (pkcNextPattern) )
00531           { 
00532             yMatched       += ptSourceIstream->get();
00533             iNextCharacter  = ptSourceIstream->peek();
00534             ++zTotalMatchedChars;
00535           }
00536         }
00537         else
00538         {
00539           //
00540           //  If this is the last pattern, then it matches all
00541           //  characters till the end of stream.
00542           //
00543           while ( iNextCharacter != EOF )
00544           { 
00545             yMatched       += ptSourceIstream->get();
00546             iNextCharacter  = ptSourceIstream->peek();
00547             ++zTotalMatchedChars;
00548           }
00549         }
00550         pcPatternIterator = (char*) pkcNextPattern;
00551         if ( gMoreArguments )
00552         {
00553           pyArgument     = va_arg (tVa_list, string*);
00554           gMoreArguments = ( pyArgument != NULL );
00555           if ( gMoreArguments )
00556           { 
00557             *pyArgument = yMatched;
00558           }
00559         }
00560         continue;      
00561       }
00562       
00563       //
00564       //  Simple character matching.
00565       //
00566       if ( iNextCharacter != EOF )
00567       {
00568         if ( matchChars (iNextCharacter, pcPatternIterator) )
00569         {
00570           ++zTotalMatchedChars;
00571           ptSourceIstream->get();
00572         }
00573         else
00574         {
00575           if ( *(pcPatternIterator + 1) != '?' )
00576           {
00577             break;
00578           }
00579         }
00580         if ( *(pcPatternIterator + 1) == '?' )
00581         {
00582           ++pcPatternIterator;
00583         }
00584         ++pcPatternIterator;
00585       }
00586     }
00587     if ( *pcPatternIterator )
00588     {
00589       //
00590       //  _pkcEofPattern:  Must be the last pattern.
00591       //
00592       if ( !strcmp (_pkcEofPattern, pcPatternIterator) )
00593       {
00594         if ( iNextCharacter != EOF )
00595         { 
00596           zTotalMatchedChars = 0;
00597           ptSourceIstream->seekg (tInitialStreampos);
00598           ptSourceIstream->clear (tInitialIostate);
00599         }
00600       }
00601       else
00602       {
00603         zTotalMatchedChars = 0;
00604         ptSourceIstream->seekg (tInitialStreampos);
00605         ptSourceIstream->clear (tInitialIostate);
00606       }
00607     }
00608     va_end (tVa_list);
00609   }
00610   return zTotalMatchedChars;
00611   
00612 }  // scan()
00613 
00614 
00615 void mpcl::text::regex::TMatcher::
00616 setCaseSensitiveness (bool gTRUTH)
00617 {
00618   
00619   gCaseSensitive = gTRUTH;
00620   
00621 }  // setCaseSensitiveness()
00622 
00623 
00624 void mpcl::text::regex::TMatcher::
00625 setInput (const char* pkcSTRING)
00626 {
00627 
00628   clearStream();
00629   ptSourceIstream = new std::basic_istringstream<char> (pkcSTRING);
00630   gLocalStream    = true;
00631   if ( pkcSTRING && std::strlen (pkcSTRING) )
00632   {
00633     checkStream();
00634   }
00635 
00636 }  // setInput()
00637 
00638 
00639 void mpcl::text::regex::TMatcher::
00640 setInput (std::basic_istream<char>& rtSOURCE_ISTREAM)
00641 {
00642   
00643   clearStream();
00644   ptSourceIstream = &rtSOURCE_ISTREAM;
00645   checkStream();
00646   
00647 }  // setInput()
00648 
00649 
00650 //
00651 //  S E L E C T O R S
00652 //
00653 
00654 void mpcl::text::regex::TMatcher::
00655 checkStream (void) const
00656 {
00657 
00658   if ( ptSourceIstream )
00659   {
00660     if ( !ptSourceIstream->eof() )
00661     {
00662       //
00663       //  Checks that the stream is  repositionable (and it is not yet at the end
00664       //  of the stream).
00665       //
00666       if ( EOF == ptSourceIstream->rdbuf()->pubseekoff (0, std::ios_base::cur, std::ios_base::in) )
00667       {
00668         throw TNotRePositionableStreamException ("bad stream", __FILE__, __LINE__);
00669       }
00670     }
00671   }
00672 
00673 }  // checkStream()
00674 
00675 
00676 mpcl::text::TString mpcl::text::regex::TMatcher::
00677 instantiate (const char* pkcPATTERN_STRING) const
00678 {
00679 
00680   TStringToStringMap::const_iterator   ktIter = tTerminalMap.begin();
00681   TStringToStringMap::const_iterator   ktEnd  = tTerminalMap.end();
00682   TString                              yInstance (pkcPATTERN_STRING);
00683   
00684   for (; ( ktIter != ktEnd ) ;++ktIter)
00685   {
00686     yInstance.replaceAll (ktIter->first, ktIter->second);
00687   }
00688   return yInstance;
00689   
00690 }  // instantiate()
00691 
00692 
00693 bool mpcl::text::regex::TMatcher::
00694 matchChars (char cSOURCE, const char* pkcPATTERN_STRING) const
00695 {
00696   
00697   bool   gSuccess;
00698   
00699   if ( gCaseSensitive )
00700   {
00701     gSuccess = ( *pkcPATTERN_STRING == cSOURCE );
00702   }
00703   else
00704   {
00705     gSuccess = ( tolower (*pkcPATTERN_STRING) == tolower (cSOURCE) );
00706   }
00707   return gSuccess;
00708   
00709 }  // matchChars()

Generated on Mon Oct 13 02:35:23 2003 for MPCL by doxygen1.2.18