Archon: archon/util/lexer.C Source File

00001 /*
00002  * This file is part of the "Archon" framework.
00003  * (http://files3d.sourceforge.net)
00004  *
00005  * Copyright © 2002 by Kristian Spangsege and Brian Kristiansen.
00006  *
00007  * Permission to use, copy, modify, and distribute this software and
00008  * its documentation under the terms of the GNU General Public License is
00009  * hereby granted. No representations are made about the suitability of
00010  * this software for any purpose. It is provided "as is" without express
00011  * or implied warranty. See the GNU General Public License
00012  * (http://www.gnu.org/copyleft/gpl.html) for more details.
00013  *
00014  * The characters in this file are ISO8859-1 encoded.
00015  *
00016  * The documentation in this file is in "Doxygen" style
00017  * (http://www.doxygen.org).
00018  */
00019 
00020 #include <archon/util/text.H>
00021 #include <archon/util/nfa.H>
00022 #include <archon/util/dfa.H>
00023 
00024 #include <archon/util/lexer.H>
00025 
00026 using namespace std;
00027 
00028 namespace Archon
00029 {
00030   namespace Utilities
00031   {
00032     namespace
00033     {
00034       const unsigned bol_bit = 1;
00035       const unsigned eol_bit = 2;
00036       const unsigned bow_bit = 4;
00037       const unsigned eow_bit = 8;
00038     }
00039 
00040     string LexerBase::InputException::getMessage() const throw()
00041     {
00042       return Text::toString(lineNumber)+ ": Unexpected character " +
00043         Text::toString(Text::escapeNonprintable(text));
00044     }
00045 
00046     Lexer::Engine::Engine(const RuleSet &ruleSet, const ActorBase *a):
00047       actor(a)
00048     {
00049       rules.resize(ruleSet.rules.size());
00050       NFA nfa;
00051       for(unsigned i=0; i<ruleSet.rules.size(); ++i)
00052       {
00053         const RuleSet::Rule &r = ruleSet.rules[i];
00054         const int priority = ruleSet.rules.size()-i-1;
00055         nfa.altern(NFA(r.regex, priority));
00056         rules[priority] = Rule(r.method, r.lexeme);
00057       }
00058 
00059       DFA::AnchorInfo anchorInfo(0xE000, 4);
00060       //cerr << nfa.print() << "\n";
00061       DFA dfa1(nfa, 0, &anchorInfo);
00062       //cerr << a.print() << "\n";
00063       dfa = auto_ptr<DFA>(new DFA(DFA::Minimize(dfa1)));
00064       //cerr << dfa->print() << "\n";
00065 
00066       // Precalculate anchor masks for each state
00067       anchorMasks.resize(dfa->getNumberOfStates(), 0);
00068       for(unsigned i=0; i<static_cast<unsigned>(dfa->getNumberOfStates()); ++i)
00069       {
00070         const DFA::State &s = dfa->getState(i);
00071         unsigned &mask = anchorMasks[i];
00072         for(unsigned j=0; j<4; ++j)
00073           if(s.step(0xE000+j) != -1) mask |= 1u<<j;
00074       }
00075     }
00076 
00077     void Lexer::getNext(Lexeme &lexeme)
00078     {
00079       for(;;)
00080       {
00081         // Discard previous lexeme from buffer
00082         if(charCount > 0)
00083         {
00084           copy(buffer.begin()+charCount, buffer.begin()+bufferSize, buffer.begin());
00085           bufferSize -= charCount;
00086           charCount = 0;
00087         }
00088 
00089         int stateIndex = 0;
00090 
00091         int finalRuleIndex   = -1;
00092         int finalCharCount   = -1;
00093         int finalAnchorState = -1;
00094         int finalLineNumber  = -1;
00095 
00096         // Support for discartion of characters
00097         int secondAnchorState = -1;
00098         int secondLineNumber = -1;
00099 
00100         do
00101         {
00102           // Fetch the current DFA state
00103           const DFA::State *state = &engine.dfa->getState(stateIndex);
00104           const int ruleIndex = state->getFinalValue();
00105           if(ruleIndex >= 0)
00106           {
00107             finalRuleIndex   = ruleIndex;
00108             finalCharCount   = charCount;
00109             finalAnchorState = anchorState;
00110             finalLineNumber  = lineNumber;
00111           }
00112 
00113           const int charCountBeforeRead   = charCount;
00114           const int anchorStateBeforeRead = anchorState;
00115           const int lineNumberBeforeRead  = lineNumber;
00116 
00117           // Read another character
00118           uchar ch;
00119           bool eoi = false;
00120           if(charCount<bufferSize) ch = buffer[charCount++];
00121           else
00122           {
00123             if(reader->read(&ch, 1)<1) eoi = true;
00124             else
00125             {
00126               if(bufferSize<static_cast<int>(buffer.size()))
00127                 buffer[bufferSize] = ch;
00128               else buffer += ch;
00129               ++bufferSize;
00130               ++charCount;
00131             }
00132           }
00133 
00134           unsigned anchorMask = 0; // A function of anchor state and input symbol
00135           switch(anchorState)
00136           {
00137             // At the beginning of a line
00138           case 0:
00139             if(eoi)
00140             {
00141               anchorMask = bol_bit|eol_bit;
00142               break;        
00143             }
00144             switch(ch)
00145             {
00146             case '\r':
00147               anchorMask = bol_bit|eol_bit;
00148               ++lineNumber;
00149               anchorState = 2;
00150               break;
00151             case '\n':
00152             case '\v':
00153             case '\f':
00154             case 0x85:
00155             case 0x2028:
00156             case 0x2029:
00157               anchorMask = bol_bit|eol_bit;
00158               ++lineNumber;
00159               break;
00160             default:
00161               anchorMask = bol_bit;
00162               anchorState = 1;
00163             }
00164             break;
00165 
00166             // After the beginning of a line
00167           case 1:
00168             if(eoi)
00169             {
00170               anchorMask = eol_bit;
00171               break;        
00172             }
00173             switch(ch)
00174             {
00175             case '\r':
00176               anchorMask = eol_bit;
00177               anchorState = 2;
00178               break;
00179             case '\n':
00180             case '\v':
00181             case '\f':
00182             case 0x85:
00183             case 0x2028:
00184             case 0x2029:
00185               anchorMask = eol_bit;
00186               ++lineNumber;
00187               anchorState = 0;
00188               break;
00189             default:
00190               anchorMask = 0;
00191             }
00192             break;
00193 
00194             // After a '\r'
00195           case 2:
00196             if(eoi)
00197             {
00198               anchorMask = bol_bit|eol_bit;
00199               break;        
00200             }
00201             switch(ch)
00202             {
00203             case '\n':
00204               anchorMask = 0;
00205               anchorState = 0;
00206               break;
00207             case '\r':
00208               anchorMask = bol_bit|eol_bit;
00209               ++lineNumber;
00210               break;
00211             case '\v':
00212             case '\f':
00213             case 0x85:
00214             case 0x2028:
00215             case 0x2029:
00216               anchorMask |= bol_bit|eol_bit;
00217               ++lineNumber;
00218               anchorState = 0;
00219               break;
00220             default:
00221               anchorMask = bol_bit;
00222               anchorState = 1;
00223             }
00224             break;
00225           }
00226 
00227           if(secondAnchorState == -1)
00228           {
00229             secondAnchorState = anchorState;
00230             secondLineNumber  = lineNumber;
00231           }
00232 
00233           // Step along anchor edges first
00234           while(unsigned mask = engine.anchorMasks[stateIndex] & anchorMask)
00235           {
00236             unsigned bitPos = 0;
00237             while(!(mask&1u))
00238             {
00239               mask >>= 1;
00240               ++bitPos;
00241             }
00242             anchorMask &= ~(1u<<bitPos);
00243             stateIndex = state->step(0xE000+bitPos);
00244             state = &engine.dfa->getState(stateIndex);
00245             const int ruleIndex = state->getFinalValue();
00246             if(ruleIndex >= 0)
00247             {
00248               finalRuleIndex   = ruleIndex;
00249               finalCharCount   = charCountBeforeRead;
00250               finalAnchorState = anchorStateBeforeRead;
00251               finalLineNumber  = lineNumberBeforeRead;
00252             }
00253           }
00254 
00255           if(eoi) break;
00256 
00257           // Step to the next DFA state
00258           stateIndex = state->step(ch);
00259         }
00260         while(stateIndex != -1);
00261 
00262         if(finalRuleIndex < 0)
00263         {
00264           if(charCount == 0)
00265           {
00266             lexeme.type = type = -1;
00267             return;
00268           }
00269 
00270           charCount   = 1;
00271           anchorState = secondAnchorState;
00272           lineNumber  = secondLineNumber;
00273           if(context)
00274           {
00275             context->lexerError();
00276             continue;
00277           }
00278           throw InputException(lineNumber, getText());
00279         }
00280 
00281         charCount   = finalCharCount;
00282         anchorState = finalAnchorState;
00283         lineNumber  = finalLineNumber;
00284 
00285         const Engine::Rule &rule = engine.rules[finalRuleIndex];
00286         lexeme = rule.lexeme;
00287         if(rule.method > -1)
00288           engine.actor->call(rule.method, context, lexeme);
00289 
00290         if(lexeme.type < 0) continue;
00291         type = lexeme.type;
00292         return;
00293       }
00294     }
00295   }
00296 }