00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020 #include <archon/util/text.H>
00021 #include <archon/util/nfa.H>
00022 #include <archon/util/dfa.H>
00023
00024 #include <archon/util/lexer.H>
00025
00026 using namespace std;
00027
00028 namespace Archon
00029 {
00030 namespace Utilities
00031 {
00032 namespace
00033 {
00034 const unsigned bol_bit = 1;
00035 const unsigned eol_bit = 2;
00036 const unsigned bow_bit = 4;
00037 const unsigned eow_bit = 8;
00038 }
00039
00040 string LexerBase::InputException::getMessage() const throw()
00041 {
00042 return Text::toString(lineNumber)+ ": Unexpected character " +
00043 Text::toString(Text::escapeNonprintable(text));
00044 }
00045
00046 Lexer::Engine::Engine(const RuleSet &ruleSet, const ActorBase *a):
00047 actor(a)
00048 {
00049 rules.resize(ruleSet.rules.size());
00050 NFA nfa;
00051 for(unsigned i=0; i<ruleSet.rules.size(); ++i)
00052 {
00053 const RuleSet::Rule &r = ruleSet.rules[i];
00054 const int priority = ruleSet.rules.size()-i-1;
00055 nfa.altern(NFA(r.regex, priority));
00056 rules[priority] = Rule(r.method, r.lexeme);
00057 }
00058
00059 DFA::AnchorInfo anchorInfo(0xE000, 4);
00060
00061 DFA dfa1(nfa, 0, &anchorInfo);
00062
00063 dfa = auto_ptr<DFA>(new DFA(DFA::Minimize(dfa1)));
00064
00065
00066
00067 anchorMasks.resize(dfa->getNumberOfStates(), 0);
00068 for(unsigned i=0; i<static_cast<unsigned>(dfa->getNumberOfStates()); ++i)
00069 {
00070 const DFA::State &s = dfa->getState(i);
00071 unsigned &mask = anchorMasks[i];
00072 for(unsigned j=0; j<4; ++j)
00073 if(s.step(0xE000+j) != -1) mask |= 1u<<j;
00074 }
00075 }
00076
00077 void Lexer::getNext(Lexeme &lexeme)
00078 {
00079 for(;;)
00080 {
00081
00082 if(charCount > 0)
00083 {
00084 copy(buffer.begin()+charCount, buffer.begin()+bufferSize, buffer.begin());
00085 bufferSize -= charCount;
00086 charCount = 0;
00087 }
00088
00089 int stateIndex = 0;
00090
00091 int finalRuleIndex = -1;
00092 int finalCharCount = -1;
00093 int finalAnchorState = -1;
00094 int finalLineNumber = -1;
00095
00096
00097 int secondAnchorState = -1;
00098 int secondLineNumber = -1;
00099
00100 do
00101 {
00102
00103 const DFA::State *state = &engine.dfa->getState(stateIndex);
00104 const int ruleIndex = state->getFinalValue();
00105 if(ruleIndex >= 0)
00106 {
00107 finalRuleIndex = ruleIndex;
00108 finalCharCount = charCount;
00109 finalAnchorState = anchorState;
00110 finalLineNumber = lineNumber;
00111 }
00112
00113 const int charCountBeforeRead = charCount;
00114 const int anchorStateBeforeRead = anchorState;
00115 const int lineNumberBeforeRead = lineNumber;
00116
00117
00118 uchar ch;
00119 bool eoi = false;
00120 if(charCount<bufferSize) ch = buffer[charCount++];
00121 else
00122 {
00123 if(reader->read(&ch, 1)<1) eoi = true;
00124 else
00125 {
00126 if(bufferSize<static_cast<int>(buffer.size()))
00127 buffer[bufferSize] = ch;
00128 else buffer += ch;
00129 ++bufferSize;
00130 ++charCount;
00131 }
00132 }
00133
00134 unsigned anchorMask = 0;
00135 switch(anchorState)
00136 {
00137
00138 case 0:
00139 if(eoi)
00140 {
00141 anchorMask = bol_bit|eol_bit;
00142 break;
00143 }
00144 switch(ch)
00145 {
00146 case '\r':
00147 anchorMask = bol_bit|eol_bit;
00148 ++lineNumber;
00149 anchorState = 2;
00150 break;
00151 case '\n':
00152 case '\v':
00153 case '\f':
00154 case 0x85:
00155 case 0x2028:
00156 case 0x2029:
00157 anchorMask = bol_bit|eol_bit;
00158 ++lineNumber;
00159 break;
00160 default:
00161 anchorMask = bol_bit;
00162 anchorState = 1;
00163 }
00164 break;
00165
00166
00167 case 1:
00168 if(eoi)
00169 {
00170 anchorMask = eol_bit;
00171 break;
00172 }
00173 switch(ch)
00174 {
00175 case '\r':
00176 anchorMask = eol_bit;
00177 anchorState = 2;
00178 break;
00179 case '\n':
00180 case '\v':
00181 case '\f':
00182 case 0x85:
00183 case 0x2028:
00184 case 0x2029:
00185 anchorMask = eol_bit;
00186 ++lineNumber;
00187 anchorState = 0;
00188 break;
00189 default:
00190 anchorMask = 0;
00191 }
00192 break;
00193
00194
00195 case 2:
00196 if(eoi)
00197 {
00198 anchorMask = bol_bit|eol_bit;
00199 break;
00200 }
00201 switch(ch)
00202 {
00203 case '\n':
00204 anchorMask = 0;
00205 anchorState = 0;
00206 break;
00207 case '\r':
00208 anchorMask = bol_bit|eol_bit;
00209 ++lineNumber;
00210 break;
00211 case '\v':
00212 case '\f':
00213 case 0x85:
00214 case 0x2028:
00215 case 0x2029:
00216 anchorMask |= bol_bit|eol_bit;
00217 ++lineNumber;
00218 anchorState = 0;
00219 break;
00220 default:
00221 anchorMask = bol_bit;
00222 anchorState = 1;
00223 }
00224 break;
00225 }
00226
00227 if(secondAnchorState == -1)
00228 {
00229 secondAnchorState = anchorState;
00230 secondLineNumber = lineNumber;
00231 }
00232
00233
00234 while(unsigned mask = engine.anchorMasks[stateIndex] & anchorMask)
00235 {
00236 unsigned bitPos = 0;
00237 while(!(mask&1u))
00238 {
00239 mask >>= 1;
00240 ++bitPos;
00241 }
00242 anchorMask &= ~(1u<<bitPos);
00243 stateIndex = state->step(0xE000+bitPos);
00244 state = &engine.dfa->getState(stateIndex);
00245 const int ruleIndex = state->getFinalValue();
00246 if(ruleIndex >= 0)
00247 {
00248 finalRuleIndex = ruleIndex;
00249 finalCharCount = charCountBeforeRead;
00250 finalAnchorState = anchorStateBeforeRead;
00251 finalLineNumber = lineNumberBeforeRead;
00252 }
00253 }
00254
00255 if(eoi) break;
00256
00257
00258 stateIndex = state->step(ch);
00259 }
00260 while(stateIndex != -1);
00261
00262 if(finalRuleIndex < 0)
00263 {
00264 if(charCount == 0)
00265 {
00266 lexeme.type = type = -1;
00267 return;
00268 }
00269
00270 charCount = 1;
00271 anchorState = secondAnchorState;
00272 lineNumber = secondLineNumber;
00273 if(context)
00274 {
00275 context->lexerError();
00276 continue;
00277 }
00278 throw InputException(lineNumber, getText());
00279 }
00280
00281 charCount = finalCharCount;
00282 anchorState = finalAnchorState;
00283 lineNumber = finalLineNumber;
00284
00285 const Engine::Rule &rule = engine.rules[finalRuleIndex];
00286 lexeme = rule.lexeme;
00287 if(rule.method > -1)
00288 engine.actor->call(rule.method, context, lexeme);
00289
00290 if(lexeme.type < 0) continue;
00291 type = lexeme.type;
00292 return;
00293 }
00294 }
00295 }
00296 }