00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020 #include <stack>
00021 #include <algorithm>
00022
00023 #include <archon/util/text.H>
00024 #include <archon/util/dfa.H>
00025 #include <archon/util/regex.H>
00026
00027 #include <archon/util/nfa.H>
00028
00029 using namespace std;
00030
00031 namespace Archon
00032 {
00033 namespace Utilities
00034 {
00039 bool NFA::closedAdd(int state, set<int> &stateSet) const
00040 {
00041 if(!stateSet.insert(state).second) return false;
00042 stack<int> uncheckedStates;
00043 uncheckedStates.push(state);
00044 while(!uncheckedStates.empty())
00045 {
00046 const State &s = states[uncheckedStates.top()];
00047 uncheckedStates.pop();
00048 for(unsigned i=0; i<s.epsilonEdges.size(); ++i)
00049 {
00050 const int e = s.epsilonEdges[i];
00051 if(stateSet.insert(e).second) uncheckedStates.push(e);
00052 }
00053 }
00054 return true;
00055 }
00056
00073 void NFA::updateDfaEdgeRange(Edge e, map<unsigned, pair<unsigned, set<int> > > &dEdges) const
00074 {
00075 bool hasRawStateSet = false;
00076 set<int> rawStateSet;
00077
00078 map<unsigned, pair<unsigned, set<int> > >::iterator i = dEdges.lower_bound(e.begin);
00079
00080
00081
00082
00083
00084
00085
00086
00087
00088
00089
00090
00091
00092
00093
00094 if(i != dEdges.begin())
00095 {
00096
00097
00098 map<unsigned, pair<unsigned, set<int> > >::iterator j = i;
00099 --j;
00100 if(e.begin <= j->second.first)
00101 {
00102
00103
00104
00105
00106
00107 dEdges.insert(i, make_pair(e.begin, j->second));
00108 j->second.first = e.begin-1;
00109
00110
00111
00112 --i;
00113 }
00114 }
00115
00116
00117 for(;;)
00118 {
00119
00120
00121
00122
00123
00124
00125
00126
00127
00128
00129
00130
00131
00132
00133
00134
00135
00136
00137
00138
00139
00140
00141
00142
00143
00144 if(i == dEdges.end() || e.begin < i->first)
00145 {
00146
00147
00148
00149
00150
00151
00152 if(!hasRawStateSet) closedAdd(e.targetState, rawStateSet);
00153 bool expanded = false;
00154 const int gapEnd = i == dEdges.end() ? e.end : min(i->first-1, e.end);
00155 if(i != dEdges.begin())
00156 {
00157 map<unsigned, pair<unsigned, set<int> > >::iterator j = i;
00158 --j;
00159
00160 if(e.begin-1 == j->second.first &&
00161 rawStateSet == j->second.second)
00162 {
00163 j->second.first = gapEnd;
00164 expanded = true;
00165 }
00166 }
00167
00168 if(!expanded)
00169 dEdges.insert(i, make_pair(e.begin, make_pair(gapEnd, rawStateSet)));
00170
00171
00172
00173
00174
00175
00176
00177
00178 if(i == dEdges.end() || e.end < i->first-1) break;
00179
00180
00181
00182
00183
00184
00185
00186
00187
00188
00189
00190
00191 map<unsigned, pair<unsigned, set<int> > >::iterator j = i;
00192 --j;
00193 if(e.end == i->first-1)
00194 {
00195 if(j->second.second == i->second.second)
00196 {
00197 j->second.first = i->second.first;
00198 dEdges.erase(i);
00199 }
00200 break;
00201 }
00202
00203
00204
00205
00206
00207
00208 e.begin = i->first;
00209 }
00210
00211 ++i;
00212
00213
00214
00215
00216
00217
00218
00219
00220
00221
00222
00223
00224
00225
00226
00227
00228
00229 map<unsigned, pair<unsigned, set<int> > >::iterator j = i;
00230 --j;
00231
00232 bool done = false;
00233 set<int> stateSet = j->second.second;
00234 if(closedAdd(e.targetState, stateSet))
00235 {
00236 if(e.end < j->second.first)
00237 {
00238 dEdges.insert(i, make_pair(e.end+1, j->second));
00239 --i;
00240 j->second.first = e.end;
00241 done = true;
00242 }
00243 else
00244 {
00245 if(e.end == j->second.first) done = true;
00246 else e.begin = j->second.first+1;
00247 }
00248 j->second.second = stateSet;
00249 }
00250 else
00251 {
00252 if(e.end <= j->second.first) done = true;
00253 else e.begin = j->second.first+1;
00254 }
00255
00256
00257 if(j!=dEdges.begin())
00258 {
00259 map<unsigned, pair<unsigned, set<int> > >::iterator k = j;
00260 --k;
00261
00262 if(k->second.first == j->first-1 &&
00263 k->second.second == j->second.second)
00264 {
00265 k->second.first = j->second.first;
00266 dEdges.erase(j);
00267 }
00268 }
00269
00270 if(done) break;
00271 }
00272 }
00273
00274 int NFA::chooseFinalValue(const set<int> &stateSet) const
00275 {
00276 int rule = -1;
00277 for(set<int>::iterator i=stateSet.begin(); i!=stateSet.end(); ++i)
00278 {
00279 const int r = states[*i].finalValue;
00280 if(r > rule) rule = r;
00281 }
00282 return rule;
00283 }
00284
00285 template<typename T>
00286 void set_include(const set<T> &right, set<T> &left)
00287 {
00288 typename set<T>::iterator i=left.begin();
00289 typename set<T>::const_iterator j=right.begin();
00290 while(j!=right.end())
00291 {
00292 if(i == left.end() || *j < *i) left.insert(i, *j++);
00293 else if(*j == *i) ++j;
00294 else ++i;
00295 }
00296 }
00297
00326 void NFA::constructDFA(DFA *dfa, vector<set<int> > *stateSets, DFA::AnchorInfo *anchorInfo) const
00327 {
00328 if(!states.size()) return;
00329
00330
00331 map<set<int>, int> dStates;
00332
00333 stack<pair<set<int>, int> > uncheckedDStates;
00334
00335
00336 for(unsigned i=0; i<startStates.size(); ++i)
00337 {
00338 set<int> stateSet;
00339 closedAdd(startStates[i], stateSet);
00340 pair<map<set<int>, int>::iterator ,bool> r =
00341 dStates.insert(make_pair(stateSet, 0));
00342 if(r.second)
00343 {
00344 r.first->second = dfa->addStartState(chooseFinalValue(stateSet));
00345 uncheckedDStates.push(*r.first);
00346 }
00347 }
00348
00349 while(!uncheckedDStates.empty())
00350 {
00351 const pair<set<int>, int> &dState = uncheckedDStates.top();
00352 map<unsigned, pair<unsigned, set<int> > > dEdges;
00353 for(set<int>::iterator i=dState.first.begin(); i!=dState.first.end(); ++i)
00354 {
00355 const State &s = states[*i];
00356 for(unsigned i=0; i<s.symbolEdges.size(); ++i)
00357 updateDfaEdgeRange(s.symbolEdges[i], dEdges);
00358 }
00359
00360
00361 if(anchorInfo) do
00362 {
00363
00364
00365 map<unsigned, pair<unsigned, set<int> > >::iterator i =
00366 dEdges.lower_bound(anchorInfo->start);
00367 if(i != dEdges.begin() && (i == dEdges.end() || i->first != anchorInfo->start)) --i;
00368 for(unsigned j=0; j<anchorInfo->numberOf; ++j)
00369 {
00370 unsigned anchor = anchorInfo->start + j;
00371 while(i != dEdges.end() && i->second.first < anchor) ++i;
00372 if(i == dEdges.end()) break;
00373 if(i->first <= anchor)
00374 {
00375 for(set<int>::iterator k=dState.first.begin(); k!=dState.first.end(); ++k)
00376 updateDfaEdgeRange(Edge(anchor, anchor, *k), dEdges);
00377 }
00378 }
00379 }
00380 while(false);
00381
00382 const int currentDFAState = dState.second;
00383 uncheckedDStates.pop();
00384 for(map<unsigned, pair<unsigned, set<int> > >::iterator i=dEdges.begin(); i!=dEdges.end(); ++i)
00385 {
00386 pair<map<set<int>, int>::iterator ,bool> r =
00387 dStates.insert(make_pair(i->second.second, 0));
00388 if(r.second)
00389 {
00390 r.first->second = dfa->addState(chooseFinalValue(r.first->first));
00391 uncheckedDStates.push(*r.first);
00392 }
00393 dfa->addEdgeRange(i->first, i->second.first, currentDFAState, r.first->second);
00394 }
00395 }
00396
00397 if(stateSets)
00398 {
00399 stateSets->clear();
00400 stateSets->resize(dfa->states.size());
00401 for(map<set<int>, int>::iterator i=dStates.begin(); i!=dStates.end(); ++i)
00402 (*stateSets)[i->second] = i->first;
00403 }
00404 }
00405
00406 NFA::NFA(basic_string<unsigned> s, int finalValue)
00407 {
00408 states.resize(s.size()+1);
00409 for(unsigned i=0; i<s.size(); ++i)
00410 states[i].symbolEdges.push_back(Edge(s[i], s[i], i+1));
00411 states.back().finalValue = finalValue;
00412 startStates.push_back(0);
00413 }
00414
00415 NFA::NFA(unsigned begin, unsigned end, int finalValue)
00416 {
00417 if(begin > end) ARCHON_THROW1(ArgumentException, "Illegal range");
00418 states.resize(2);
00419 states.front().symbolEdges.push_back(Edge(begin, end, 1));
00420 states.back().finalValue = finalValue;
00421 startStates.push_back(0);
00422 }
00423
00424 NFA::NFA(const vector<pair<unsigned, unsigned> > &ranges, int finalValue)
00425 {
00426 states.resize(2);
00427 for(unsigned i=0; i<ranges.size(); ++i)
00428 {
00429 const pair<unsigned, unsigned> &r = ranges[i];
00430 if(r.first > r.second) ARCHON_THROW1(ArgumentException, "Illegal range");
00431 states.front().symbolEdges.push_back(Edge(r.first, r.second, 1));
00432 }
00433 states.back().finalValue = finalValue;
00434 startStates.push_back(0);
00435 }
00436
00437 void NFA::altern(const NFA &a)
00438 {
00439 if(!states.size())
00440 {
00441 *this = a;
00442 return;
00443 }
00444 if(!a.states.size()) return;
00445 if(startStates.size()!=1 || a.startStates.size()!=1)
00446 ARCHON_THROW1(ArgumentException, "Wrong number of start states");
00447
00448 const unsigned n = states.size();
00449 const unsigned m = n + a.states.size();
00450 states.resize(m + 1);
00451 for(unsigned i=0; i<a.states.size(); ++i)
00452 {
00453 State &s = states[n+i] = a.states[i];
00454 for(unsigned j=0; j<s.symbolEdges.size(); ++j)
00455 s.symbolEdges[j].targetState += n;
00456 for(unsigned j=0; j<s.epsilonEdges.size(); ++j)
00457 s.epsilonEdges[j] += n;
00458 }
00459 {
00460 State &s = states[m];
00461 s.epsilonEdges.push_back(startStates.front());
00462 s.epsilonEdges.push_back(n+a.startStates.front());
00463 }
00464 startStates.front() = m;
00465 }
00466
00467 void NFA::concat(const NFA &a)
00468 {
00469 if(!states.size())
00470 {
00471 *this = a;
00472 return;
00473 }
00474 if(!a.states.size()) return;
00475 if(a.startStates.size()!=1)
00476 ARCHON_THROW1(ArgumentException, "Wrong number of start states");
00477
00478 const unsigned n = states.size();
00479 states.resize(n + a.states.size());
00480 const unsigned m = a.startStates.front()+n;
00481 for(unsigned i=0; i<n; ++i)
00482 {
00483 State &s = states[i];
00484 if(s.finalValue < 0) continue;
00485 s.finalValue = -1;
00486 s.epsilonEdges.push_back(m);
00487 }
00488 for(unsigned i=0; i<a.states.size(); ++i)
00489 {
00490 State &s = states[n+i] = a.states[i];
00491 for(unsigned j=0; j<s.symbolEdges.size(); ++j)
00492 s.symbolEdges[j].targetState += n;
00493 for(unsigned j=0; j<s.epsilonEdges.size(); ++j)
00494 s.epsilonEdges[j] += n;
00495 }
00496 }
00497
00498 void NFA::repeat(bool includeZero, int finalValue)
00499 {
00500 if(!states.size()) return;
00501 if(startStates.size()!=1)
00502 ARCHON_THROW1(ArgumentException, "Wrong number of start states");
00503
00504 if(includeZero)
00505 {
00506 const unsigned n = states.size();
00507 states.resize(n+1);
00508 for(unsigned i=0; i<n; ++i)
00509 {
00510 State &s = states[i];
00511 if(s.finalValue < 0) continue;
00512 s.finalValue = -1;
00513 s.epsilonEdges.push_back(n);
00514 }
00515 {
00516 State &s = states[n];
00517 s.epsilonEdges.push_back(startStates.front());
00518 s.finalValue = finalValue;
00519 }
00520 startStates.front() = n;
00521 return;
00522 }
00523
00524
00525 for(unsigned i=0; i<states.size(); ++i)
00526 {
00527 State &s = states[i];
00528 if(s.finalValue < 0) continue;
00529 s.epsilonEdges.push_back(startStates.front());
00530 }
00531 }
00532
00533 void NFA::setFinalValue(int finalValue)
00534 {
00535 for(unsigned i=0; i<states.size(); ++i)
00536 {
00537 State &s = states[i];
00538 if(s.finalValue < 0) continue;
00539 s.finalValue = finalValue;
00540 }
00541 }
00542
00543 NFA::NFA(const DFA &a)
00544 {
00545 a.makeNFA(this);
00546 }
00547
00548 NFA::NFA(const Regex &r, int finalValue)
00549 {
00550 r.makeNfa(*this);
00551 setFinalValue(finalValue);
00552 }
00553
00554 int NFA::addStartState(int finalValue)
00555 {
00556 startStates.push_back(states.size());
00557 return addState(finalValue);
00558 }
00559
00560 int NFA::addState(int finalValue)
00561 {
00562 states.push_back(State(finalValue));
00563 return states.size()-1;
00564 }
00565
00566 void NFA::addEdge(int originalState, int targetState)
00567 {
00568 states[originalState].epsilonEdges.push_back(targetState);
00569 }
00570
00571 void NFA::addEdge(int originalState, int targetState,
00572 unsigned begin, unsigned end)
00573 {
00574 if(begin > end) ARCHON_THROW1(ArgumentException, "Illegal range");
00575 states[originalState].symbolEdges.push_back(Edge(begin, end, targetState));
00576 }
00577
00578 struct DefaultPrinter: NFA::Printer
00579 {
00580 string printSymbol(unsigned s) const
00581 {
00582 return Text::toString(s);
00583 }
00584 };
00585
00586 const NFA::Printer &NFA::getDefaultPrinter()
00587 {
00588 static DefaultPrinter defaultPrinter;
00589 return defaultPrinter;
00590 }
00591
00592 string NFA::print(const Printer &printer, int width) const
00593 {
00594 vector<double> columnWidthFractions;
00595 columnWidthFractions.push_back(0.1);
00596 columnWidthFractions.push_back(0.1);
00597 columnWidthFractions.push_back(0.8);
00598 Text::Table<string> table(states.size()+1, columnWidthFractions);
00599 table(0, 0) = "State";
00600 table(0, 1) = "Final";
00601 table(0, 2) = "NFA-transitions";
00602 for(unsigned i=0; i<states.size(); ++i)
00603 {
00604 const State &s = states[i];
00605 table(i+1, 0) = Text::toString(i);
00606 if(find(startStates.begin(), startStates.end(), static_cast<int>(i))!=startStates.end())
00607 table(i+1, 0) += "*";
00608 if(s.finalValue >= 0) table(i+1, 1) = Text::toString(s.finalValue);
00609 for(unsigned j=0; j<s.symbolEdges.size(); ++j)
00610 {
00611 if(table(i+1, 2).size()) table(i+1, 2) += ", ";
00612 const Edge &e = s.symbolEdges[j];
00613 table(i+1, 2) += "'" + printer.printSymbol(e.begin) + "'";
00614 if(e.begin < e.end)
00615 table(i+1, 2) += "-'" + printer.printSymbol(e.end) + "'";
00616 table(i+1, 2) += " -> " + Text::toString(e.targetState);
00617 }
00618
00619 for(unsigned j=0; j<s.epsilonEdges.size(); ++j)
00620 {
00621 if(table(i+1, 2).size()) table(i+1, 2) += ", ";
00622 table(i+1, 2) += "-> " + Text::toString(s.epsilonEdges[j]);
00623 }
00624 }
00625
00626 return Text::format(table, width, 2, true);
00627 }
00628 }
00629 }