Archon: archon/util/nfa.C Source File

00001 /*
00002  * This file is part of the "Archon" framework.
00003  * (http://files3d.sourceforge.net)
00004  *
00005  * Copyright © 2002 by Kristian Spangsege and Brian Kristiansen.
00006  *
00007  * Permission to use, copy, modify, and distribute this software and
00008  * its documentation under the terms of the GNU General Public License is
00009  * hereby granted. No representations are made about the suitability of
00010  * this software for any purpose. It is provided "as is" without express
00011  * or implied warranty. See the GNU General Public License
00012  * (http://www.gnu.org/copyleft/gpl.html) for more details.
00013  *
00014  * The characters in this file are ISO8859-1 encoded.
00015  *
00016  * The documentation in this file is in "Doxygen" style
00017  * (http://www.doxygen.org).
00018  */
00019 
00020 #include <stack>
00021 #include <algorithm>
00022 
00023 #include <archon/util/text.H>
00024 #include <archon/util/dfa.H>
00025 #include <archon/util/regex.H>
00026 
00027 #include <archon/util/nfa.H>
00028 
00029 using namespace std;
00030 
00031 namespace Archon
00032 {
00033   namespace Utilities
00034   {
00039     bool NFA::closedAdd(int state, set<int> &stateSet) const
00040     {
00041       if(!stateSet.insert(state).second) return false;
00042       stack<int> uncheckedStates;
00043       uncheckedStates.push(state);
00044       while(!uncheckedStates.empty())
00045       {
00046         const State &s = states[uncheckedStates.top()];
00047         uncheckedStates.pop();
00048         for(unsigned i=0; i<s.epsilonEdges.size(); ++i)
00049         {
00050           const int e = s.epsilonEdges[i];
00051           if(stateSet.insert(e).second) uncheckedStates.push(e);
00052         }
00053       }
00054       return true;
00055     }
00056 
00073     void NFA::updateDfaEdgeRange(Edge e, map<unsigned, pair<unsigned, set<int> > > &dEdges) const
00074     {
00075       bool hasRawStateSet = false;
00076       set<int> rawStateSet;
00077 
00078       map<unsigned, pair<unsigned, set<int> > >::iterator i = dEdges.lower_bound(e.begin);
00079 
00080       /*
00081        * Conditions:
00082        *
00083        * - 'i' references the interval that starts where the incoming
00084        * interval starts or if no such interval exists 'i' references
00085        * the first interval starting after the start of the incoming
00086        * interval or if no such interval exists 'i' references a virtual
00087        * interval starting at infinity.
00088        *
00089        * - The incoming interval is not empty.
00090        */
00091 
00092       // Detect an overlap with the existing interval (if it exists)
00093       // preceeding the interval referenced by 'i'.
00094       if(i != dEdges.begin())
00095       {
00096         // We may now assume that e.begin > 0
00097 
00098         map<unsigned, pair<unsigned, set<int> > >::iterator j = i;
00099         --j;
00100         if(e.begin <= j->second.first)
00101         {
00102           // The new interval has an overlap with the interval starting
00103           // before it
00104 
00105           // Split the existing interval into two pieces such that the
00106           // second piece starts where the incoming interval starts.
00107           dEdges.insert(i, make_pair(e.begin, j->second));
00108           j->second.first = e.begin-1;
00109 
00110           // Let 'i' refer to the second of the two pieces resulting
00111           // from the split
00112           --i;
00113         }
00114       }
00115 
00116       // General loop
00117       for(;;)
00118       {
00119         /*
00120          * Conditions:
00121          *
00122          * - 'i' references an existing interval or the virtual interval
00123          * succeeding the last real interval. The virtual interval
00124          * conceptually starts at positive infinity.
00125          *
00126          * - The remaining part of the incoming interval does not start
00127          * after the start of the existing real/virtual interval
00128          * referenced by 'i'.
00129          *
00130          * - The remaining part of the incoming interval does not
00131          * overlap with the previous existing real interval 'i-1' if
00132          * such an interval exists.
00133          *
00134          * - The remaining part of the incoming interval is not empty.
00135          */
00136 
00137         /*
00138          * Does the remaining part of the incloming interval start
00139          * before the existing interval referenced by 'i'. In this case
00140          * there is an uncovered range before the interval referenced by
00141          * 'i'. In the following we refer to the uncovered range as the
00142          * gap.
00143          */
00144         if(i == dEdges.end() || e.begin < i->first)
00145         {
00146           /*
00147            * If the incoming interval is adjacent to the previous
00148            * interval 'i-1' and the epsilon closure of the incoming
00149            * state is equal to the state set of the previous interval
00150            * just expand the previous interval.
00151            */
00152           if(!hasRawStateSet) closedAdd(e.targetState, rawStateSet);
00153           bool expanded = false;
00154           const int gapEnd = i == dEdges.end() ? e.end : min(i->first-1, e.end);
00155           if(i != dEdges.begin())
00156           {
00157             map<unsigned, pair<unsigned, set<int> > >::iterator j = i;
00158             --j;
00159             // We may now assume that e.begin > j->second.fist
00160             if(e.begin-1 == j->second.first &&
00161                rawStateSet == j->second.second)
00162             {
00163               j->second.first = gapEnd;
00164               expanded = true;
00165             }
00166           }
00167 
00168           if(!expanded)
00169             dEdges.insert(i, make_pair(e.begin, make_pair(gapEnd, rawStateSet)));
00170 
00171           // We may now assume that 'i-1' references a real interval
00172 
00173           /*
00174            * Stop if the remaining part of the incoming interval ends in
00175            * the gap without being adjacent to the next interval (the
00176            * interval referenced by 'i')?
00177            */
00178           if(i == dEdges.end() || e.end < i->first-1) break;
00179 
00180           // We may now assume that 'i' references a real interval
00181 
00182           /*
00183            * Merge intervals if the remaining part of the incoming
00184            * interval is adjacent to the next interval (the interval
00185            * referenced by 'i') and they have equal state sets?
00186            *
00187            * Regardless of state set equality if the remaining part of
00188            * the incoming interval is adjacent to the next interval
00189            * then stop.
00190            */
00191           map<unsigned, pair<unsigned, set<int> > >::iterator j = i;
00192           --j;
00193           if(e.end == i->first-1)
00194           {
00195             if(j->second.second == i->second.second)
00196             {
00197               j->second.first = i->second.first;
00198               dEdges.erase(i);
00199             }
00200             break;
00201           }
00202 
00203           /*
00204            * Shorten the incoming interval such that is starts where the
00205            * next existing interval starts (the interval referenced by
00206            * 'i').
00207            */
00208           e.begin = i->first;
00209         }
00210 
00211         ++i;
00212 
00213         /*
00214          * Conditions:
00215          *
00216          * - 'i' references an existing interval or the virtual interval
00217          * succeeding the last real interval. The virtual interval
00218          * conceptually starts at positive infinity.
00219          *
00220          * - The previous real interval exists.
00221          *
00222          * - The remaining part of the incoming interval starts where
00223          * the previous existing interval does.
00224          *
00225          * - The remaining part of the incoming interval is not empty.
00226          */
00227 
00228         // Handle overlap with next interval
00229         map<unsigned, pair<unsigned, set<int> > >::iterator j = i;
00230         --j;
00231 
00232         bool done = false;
00233         set<int> stateSet = j->second.second;
00234         if(closedAdd(e.targetState, stateSet))
00235         {
00236           if(e.end < j->second.first)
00237           {
00238             dEdges.insert(i, make_pair(e.end+1, j->second));
00239             --i;
00240             j->second.first = e.end;
00241             done = true;
00242           }
00243           else
00244           {
00245             if(e.end == j->second.first) done = true;
00246             else e.begin = j->second.first+1;
00247           }
00248           j->second.second = stateSet;
00249         }
00250         else
00251         {
00252           if(e.end <= j->second.first) done = true;
00253           else e.begin = j->second.first+1;
00254         }
00255 
00256         // Should we merge 'i-1' and 'i-2'?
00257         if(j!=dEdges.begin())
00258         {
00259           map<unsigned, pair<unsigned, set<int> > >::iterator k = j;
00260           --k;
00261 
00262           if(k->second.first == j->first-1 &&
00263              k->second.second == j->second.second)
00264           {
00265             k->second.first = j->second.first;
00266             dEdges.erase(j);
00267           }
00268         }
00269 
00270         if(done) break;
00271       }
00272     }
00273 
00274     int NFA::chooseFinalValue(const set<int> &stateSet) const
00275     {
00276       int rule = -1;
00277       for(set<int>::iterator i=stateSet.begin(); i!=stateSet.end(); ++i)
00278       {
00279         const int r = states[*i].finalValue;
00280         if(r > rule) rule = r;
00281       }
00282       return rule;
00283     }
00284 
00285     template<typename T>
00286     void set_include(const set<T> &right, set<T> &left)
00287     {
00288       typename set<T>::iterator i=left.begin();
00289       typename set<T>::const_iterator j=right.begin();
00290       while(j!=right.end())
00291       {
00292         if(i == left.end() || *j < *i) left.insert(i, *j++);
00293         else if(*j == *i) ++j;
00294         else ++i;
00295       }
00296     }
00297 
00326     void NFA::constructDFA(DFA *dfa, vector<set<int> > *stateSets, DFA::AnchorInfo *anchorInfo) const
00327     {
00328       if(!states.size()) return; // Handle the special null NFA
00329 
00330       // Maps NFA state sets to DFA state indices.
00331       map<set<int>, int> dStates;
00332 
00333       stack<pair<set<int>, int> > uncheckedDStates;
00334 
00335       // Add a DFA start state for each NFA start state.
00336       for(unsigned i=0; i<startStates.size(); ++i)
00337       {
00338         set<int> stateSet;
00339         closedAdd(startStates[i], stateSet);
00340         pair<map<set<int>, int>::iterator ,bool> r =
00341           dStates.insert(make_pair(stateSet, 0));
00342         if(r.second)
00343         {
00344           r.first->second = dfa->addStartState(chooseFinalValue(stateSet));
00345           uncheckedDStates.push(*r.first);
00346         }
00347       }
00348 
00349       while(!uncheckedDStates.empty())
00350       {
00351         const pair<set<int>, int> &dState = uncheckedDStates.top();
00352         map<unsigned, pair<unsigned, set<int> > > dEdges;
00353         for(set<int>::iterator i=dState.first.begin(); i!=dState.first.end(); ++i)
00354         {
00355           const State &s = states[*i];
00356           for(unsigned i=0; i<s.symbolEdges.size(); ++i)
00357             updateDfaEdgeRange(s.symbolEdges[i], dEdges);
00358         }
00359 
00360         // Anchor regret
00361         if(anchorInfo) do
00362         {
00363           // Check if there are any anchor symbols among the edges
00364           // leavinge the current state set.
00365           map<unsigned, pair<unsigned, set<int> > >::iterator i =
00366             dEdges.lower_bound(anchorInfo->start);
00367           if(i != dEdges.begin() && (i == dEdges.end() || i->first != anchorInfo->start)) --i;
00368           for(unsigned j=0; j<anchorInfo->numberOf; ++j)
00369           {
00370             unsigned anchor = anchorInfo->start + j;
00371             while(i != dEdges.end() && i->second.first < anchor) ++i;
00372             if(i == dEdges.end()) break;
00373             if(i->first <= anchor)
00374             {
00375               for(set<int>::iterator k=dState.first.begin(); k!=dState.first.end(); ++k)
00376                 updateDfaEdgeRange(Edge(anchor, anchor, *k), dEdges);
00377             }
00378           }
00379         }
00380         while(false);
00381 
00382         const int currentDFAState = dState.second;
00383         uncheckedDStates.pop();
00384         for(map<unsigned, pair<unsigned, set<int> > >::iterator i=dEdges.begin(); i!=dEdges.end(); ++i)
00385         {
00386           pair<map<set<int>, int>::iterator ,bool> r =
00387             dStates.insert(make_pair(i->second.second, 0));
00388           if(r.second)
00389           {
00390             r.first->second = dfa->addState(chooseFinalValue(r.first->first));
00391             uncheckedDStates.push(*r.first);
00392           }
00393           dfa->addEdgeRange(i->first, i->second.first, currentDFAState, r.first->second);
00394         }
00395       }
00396 
00397       if(stateSets)
00398       {
00399         stateSets->clear();
00400         stateSets->resize(dfa->states.size());
00401         for(map<set<int>, int>::iterator i=dStates.begin(); i!=dStates.end(); ++i)
00402           (*stateSets)[i->second] = i->first;
00403       }
00404     }
00405 
00406     NFA::NFA(basic_string<unsigned> s, int finalValue)
00407     {
00408       states.resize(s.size()+1);
00409       for(unsigned i=0; i<s.size(); ++i)
00410         states[i].symbolEdges.push_back(Edge(s[i], s[i], i+1));
00411       states.back().finalValue = finalValue;
00412       startStates.push_back(0);
00413     }
00414 
00415     NFA::NFA(unsigned begin, unsigned end, int finalValue)
00416     {
00417       if(begin > end) ARCHON_THROW1(ArgumentException, "Illegal range");
00418       states.resize(2);
00419       states.front().symbolEdges.push_back(Edge(begin, end, 1));
00420       states.back().finalValue = finalValue;
00421       startStates.push_back(0);
00422     }
00423 
00424     NFA::NFA(const vector<pair<unsigned, unsigned> > &ranges, int finalValue)
00425     {
00426       states.resize(2);
00427       for(unsigned i=0; i<ranges.size(); ++i)
00428       {
00429         const pair<unsigned, unsigned> &r = ranges[i];
00430         if(r.first > r.second) ARCHON_THROW1(ArgumentException, "Illegal range");
00431         states.front().symbolEdges.push_back(Edge(r.first, r.second, 1));
00432       }
00433       states.back().finalValue = finalValue;
00434       startStates.push_back(0);
00435     }
00436 
00437     void NFA::altern(const NFA &a)
00438     {
00439       if(!states.size()) // Handle special null NFA
00440       {
00441         *this = a;
00442         return;
00443       }
00444       if(!a.states.size()) return; // Handle special null NFA
00445       if(startStates.size()!=1 || a.startStates.size()!=1)
00446         ARCHON_THROW1(ArgumentException, "Wrong number of start states");
00447 
00448       const unsigned n = states.size();
00449       const unsigned m = n + a.states.size();
00450       states.resize(m + 1);
00451       for(unsigned i=0; i<a.states.size(); ++i)
00452       {
00453         State &s = states[n+i] = a.states[i];
00454         for(unsigned j=0; j<s.symbolEdges.size(); ++j)
00455           s.symbolEdges[j].targetState += n;
00456         for(unsigned j=0; j<s.epsilonEdges.size(); ++j)
00457           s.epsilonEdges[j] += n;
00458       }
00459       {
00460         State &s = states[m];
00461         s.epsilonEdges.push_back(startStates.front());
00462         s.epsilonEdges.push_back(n+a.startStates.front());
00463       }
00464       startStates.front() = m;
00465     }
00466 
00467     void NFA::concat(const NFA &a)
00468     {
00469       if(!states.size()) // Handle special null NFA
00470       {
00471         *this = a;
00472         return;
00473       }
00474       if(!a.states.size()) return; // Handle special null NFA
00475       if(a.startStates.size()!=1)
00476         ARCHON_THROW1(ArgumentException, "Wrong number of start states");
00477 
00478       const unsigned n = states.size();
00479       states.resize(n + a.states.size());
00480       const unsigned m = a.startStates.front()+n;
00481       for(unsigned i=0; i<n; ++i)
00482       {
00483         State &s = states[i];
00484         if(s.finalValue < 0) continue;
00485         s.finalValue = -1;
00486         s.epsilonEdges.push_back(m);
00487       }
00488       for(unsigned i=0; i<a.states.size(); ++i)
00489       {
00490         State &s = states[n+i] = a.states[i];
00491         for(unsigned j=0; j<s.symbolEdges.size(); ++j)
00492           s.symbolEdges[j].targetState += n;
00493         for(unsigned j=0; j<s.epsilonEdges.size(); ++j)
00494           s.epsilonEdges[j] += n;
00495       }
00496     }
00497 
00498     void NFA::repeat(bool includeZero, int finalValue)
00499     {
00500       if(!states.size()) return; // Handle special null NFA
00501       if(startStates.size()!=1)
00502         ARCHON_THROW1(ArgumentException, "Wrong number of start states");
00503 
00504       if(includeZero)
00505       {
00506         const unsigned n = states.size();
00507         states.resize(n+1);
00508         for(unsigned i=0; i<n; ++i)
00509         {
00510           State &s = states[i];
00511           if(s.finalValue < 0) continue;
00512           s.finalValue = -1;
00513           s.epsilonEdges.push_back(n);
00514         }
00515         {
00516           State &s = states[n];
00517           s.epsilonEdges.push_back(startStates.front());
00518           s.finalValue = finalValue;
00519         }
00520         startStates.front() = n;
00521         return;
00522       }
00523 
00524       // One or more times:
00525       for(unsigned i=0; i<states.size(); ++i)
00526       {
00527         State &s = states[i];
00528         if(s.finalValue < 0) continue;
00529         s.epsilonEdges.push_back(startStates.front());
00530       }
00531     }
00532 
00533     void NFA::setFinalValue(int finalValue)
00534     {
00535       for(unsigned i=0; i<states.size(); ++i)
00536       {
00537         State &s = states[i];
00538         if(s.finalValue < 0) continue;
00539         s.finalValue = finalValue;
00540       }
00541     }
00542 
00543     NFA::NFA(const DFA &a)
00544     {
00545       a.makeNFA(this);
00546     }
00547 
00548     NFA::NFA(const Regex &r, int finalValue)
00549     {
00550       r.makeNfa(*this);
00551       setFinalValue(finalValue);
00552     }
00553 
00554     int NFA::addStartState(int finalValue)
00555     {
00556       startStates.push_back(states.size());
00557       return addState(finalValue);
00558     }
00559 
00560     int NFA::addState(int finalValue)
00561     {
00562       states.push_back(State(finalValue));
00563       return states.size()-1;
00564     }
00565 
00566     void NFA::addEdge(int originalState, int targetState)
00567     {
00568       states[originalState].epsilonEdges.push_back(targetState);
00569     }
00570 
00571     void NFA::addEdge(int originalState, int targetState,
00572                       unsigned begin, unsigned end)
00573     {
00574       if(begin > end) ARCHON_THROW1(ArgumentException, "Illegal range");
00575       states[originalState].symbolEdges.push_back(Edge(begin, end, targetState));
00576     }
00577 
00578     struct DefaultPrinter: NFA::Printer
00579     {
00580       string printSymbol(unsigned s) const
00581       {
00582         return Text::toString(s);
00583       }
00584     };
00585 
00586     const NFA::Printer &NFA::getDefaultPrinter()
00587     {
00588       static DefaultPrinter defaultPrinter;
00589       return defaultPrinter;
00590     }
00591 
00592     string NFA::print(const Printer &printer, int width) const
00593     {
00594       vector<double> columnWidthFractions;
00595       columnWidthFractions.push_back(0.1);
00596       columnWidthFractions.push_back(0.1);
00597       columnWidthFractions.push_back(0.8);
00598       Text::Table<string> table(states.size()+1, columnWidthFractions);
00599       table(0, 0) = "State";
00600       table(0, 1) = "Final";
00601       table(0, 2) = "NFA-transitions";
00602       for(unsigned i=0; i<states.size(); ++i)
00603       {
00604         const State &s = states[i];
00605         table(i+1, 0) = Text::toString(i);
00606         if(find(startStates.begin(), startStates.end(), static_cast<int>(i))!=startStates.end())
00607           table(i+1, 0) += "*";
00608         if(s.finalValue >= 0) table(i+1, 1) = Text::toString(s.finalValue);
00609         for(unsigned j=0; j<s.symbolEdges.size(); ++j)
00610         {
00611           if(table(i+1, 2).size()) table(i+1, 2) += ", ";
00612           const Edge &e = s.symbolEdges[j];
00613           table(i+1, 2) += "'" + printer.printSymbol(e.begin) + "'";
00614           if(e.begin < e.end)
00615             table(i+1, 2) += "-'" + printer.printSymbol(e.end) + "'";
00616           table(i+1, 2) += " -> " + Text::toString(e.targetState);
00617         }
00618 
00619         for(unsigned j=0; j<s.epsilonEdges.size(); ++j)
00620         {
00621           if(table(i+1, 2).size()) table(i+1, 2) += ", ";
00622           table(i+1, 2) += "-> " + Text::toString(s.epsilonEdges[j]);
00623         }
00624       }
00625 
00626       return Text::format(table, width, 2, true);      
00627     }
00628   }
00629 }