regex.C

00001 /*
00002  * This file is part of the "Archon" framework.
00003  * (http://files3d.sourceforge.net)
00004  *
00005  * Copyright © 2002 by Kristian Spangsege and Brian Kristiansen.
00006  *
00007  * Permission to use, copy, modify, and distribute this software and
00008  * its documentation under the terms of the GNU General Public License is
00009  * hereby granted. No representations are made about the suitability of
00010  * this software for any purpose. It is provided "as is" without express
00011  * or implied warranty. See the GNU General Public License
00012  * (http://www.gnu.org/copyleft/gpl.html) for more details.
00013  *
00014  * The characters in this file are ISO8859-1 encoded.
00015  *
00016  * The documentation in this file is in "Doxygen" style
00017  * (http://www.doxygen.org).
00018  */
00019 
00020 #include <iostream>
00021 #include <memory>
00022 #include <algorithm>
00023 #if !defined __GNUC__ || __GNUC__ > 2
00024 #include <limits>
00025 #else
00026 #include <limits.h>
00027 #endif
00028 
00029 #include <archon/util/unicode.H>
00030 #include <archon/util/nfa.H>
00031 #include <archon/util/cfg.H>
00032 #include <archon/util/lexer.H>
00033 #include <archon/util/lr_parser_simple.H>
00034 #include <archon/util/lr_parser_methods.H>
00035 
00036 #include <archon/util/regex.H>
00037 
00038 /*
00039  * Special codepoints used from the "Private Use Area" of the Unicode
00040  * character set.
00041  *
00042  * BOL(^)        0xE000
00043  * EOL($)        0xE001
00044  * BOW([[:<:]])  0xE002
00045  * EOW([[:>:]])  0xE003
00046  */
00047 
00048 using namespace std;
00049 
00050 namespace Archon
00051 {
00052   namespace Utilities
00053   {
00054     string Regex::Altern::print(int p) const
00055     {
00056       string s = e1->print(0) + "|" + e2->print(0);
00057       return p <= 0 ? s : "(" + s + ")";
00058     }
00059 
00060     string Regex::Juxta::print(int p) const
00061     {
00062       string s = e1->print(1) + e2->print(1);
00063       return p <= 1 ? s : "(" + s + ")";
00064     }
00065 
00066     string Regex::Repeat::print(int p) const
00067     {
00068       string s = e->print(2) +
00069         (min == 0 ? max == -1 ? "*" : max == 1 ? "?" : max == 0 ? "{0}" : "{0," + Text::toString(max) + "}" :
00070          min == 1 ? max == -1 ? "+" : max == 1 ? "{1}" : "{1," + Text::toString(max) + "}" :
00071          max == min ? "{" + Text::toString(min) + "}" : max == -1 ? "{" + Text::toString(min) + ",}" :
00072          "{" + Text::toString(min) + "," + Text::toString(max) + "}");
00073       return p <= 2 ? s : "(" + s + ")";
00074     }
00075 
00076     string Regex::String::print(int p) const
00077     {
00078       const int l = s.size();
00079       ustring t;
00080       t.reserve(l+l/5);
00081       static const string specialChars("|(){\\$?*+.^[");
00082       for(int i=0; i<l; ++i)
00083       {
00084         const uchar c = s[i];
00085         if(c<128 && specialChars.find(static_cast<char>(c))!=string::npos) t+='\\';
00086         t += c;
00087       }
00088       const string u = Text::toString(t);
00089       return p <= (l==0 ? -1 : l==1 ? 3 : 1) ? u : "(" + u + ")";
00090     }
00091 
00107     string Regex::Class::print(int) const
00108     {
00109       string s = "[";
00110       if(invert) s+= "^";
00111       bool lbrack = false;
00112       for(unsigned i=0; i < ranges.size(); ++i)
00113       {
00114         const pair<uchar, uchar> &r = ranges[i];
00115         string f = Text::toString(ustring(1, r.first));
00116         if(r.first=='^' && i==0 && !invert ||
00117            r.first==']' && i>0 ||
00118            r.first=='-' && i>0 && (r.second!='-' || i<ranges.size()-1) ||
00119            (r.first=='.' || r.first=='=' || r.first==':') && lbrack)
00120           f = "[." + f + ".]";
00121         if(r.first == r.second) s += f;
00122         else
00123         {
00124           string t = Text::toString(ustring(1, r.second));
00125           if(r.second==']') t = "[." + t + ".]";
00126           s += f + "-" + t;
00127         }
00128         lbrack = r.second=='[';
00129       }
00130       for(unsigned i=0; i <= name_xdigit; ++i)
00131         if(namedClasses[i])
00132         {
00133           const string n =
00134             i == name_alnum ? "alnum" :
00135             i == name_alpha ? "alpha" :
00136             i == name_blank ? "blank" :
00137             i == name_cntrl ? "cntrl" :
00138             i == name_digit ? "digit" :
00139             i == name_graph ? "graph" :
00140             i == name_lower ? "lower" :
00141             i == name_print ? "print" :
00142             i == name_punct ? "punct" :
00143             i == name_space ? "space" :
00144             i == name_upper ? "upper" : "xdigit";
00145           s += "[:" + n + ":]";
00146         }
00147       return s == "[^" ? "." : s + "]";
00148     }
00149 
00150     string Regex::LineBegin::print(int) const
00151     {
00152       return "^";
00153     }
00154 
00155     string Regex::LineEnd::print(int) const
00156     {
00157       return "$";
00158     }
00159 
00160     string Regex::WordBegin::print(int) const
00161     {
00162       return "[[:<:]]";
00163     }
00164 
00165     string Regex::WordEnd::print(int) const
00166     {
00167       return "[[:>:]]";
00168     }
00169 
00170     typedef RefObject<ustring> UAttr;
00171     typedef RefObject<pair<int, int> > IPAttr;
00172 
00173     struct Regex::ParserContext: LrParserBase::Context
00174     {
00175       LexerBase *lexer;
00176       const Environment *env;
00177       Logger *logger;
00178 
00179       Ref<Class> bracket;
00180 
00181       ParserContext(LexerBase *lexer, const Environment *env, Logger *logger):
00182         lexer(lexer), env(env), logger(logger) {}
00183 
00184       void warning(string s)
00185       {
00186         if(logger) logger->log(s);
00187         else ARCHON_THROW1(ArgumentException, s);
00188       }
00189 
00190       void parserError()
00191       {
00192         if(lexer->getType() < 0)
00193           warning("Syntax error. Unexpected end of string.");
00194         else
00195           warning("Syntax error. Unexpected character " +
00196                   Text::toString(Text::escapeNonprintable(lexer->getText())) +
00197                   ".");
00198       }
00199 
00200       Ref<const Exp> altern(const Ref<const Exp> &l,
00201                             const Ref<const Exp> &r)
00202       {
00203         if(!l) return r; // An already detected error
00204         if(!r) return l; // An already detected error
00205         return new Altern(l, r);
00206       }
00207 
00208       Ref<const Exp> empty() { return new String(); }
00209 
00210       Ref<const Exp> str(const Ref<const UAttr> &s)
00211       {
00212         return new String(s->value);
00213       }
00214 
00215       Ref<const Exp> any()        { return new Class(); }
00216       Ref<const Exp> line_begin() { return new LineBegin(); }
00217       Ref<const Exp> line_end()   { return new LineEnd(); }
00218 
00219       Ref<const Exp> word_begin()
00220       {
00221         bracket = 0;
00222         return new WordBegin();
00223       }
00224 
00225       Ref<const Exp> word_end()
00226       {
00227         bracket = 0;
00228         return new WordEnd();
00229       }
00230 
00231       Ref<const Exp> juxta(const Ref<const Exp> &l,
00232                            const Ref<const Exp> &r)
00233       {
00234         if(!l) return r; // An already detected error
00235         if(!r) return l; // An already detected error
00236         return new Juxta(l, r);
00237       }
00238 
00239       Ref<const Exp> repeat(const Ref<const Exp> &e,
00240                             const Ref<const IPAttr> &r)
00241       {
00242         if(!e || !r) return 0; // An already detected error
00243         return new Repeat(e, r->value.first, r->value.second);
00244       }
00245 
00246       Ref<const IPAttr> plus()
00247       {
00248         return new IPAttr(make_pair(1, -1));
00249       }
00250 
00251       Ref<const IPAttr> star()
00252       {
00253         return new IPAttr(make_pair(0, -1));
00254       }
00255 
00256       Ref<const IPAttr> option()
00257       {
00258         return new IPAttr(make_pair(0, 1));
00259       }
00260 
00261       static int parseInt(ustring s)
00262       {
00263         int v = 0;
00264         for(unsigned i=0; i<s.size(); ++i)
00265         {
00266           v *= 10;
00267           v += s[i] - '0';
00268         }
00269         return v;
00270       }
00271 
00272       Ref<const IPAttr> repeat_range(const Ref<const UAttr> &f,
00273                                      const Ref<const UAttr> &t)
00274       {
00275         if(!f) return 0; // An already detected error
00276         int a = parseInt(f->value);
00277         int b = t ? f==t ? a : parseInt(t->value) : -1;
00278         if(t && a > b)
00279         {
00280           warning("Illegal repeat range {" + Text::toString(a) + "," +
00281                   Text::toString(b) + "}: Lower bound is greater "
00282                   "than upper bound");
00283           swap(a, b);
00284         }
00285         return new IPAttr(make_pair(a, b));
00286       }
00287 
00288       void new_bracket()
00289       {
00290         bracket = new Class();
00291       }
00292 
00293       Ref<const Class> pos_bracket()
00294       {
00295         Ref<Class> b = bracket;
00296         bracket = 0;
00297         bool f = true;
00298         for(unsigned i=0; i<=Class::name_xdigit; ++i)
00299           if(b->namedClasses[i])
00300           {
00301             f = false;
00302             break;
00303           }
00304         if(b->ranges.size()==0 && f) return 0; // An already detected error
00305         b->invert = false;
00306         return b;
00307       }
00308 
00309       Ref<const Class> neg_bracket()
00310       {
00311         Ref<Class> b = bracket;
00312         bracket = 0;
00313         return b;
00314       }
00315 
00316       void named_class(const Ref<const UAttr> &n)
00317       {
00318         if(!n)
00319         {
00320           warning("Empty character class name");
00321           return;
00322         }
00323 
00324         const string s = Text::toString(n->value);
00325         const int i =
00326           s == "alnum" ? Class::name_alnum :
00327           s == "alpha" ? Class::name_alpha :
00328           s == "blank" ? Class::name_blank :
00329           s == "cntrl" ? Class::name_cntrl :
00330           s == "digit" ? Class::name_digit :
00331           s == "graph" ? Class::name_graph :
00332           s == "lower" ? Class::name_lower :
00333           s == "print" ? Class::name_print :
00334           s == "punct" ? Class::name_punct :
00335           s == "space" ? Class::name_space :
00336           s == "upper" ? Class::name_upper :
00337           s == "xdigit" ? Class::name_xdigit : -1;
00338         if(i==-1)
00339         {
00340           warning("Illegal character class [:" + s + ":]: Unknown name");
00341           return;
00342         }
00343         bracket->namedClasses[i] = true;
00344       }
00345 
00346       void bracket_range(const Ref<const UAttr> &f, const Ref<const UAttr> &t)
00347       {
00348         if(!f || !t) return; // An already detected error
00349         uchar u = f->value[0];
00350         uchar v = t->value[0];
00351         if(u>v)
00352         {
00353           warning("Illegal range " + Text::toString(ustring(1, u)) + "-" +
00354                   Text::toString(ustring(1, v)) + ": Lower endpoint is greater "
00355                   "than upper endpoint");
00356           swap(u, v);
00357         }
00358         bracket->ranges.push_back(make_pair(u, v));
00359       }
00360 
00361       Ref<const UAttr> collate(const Ref<const UAttr> &e)
00362       {
00363         const ustring s(e->value, 0, e->value.size()-1);
00364         if(s.size() == 0)
00365         {
00366           warning("Empty collation element");
00367           return 0;
00368         }
00369         if(s.size() == 1) return new UAttr(s);
00370         warning("Illegal collation element [." + Text::toString(s) +
00371                 ".]: Collation elements of more than one character are not "
00372                 "supported yet");
00373         return 0;
00374       }
00375 
00376       void equiv(const Ref<const UAttr> &e)
00377       {
00378         const ustring s(e->value, 0, e->value.size()-1);
00379         if(s.size() == 0)
00380         {
00381           warning("Empty collation element in equivalince class");
00382           return;
00383         }
00384         warning("Illegal equivalence class [=" + Text::toString(s) +
00385                 "=]: Equivalence classes are not supported yet");
00386       }
00387 
00388       Ref<const Exp> named_exp(const Ref<const UAttr> &n)
00389       {
00390         if(!n) return 0; // An already detected error
00391         const string s = Text::toString(n->value);
00392         if(!env)
00393         {
00394           warning("Illegal exression reference {" + s + "}: No environment");
00395           return 0;
00396         }
00397         map<string, Ref<const Exp> >::const_iterator i = env->m.find(s);
00398         if(i == env->m.end())
00399         {
00400           warning("Illegal expression reference {" + s + "}: Undefined name");
00401           return 0;
00402         }
00403         return i->second;
00404       }
00405     };
00406 
00407     struct Regex::Parser
00408     {
00409       struct Printer: LrParserBase::Printer
00410       {
00411         virtual string print(const RefAnyConst &a) const
00412         {
00413           if(!a) return "<null>";
00414           if(const UAttr *b = dynamic_cast<const UAttr *>(a.get()))
00415             return "\"" + Text::toString(b->value) + "\"";
00416           if(const IPAttr *b = dynamic_cast<const IPAttr *>(a.get()))
00417             return "(" + Text::toString(b->value.first) + "," +
00418               Text::toString(b->value.second) + ")";
00419           if(const Exp *b = dynamic_cast<const Exp *>(a.get()))
00420             return b->print();
00421           return "<unknown>";
00422         }
00423       };
00424 
00425       auto_ptr<LrParserBase::Actor<ParserContext> > actor;
00426       auto_ptr<Printer> printer;
00427       auto_ptr<SlrParser> lrParser;
00428       int terminalMap[128];
00429 
00430       Parser():
00431         actor(new LrParserBase::Actor<ParserContext>())
00432       {
00433         CFG g(actor.get());
00434 
00435         const int dollar   = g.defineTerminal("$");
00436         const int lpar     = g.defineTerminal("(");
00437         const int rpar     = g.defineTerminal(")");
00438         const int star     = g.defineTerminal("*");
00439         const int plus     = g.defineTerminal("+");
00440         const int comma    = g.defineTerminal(",");
00441         const int hyphen   = g.defineTerminal("-");
00442         const int flstop   = g.defineTerminal(".");
00443         const int colon    = g.defineTerminal(":");
00444         const int less     = g.defineTerminal("<");
00445         const int equal    = g.defineTerminal("=");
00446         const int greater  = g.defineTerminal(">");
00447         const int question = g.defineTerminal("?");
00448         const int lbrack   = g.defineTerminal("[");
00449         const int backsl   = g.defineTerminal("\\");
00450         const int rbrack   = g.defineTerminal("]");
00451         const int caret    = g.defineTerminal("^");
00452         const int undersc  = g.defineTerminal("_");
00453         const int lbrace   = g.defineTerminal("{");
00454         const int vbar     = g.defineTerminal("|");
00455         const int rbrace   = g.defineTerminal("}");
00456         const int ALPHA    = g.defineTerminal("ALPHA");
00457         const int DIGIT    = g.defineTerminal("DIGIT");
00458         const int OTHER    = g.defineTerminal("OTHER");
00459 
00460         const int exp            = g.defineNonterminal("exp");
00461         const int branch         = g.defineNonterminal("branch");
00462         const int atom           = g.defineNonterminal("atom");
00463         const int bracket        = g.defineNonterminal("bracket");
00464         const int br_pos         = g.defineNonterminal("br_pos");
00465         const int br_char_pos    = g.defineNonterminal("br_char_pos");
00466         const int br_char_common = g.defineNonterminal("br_char_common");
00467         const int common_char    = g.defineNonterminal("common_char");
00468         const int br_opt1        = g.defineNonterminal("br_opt1");
00469         const int br_start1      = g.defineNonterminal("br_start1");
00470         const int br_char1       = g.defineNonterminal("br_char1");
00471         const int br_opt2        = g.defineNonterminal("br_opt2");
00472         const int br_start2      = g.defineNonterminal("br_start2");
00473         const int br_char2       = g.defineNonterminal("br_char2");
00474         const int br_end         = g.defineNonterminal("br_end");
00475         const int br_char_end    = g.defineNonterminal("br_char_end");
00476         const int br_start3      = g.defineNonterminal("br_start3");
00477         const int br_start4      = g.defineNonterminal("br_start4");
00478         const int br_collate     = g.defineNonterminal("br_collate");
00479         const int br_collate1    = g.defineNonterminal("br_collate1");
00480         const int br_collate_ch  = g.defineNonterminal("br_collate_ch");
00481         const int br_collate2    = g.defineNonterminal("br_collate2");
00482         const int br_equiv       = g.defineNonterminal("br_equiv");
00483         const int br_equiv1      = g.defineNonterminal("br_equiv1");
00484         const int br_equiv_ch    = g.defineNonterminal("br_equiv_ch");
00485         const int br_equiv2      = g.defineNonterminal("br_equiv2");
00486         const int br_class       = g.defineNonterminal("br_class");
00487         const int id_char_seq    = g.defineNonterminal("id_char_seq");
00488         const int id_char        = g.defineNonterminal("id_char");
00489         const int id_char_init   = g.defineNonterminal("id_char_init");
00490         const int br_neg         = g.defineNonterminal("br_neg");
00491         const int br_char_neg    = g.defineNonterminal("br_char_neg");
00492         const int exp_name       = g.defineNonterminal("exp_name");
00493         const int any_char       = g.defineNonterminal("any_char");
00494         const int atom_char      = g.defineNonterminal("atom_char");
00495         const int branch2        = g.defineNonterminal("branch2");
00496         const int repeat         = g.defineNonterminal("repeat");
00497         const int repeat_range   = g.defineNonterminal("repeat_range");
00498         const int integer        = g.defineNonterminal("integer");
00499 
00500         const int a_altern        =
00501           actor->registerMethod("altern",        &ParserContext::altern);
00502         const int a_empty         =
00503           actor->registerMethod("empty",         &ParserContext::empty);
00504         const int a_string        =
00505           actor->registerMethod("string",        &ParserContext::str);
00506         const int a_any           =
00507           actor->registerMethod("any",           &ParserContext::any);
00508         const int a_line_begin    =
00509           actor->registerMethod("line_begin",    &ParserContext::line_begin);
00510         const int a_line_end      =
00511           actor->registerMethod("line_end",      &ParserContext::line_end);
00512         const int a_word_begin    =
00513           actor->registerMethod("word_begin",    &ParserContext::word_begin);
00514         const int a_word_end      =
00515           actor->registerMethod("word_end",      &ParserContext::word_end);
00516         const int a_juxta         =
00517           actor->registerMethod("juxta",         &ParserContext::juxta);
00518         const int a_repeat        =
00519           actor->registerMethod("repeat",        &ParserContext::repeat);
00520         const int a_option        =
00521           actor->registerMethod("option",        &ParserContext::option);
00522         const int a_star          =
00523           actor->registerMethod("star",          &ParserContext::star);
00524         const int a_plus          =
00525           actor->registerMethod("plus",          &ParserContext::plus);
00526         const int a_repeat_range  =
00527           actor->registerMethod("repeat_range",  &ParserContext::repeat_range);
00528         const int a_new_bracket   =
00529           actor->registerMethod("new_bracket",   &ParserContext::new_bracket);
00530         const int a_pos_bracket   =
00531           actor->registerMethod("pos_bracket",   &ParserContext::pos_bracket);
00532         const int a_neg_bracket   =
00533           actor->registerMethod("neg_bracket",   &ParserContext::neg_bracket);
00534         const int a_named_class   =
00535           actor->registerMethod("named_class",   &ParserContext::named_class);
00536         const int a_bracket_range =
00537           actor->registerMethod("bracket_range", &ParserContext::bracket_range);
00538         const int a_collate       =
00539           actor->registerMethod("collate",       &ParserContext::collate);
00540         const int a_equiv         =
00541           actor->registerMethod("equiv",         &ParserContext::equiv);
00542         const int a_named_exp     =
00543           actor->registerMethod("named_exp",     &ParserContext::named_exp);
00544 
00545         // exp
00546         g.addProd(exp, CFG::nont(branch));
00547         g.addProd(exp, CFG::nont(exp), CFG::term(vbar), CFG::nont(branch),
00548                   CFG::act(a_altern, 2, 0));
00549 
00550         // branch
00551         g.addProd(branch, CFG::nont(atom));
00552         g.addProd(branch, CFG::nont(atom), CFG::nont(branch2), CFG::copy(0));
00553 
00554         // atom
00555         g.addProd(atom, CFG::term(lpar), CFG::term(rpar), CFG::act(a_empty));
00556         g.addProd(atom, CFG::term(lpar), CFG::nont(exp),
00557                   CFG::term(rpar), CFG::copy(1));
00558         g.addProd(atom, CFG::term(lbrack), CFG::act(a_new_bracket),
00559                   CFG::nont(bracket), CFG::term(rbrack), CFG::copy(1));
00560         g.addProd(atom, CFG::term(lbrace), CFG::nont(exp_name), CFG::term(rbrace),
00561                   CFG::act(a_named_exp, 1));
00562         g.addProd(atom, CFG::term(backsl), CFG::nont(any_char),
00563                   CFG::act(a_string, 0));
00564         g.addProd(atom, CFG::term(flstop),    CFG::act(a_any));
00565         g.addProd(atom, CFG::term(caret),     CFG::act(a_line_begin));
00566         g.addProd(atom, CFG::term(dollar),    CFG::act(a_line_end));
00567         g.addProd(atom, CFG::nont(atom_char), CFG::act(a_string, 0));
00568 
00569         // bracket
00570         g.addProd(bracket, CFG::nont(br_pos), CFG::act(a_pos_bracket));
00571         g.addProd(bracket, CFG::term(caret), CFG::nont(br_neg),
00572                   CFG::act(a_neg_bracket));
00573         g.addProd(bracket, CFG::term(lbrack), CFG::term(colon), CFG::term(less),
00574                   CFG::term(colon), CFG::term(rbrack), CFG::act(a_word_begin));
00575         g.addProd(bracket, CFG::term(lbrack), CFG::term(colon), CFG::term(greater),
00576                   CFG::term(colon), CFG::term(rbrack), CFG::act(a_word_end));
00577 
00578         // br_pos
00579         g.addProd(br_pos, CFG::nont(br_char_pos), CFG::nont(br_opt1));
00580         g.addProd(br_pos, CFG::nont(br_opt2));
00581 
00582         // br_char_pos
00583         g.addProd(br_char_pos, CFG::nont(br_char_common));
00584         g.addProd(br_char_pos, CFG::term(rbrack));
00585         g.addProd(br_char_pos, CFG::term(hyphen));
00586         g.addProd(br_char_pos, CFG::term(flstop));
00587         g.addProd(br_char_pos, CFG::term(equal));
00588         g.addProd(br_char_pos, CFG::term(colon));
00589 
00590         // br_char_common
00591         g.addProd(br_char_common, CFG::nont(common_char));
00592         g.addProd(br_char_common, CFG::term(vbar));
00593         g.addProd(br_char_common, CFG::term(lpar));
00594         g.addProd(br_char_common, CFG::term(rpar));
00595         g.addProd(br_char_common, CFG::term(lbrace));
00596         g.addProd(br_char_common, CFG::term(backsl));
00597         g.addProd(br_char_common, CFG::term(dollar));
00598         g.addProd(br_char_common, CFG::term(question));
00599         g.addProd(br_char_common, CFG::term(star));
00600         g.addProd(br_char_common, CFG::term(plus));
00601 
00602         // common_char
00603         g.addProd(common_char, CFG::term(rbrace));
00604         g.addProd(common_char, CFG::term(undersc));
00605         g.addProd(common_char, CFG::term(comma));
00606         g.addProd(common_char, CFG::term(less));
00607         g.addProd(common_char, CFG::term(greater));
00608         g.addProd(common_char, CFG::term(DIGIT));
00609         g.addProd(common_char, CFG::term(ALPHA));
00610         g.addProd(common_char, CFG::term(OTHER));
00611 
00612         // br_opt1
00613         g.addProd(br_opt1, CFG::act(a_bracket_range, 0, 0), CFG::nont(br_start1));
00614         g.addProd(br_opt1, CFG::term(hyphen), CFG::nont(br_end));
00615 
00616         // br_start1
00617         g.addProd(br_start1);
00618         g.addProd(br_start1, CFG::nont(br_char1), CFG::nont(br_opt1));
00619         g.addProd(br_start1, CFG::nont(br_opt2));
00620 
00621         // br_char1
00622         g.addProd(br_char1, CFG::nont(br_char_common));
00623         g.addProd(br_char1, CFG::term(flstop));
00624         g.addProd(br_char1, CFG::term(equal));
00625         g.addProd(br_char1, CFG::term(colon));
00626         g.addProd(br_char1, CFG::term(caret));
00627 
00628         // br_opt2
00629         g.addProd(br_opt2, CFG::term(lbrack), CFG::act(a_bracket_range, 0, 0),
00630                   CFG::nont(br_start2));
00631         g.addProd(br_opt2, CFG::term(lbrack), CFG::term(hyphen),
00632                   CFG::nont(br_end));
00633         g.addProd(br_opt2, CFG::nont(br_collate), CFG::nont(br_opt1));
00634         g.addProd(br_opt2, CFG::nont(br_equiv), CFG::nont(br_start3));
00635         g.addProd(br_opt2, CFG::nont(br_class), CFG::nont(br_start3));
00636 
00637         // br_start2
00638         g.addProd(br_start2);
00639         g.addProd(br_start2, CFG::nont(br_char2), CFG::nont(br_opt1));
00640         g.addProd(br_start2, CFG::nont(br_opt2));
00641 
00642         // br_char2
00643         g.addProd(br_char2, CFG::nont(br_char_common));
00644         g.addProd(br_char2, CFG::term(caret));
00645 
00646         // br_end
00647         g.addProd(br_end, CFG::act(a_bracket_range, 1, 1),
00648                   CFG::act(a_bracket_range, 1, 1));
00649         g.addProd(br_end, CFG::nont(br_char_end), CFG::act(a_bracket_range, 2, 0),
00650                   CFG::nont(br_start3));
00651         g.addProd(br_end, CFG::term(lbrack), CFG::act(a_bracket_range, 2, 0),
00652                   CFG::nont(br_start4));
00653         g.addProd(br_end, CFG::nont(br_collate), CFG::act(a_bracket_range, 2, 0),
00654                   CFG::nont(br_start3));
00655 
00656         // br_char_end
00657         g.addProd(br_char_end, CFG::nont(br_char1));
00658         g.addProd(br_char_end, CFG::term(hyphen));
00659 
00660         // br_start3
00661         g.addProd(br_start3, CFG::term(hyphen), CFG::act(a_bracket_range, 0, 0));
00662         g.addProd(br_start3, CFG::nont(br_start1));
00663 
00664         // br_start4
00665         g.addProd(br_start4, CFG::term(hyphen), CFG::act(a_bracket_range, 0, 0));
00666         g.addProd(br_start4, CFG::nont(br_start2));
00667 
00668         // br_collate
00669         g.addProd(br_collate, CFG::term(lbrack), CFG::term(flstop),
00670                   CFG::nont(br_collate1), CFG::act(a_collate, 0));
00671 
00672         // br_collate1
00673         g.addProd(br_collate1, CFG::nont(br_collate_ch), CFG::nont(br_collate1),
00674                   CFG::concat(1, 0));
00675         g.addProd(br_collate1, CFG::term(flstop), CFG::nont(br_collate2),
00676                   CFG::concat(1, 0));
00677         g.addProd(br_collate1, CFG::term(rbrack),       CFG::nont(br_collate1),
00678                   CFG::concat(1, 0));
00679 
00680         // br_collate_ch
00681         g.addProd(br_collate_ch, CFG::nont(br_char_common));
00682         g.addProd(br_collate_ch, CFG::term(lbrack));
00683         g.addProd(br_collate_ch, CFG::term(hyphen));
00684         g.addProd(br_collate_ch, CFG::term(equal));
00685         g.addProd(br_collate_ch, CFG::term(colon));
00686         g.addProd(br_collate_ch, CFG::term(caret));
00687 
00688         // br_collate2
00689         g.addProd(br_collate2, CFG::nont(br_collate_ch), CFG::nont(br_collate1),
00690                   CFG::concat(1, 0));
00691         g.addProd(br_collate2, CFG::term(flstop), CFG::nont(br_collate2),
00692                   CFG::concat(1, 0));
00693         g.addProd(br_collate2, CFG::term(rbrack), CFG::null());
00694 
00695         // br_equiv
00696         g.addProd(br_equiv, CFG::term(lbrack), CFG::term(equal),
00697                   CFG::nont(br_equiv1), CFG::act(a_equiv, 0));
00698 
00699         // br_equiv1
00700         g.addProd(br_equiv1, CFG::nont(br_equiv_ch), CFG::nont(br_equiv1),
00701                   CFG::concat(1, 0));
00702         g.addProd(br_equiv1, CFG::term(equal), CFG::nont(br_equiv2),
00703                   CFG::concat(1, 0));
00704         g.addProd(br_equiv1, CFG::term(rbrack), CFG::nont(br_equiv1),
00705                   CFG::concat(1, 0));
00706 
00707         // br_equiv_ch
00708         g.addProd(br_equiv_ch, CFG::nont(br_char_common));
00709         g.addProd(br_equiv_ch, CFG::term(lbrack));
00710         g.addProd(br_equiv_ch, CFG::term(hyphen));
00711         g.addProd(br_equiv_ch, CFG::term(flstop));
00712         g.addProd(br_equiv_ch, CFG::term(colon));
00713         g.addProd(br_equiv_ch, CFG::term(caret));
00714 
00715         // br_equiv2
00716         g.addProd(br_equiv2, CFG::nont(br_equiv_ch), CFG::nont(br_equiv1),
00717                   CFG::concat(1, 0));
00718         g.addProd(br_equiv2, CFG::term(equal), CFG::nont(br_equiv2),
00719                   CFG::concat(1, 0));
00720         g.addProd(br_equiv2, CFG::term(rbrack), CFG::null());
00721 
00722         // br_class
00723         g.addProd(br_class, CFG::term(lbrack), CFG::term(colon),
00724                   CFG::nont(id_char_seq), CFG::term(colon), CFG::term(rbrack),
00725                   CFG::act(a_named_class, 2));
00726 
00727         // id_char_seq
00728         g.addProd(id_char_seq);
00729         g.addProd(id_char_seq, CFG::nont(id_char_seq), CFG::nont(id_char),
00730                   CFG::concat(1, 0));
00731 
00732         // id_char
00733         g.addProd(id_char, CFG::nont(id_char_init));
00734         g.addProd(id_char, CFG::term(DIGIT));
00735 
00736         // id_char_init
00737         g.addProd(id_char_init, CFG::term(undersc));
00738         g.addProd(id_char_init, CFG::term(ALPHA));
00739 
00740         // br_neg
00741         g.addProd(br_neg, CFG::nont(br_char_neg), CFG::nont(br_opt1));
00742         g.addProd(br_neg, CFG::nont(br_opt2));
00743 
00744         // br_char_neg
00745         g.addProd(br_char_neg, CFG::nont(br_char_pos));
00746         g.addProd(br_char_neg, CFG::term(caret));
00747 
00748         // exp_name
00749         g.addProd(exp_name, CFG::nont(id_char_init), CFG::nont(id_char_seq),
00750                   CFG::concat(1, 0));
00751 
00752         // any_char
00753         g.addProd(any_char, CFG::nont(br_char_neg));
00754         g.addProd(any_char, CFG::term(lbrack));
00755 
00756         // atom_char
00757         g.addProd(atom_char, CFG::nont(common_char));
00758         g.addProd(atom_char, CFG::term(rbrack));
00759         g.addProd(atom_char, CFG::term(hyphen));
00760         g.addProd(atom_char, CFG::term(colon));
00761         g.addProd(atom_char, CFG::term(equal));
00762 
00763         // branch2
00764         g.addProd(branch2, CFG::nont(branch), CFG::act(a_juxta, 1, 0));
00765         g.addProd(branch2, CFG::nont(repeat), CFG::act(a_repeat, 1, 0));
00766         g.addProd(branch2, CFG::nont(repeat), CFG::act(a_repeat, 1, 0),
00767                   CFG::nont(branch), CFG::act(a_juxta, 1, 0));
00768 
00769         // repeat
00770         g.addProd(repeat, CFG::term(question), CFG::act(a_option));
00771         g.addProd(repeat, CFG::term(star), CFG::act(a_star));
00772         g.addProd(repeat, CFG::term(plus), CFG::act(a_plus));
00773         g.addProd(repeat, CFG::term(lbrace), CFG::nont(repeat_range),
00774                   CFG::term(rbrace), CFG::copy(1));
00775 
00776         // repeat_range
00777         g.addProd(repeat_range, CFG::nont(integer),
00778                   CFG::act(a_repeat_range, 0, 0));
00779         g.addProd(repeat_range, CFG::nont(integer), CFG::term(comma),
00780                   CFG::act(a_repeat_range, 1, -1));
00781         g.addProd(repeat_range, CFG::nont(integer), CFG::term(comma),
00782                   CFG::nont(integer), CFG::act(a_repeat_range, 2, 0));
00783 
00784         // integer
00785         g.addProd(integer, CFG::nont(integer), CFG::term(DIGIT),
00786                   CFG::concat(1, 0));
00787         g.addProd(integer, CFG::term(DIGIT));
00788 
00789         printer = auto_ptr<Printer>(new Printer);
00790         lrParser = auto_ptr<SlrParser>(new SlrParser(g, actor.get(), printer.get()));
00791 
00792         for(int i=0; i<128; ++i) terminalMap[i] = OTHER;
00793         for(int i='0'; i<='9'; ++i) terminalMap[i] = DIGIT;
00794         for(int i='A'; i<='Z'; ++i) terminalMap[i] = ALPHA;
00795         for(int i='a'; i<='z'; ++i) terminalMap[i] = ALPHA;
00796 
00797         terminalMap[static_cast<int>('$')] = dollar;
00798         terminalMap[static_cast<int>('(')] = lpar;
00799         terminalMap[static_cast<int>(')')] = rpar;
00800         terminalMap[static_cast<int>('*')] = star;
00801         terminalMap[static_cast<int>('+')] = plus;
00802         terminalMap[static_cast<int>(',')] = comma;
00803         terminalMap[static_cast<int>('-')] = hyphen;
00804         terminalMap[static_cast<int>('.')] = flstop;
00805         terminalMap[static_cast<int>(':')] = colon;
00806         terminalMap[static_cast<int>('<')] = less;
00807         terminalMap[static_cast<int>('=')] = equal;
00808         terminalMap[static_cast<int>('>')] = greater;
00809         terminalMap[static_cast<int>('?')] = question;
00810         terminalMap[static_cast<int>('[')] = lbrack;
00811         terminalMap[static_cast<int>('\\')] = backsl;
00812         terminalMap[static_cast<int>(']')] = rbrack;
00813         terminalMap[static_cast<int>('^')] = caret;
00814         terminalMap[static_cast<int>('_')] = undersc;
00815         terminalMap[static_cast<int>('{')] = lbrace;
00816         terminalMap[static_cast<int>('|')] = vbar;
00817         terminalMap[static_cast<int>('}')] = rbrace;
00818       }
00819     };
00820 
00821     struct Regex::Lexer: LexerBase
00822     {
00823       ustring s;
00824       unsigned i;
00825       const Parser &p;
00826       ustring text;
00827       int type;
00828 
00829       Lexer(ustring s, const Parser &p): s(s), i(0), p(p) {}
00830 
00831       void getNext(LexerBase::Lexeme &l)
00832       {
00833         if(i == s.size())
00834         {
00835           l.type = type = -1;
00836           return;
00837         }
00838         uchar c = s[i++];
00839         text = ustring(1, c);
00840         l.type  = type = p.terminalMap[c < 128 ? c : 0];
00841         l.value = new RefObject<ustring>(text);
00842       }
00843 
00844       ustring getText() const
00845       {
00846         return text;
00847       }
00848 
00849       int getType() const
00850       {
00851         return type;
00852       }
00853     };
00854 
00855     const Regex::Parser &Regex::getParser()
00856     {
00857       static Parser p;
00858       return p;
00859     } 
00860 
00861     void Regex::parse(ustring s, Logger *logger, const Environment *e)
00862     {
00863       const Parser &parser = getParser();
00864       Lexer lexer(s, parser);
00865       ParserContext context(&lexer, e, logger);
00866       parser.lrParser->parse(lexer, context, exp, logger); // Comment in the logger for debugging
00867       if(!exp) ARCHON_THROW1(ArgumentException,
00868                              "Could not make any sense of input. "
00869                              "Problems were reported to the logger.");
00870     }
00871 
00872     Regex Regex::repeat(Regex r, int min, int max)
00873     {
00874       if(min < 0) ARCHON_THROW1(ArgumentException, "'min' was negative");
00875       if(max < min) ARCHON_THROW1(ArgumentException, "'max' was less than 'min'");
00876       return Regex(new Repeat(r.exp, min, max));
00877     }
00878 
00879     Regex Regex::repeat(Regex r, int n, bool orMore)
00880     {
00881       if(n < 0) ARCHON_THROW1(ArgumentException, "'n' was negative");
00882       return Regex(new Repeat(r.exp, n, orMore ? -1 : n));
00883     }
00884 
00885     Regex Regex::bracket(const vector<pair<uchar, uchar> > &ranges,
00886                          const vector<string> &namedClasses,
00887                          bool invert)
00888     {
00889       for(unsigned i=0; i<ranges.size(); ++i)
00890       {
00891         const pair<uchar, uchar> &p = ranges[i];
00892         if(p.second < p.first)
00893           ARCHON_THROW1(ArgumentException,
00894                         "A range has a first component greater that "
00895                         "its second component");
00896       }
00897       vector<bool> n(Class::name_xdigit+1);
00898       for(unsigned i=0; i<namedClasses.size(); ++i)
00899       {
00900         const string &s = namedClasses[i];
00901         const int j =
00902           s == "alnum" ? Class::name_alnum :
00903           s == "alpha" ? Class::name_alpha :
00904           s == "blank" ? Class::name_blank :
00905           s == "cntrl" ? Class::name_cntrl :
00906           s == "digit" ? Class::name_digit :
00907           s == "graph" ? Class::name_graph :
00908           s == "lower" ? Class::name_lower :
00909           s == "print" ? Class::name_print :
00910           s == "punct" ? Class::name_punct :
00911           s == "space" ? Class::name_space :
00912           s == "upper" ? Class::name_upper :
00913           s == "xdigit" ? Class::name_xdigit : -1;
00914         if(j==-1)
00915           ARCHON_THROW1(ArgumentException, "Unknown class name \"" + s + "\"");
00916         n[j] = true;
00917       }
00918       if(!invert && ranges.size() == 0 && namedClasses.size() == 0)
00919         ARCHON_THROW1(ArgumentException, "Empty positive bracket");
00920       return Regex(new Class(ranges, n, invert));
00921     }
00922 
00926     void Regex::makeNfa(NFA &a, const Exp *e, int context) const
00927     {
00928       if(const Altern *f = dynamic_cast<const Altern *>(e))
00929       {
00930         if(context != 2)
00931         {
00932           makeNfa(a, f->e1.get(), context);
00933           makeNfa(a, f->e2.get(), 1);
00934           return;
00935         }
00936 
00937         NFA b;
00938         makeNfa(b, f->e1.get(), 0);
00939         makeNfa(b, f->e2.get(), 1);
00940         a.concat(b);
00941         return;
00942       }
00943 
00944       if(const Juxta *f = dynamic_cast<const Juxta *>(e))
00945       {
00946         if(context != 1)
00947         {
00948           makeNfa(a, f->e1.get(), context);
00949           makeNfa(a, f->e2.get(), 2);
00950           return;
00951         }
00952 
00953         NFA b;
00954         makeNfa(b, f->e1.get(), 0);
00955         makeNfa(b, f->e2.get(), 2);
00956         a.altern(b);
00957         return;
00958       }
00959 
00960       if(const Repeat *f = dynamic_cast<const Repeat *>(e))
00961       {
00962         //  x{n}    =  xx{n-1}      or
00963         //  x{n,}   =  xx{n-1,}     or
00964         //  x{n,m}  =  xx{n-1,m-1}  where 0 < n <= m
00965         if(f->min > 0)
00966         {
00967           Ref<Juxta> g =
00968             new Juxta(f->e, new Repeat(f->e, f->min-1, f->max == -1 ? -1 : f->max-1));
00969           makeNfa(a, g.get(), context);
00970           return;
00971         }
00972 
00973         //  x{0,n}  =  ()|xx{0,n-1}  where 0 < n
00974         if(f->max > 0)
00975         {
00976           Ref<Altern> g =
00977             new Altern(new String(), new Juxta(f->e, new Repeat(f->e, 0, f->max-1)));
00978           makeNfa(a, g.get(), context);
00979           return;
00980         }
00981 
00982         // Handle  x{0}
00983         if(f->max == 0) return;
00984 
00985 
00986         // Handle  x{0,}
00987 
00988         if(context == 0)
00989         {
00990           makeNfa(a, f->e.get(), 0);
00991           a.repeat(true);
00992           return;
00993         }
00994 
00995         NFA b;
00996         makeNfa(b, f->e.get(), 0);
00997         b.repeat(true);
00998         if(context == 1) a.altern(b);
00999         else a.concat(b);
01000         return;
01001       }
01002 
01003       if(const String *f = dynamic_cast<const String *>(e))
01004       {
01005         NFA b(f->s); // Well - its actually a coincidence that unsigned == uchar
01006         switch(context)
01007         {
01008         case 0: a = b;       break;
01009         case 1: a.altern(b); break;
01010         case 2: a.concat(b);  break;
01011         }
01012         return;
01013       }
01014 
01015       if(const Class *f = dynamic_cast<const Class *>(e))
01016       {
01017         for(unsigned i=0; i<f->namedClasses.size(); ++i)
01018           if(f->namedClasses[i])
01019             ARCHON_THROW1(ArgumentException, "Named classes supported yet");
01020 
01021         vector<pair<unsigned, unsigned> > ranges;
01022 
01023         if(f->invert)
01024         {
01025           ranges.push_back(make_pair(0u, 0xDFFFu));
01026 #if !defined __GNUC__ || __GNUC__ > 2
01027           ranges.push_back(make_pair(0xF900, numeric_limits<unsigned>::max()));
01028 #else
01029           ranges.push_back(make_pair(0xF900, UINT_MAX));
01030 #endif
01031           for(unsigned i=0; i<f->ranges.size(); ++i)
01032           {
01033             const pair<uchar, uchar> &r = f->ranges[i];
01034             unsigned j=0;
01035             while(j<ranges.size() && ranges[j].second < r.first) ++j;
01036             if(j == ranges.size()) continue;
01037             if(ranges[j].first < r.first)
01038             {
01039               if(ranges[j].second > r.second)
01040               {
01041                 ranges.insert(ranges.begin()+j+1, make_pair(r.second+1, ranges[j].second));
01042                 ranges[j].second = r.first-1;
01043                 continue;
01044               }
01045               ranges[j].second = r.first-1;
01046               ++j;
01047               if(j == ranges.size()) continue;
01048             }
01049             while(j<ranges.size() && ranges[j].second <= r.second)
01050               ranges.erase(ranges.begin()+j);
01051             if(j == ranges.size()) continue;
01052             if(ranges[j].first > r.second) continue;
01053             ranges[j].first = r.second+1;
01054           }
01055         }
01056         else
01057         {
01058           for(unsigned i=0; i<f->ranges.size(); ++i)
01059           {
01060             const pair<uchar, uchar> &r = f->ranges[i];
01061             ranges.push_back(make_pair(r.first, r.second));
01062           }
01063         }
01064 
01065         NFA b(ranges);
01066 
01067         switch(context)
01068         {
01069         case 0: a = b;       break;
01070         case 1: a.altern(b); break;
01071         case 2: a.concat(b);  break;
01072         }
01073         return;
01074       }
01075 
01076       if(dynamic_cast<const LineBegin *>(e))
01077       {
01078         NFA b(0xE000, 0xE000);
01079         switch(context)
01080         {
01081         case 0: a = b;       break;
01082         case 1: a.altern(b); break;
01083         case 2: a.concat(b);  break;
01084         }
01085         return;
01086       }
01087 
01088 
01089       if(dynamic_cast<const LineEnd *>(e))
01090       {
01091         NFA b(0xE001, 0xE001);
01092         switch(context)
01093         {
01094         case 0: a = b;       break;
01095         case 1: a.altern(b); break;
01096         case 2: a.concat(b);  break;
01097         }
01098         return;
01099       }
01100 
01101       if(dynamic_cast<const WordBegin *>(e))
01102       {
01103         NFA b(0xE002, 0xE002);
01104         switch(context)
01105         {
01106         case 0: a = b;       break;
01107         case 1: a.altern(b); break;
01108         case 2: a.concat(b);  break;
01109         }
01110         return;
01111       }
01112 
01113 
01114       if(dynamic_cast<const WordEnd *>(e))
01115       {
01116         NFA b(0xE003, 0xE003);
01117         switch(context)
01118         {
01119         case 0: a = b;       break;
01120         case 1: a.altern(b); break;
01121         case 2: a.concat(b);  break;
01122         }
01123         return;
01124       }
01125 
01126       ARCHON_THROW1(ArgumentException, "Unsupported elements in Regex");
01127     }
01128 
01129     void Regex::makeNfa(NFA &a) const
01130     {
01131       makeNfa(a, exp.get(), 0);
01132     }
01133   }
01134 }
01135 

Generated on Sun Jul 30 22:55:45 2006 for Archon by  doxygen 1.4.4