00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020 #include <iostream>
00021 #include <memory>
00022 #include <algorithm>
00023 #if !defined __GNUC__ || __GNUC__ > 2
00024 #include <limits>
00025 #else
00026 #include <limits.h>
00027 #endif
00028
00029 #include <archon/util/unicode.H>
00030 #include <archon/util/nfa.H>
00031 #include <archon/util/cfg.H>
00032 #include <archon/util/lexer.H>
00033 #include <archon/util/lr_parser_simple.H>
00034 #include <archon/util/lr_parser_methods.H>
00035
00036 #include <archon/util/regex.H>
00037
00038
00039
00040
00041
00042
00043
00044
00045
00046
00047
00048 using namespace std;
00049
00050 namespace Archon
00051 {
00052 namespace Utilities
00053 {
00054 string Regex::Altern::print(int p) const
00055 {
00056 string s = e1->print(0) + "|" + e2->print(0);
00057 return p <= 0 ? s : "(" + s + ")";
00058 }
00059
00060 string Regex::Juxta::print(int p) const
00061 {
00062 string s = e1->print(1) + e2->print(1);
00063 return p <= 1 ? s : "(" + s + ")";
00064 }
00065
00066 string Regex::Repeat::print(int p) const
00067 {
00068 string s = e->print(2) +
00069 (min == 0 ? max == -1 ? "*" : max == 1 ? "?" : max == 0 ? "{0}" : "{0," + Text::toString(max) + "}" :
00070 min == 1 ? max == -1 ? "+" : max == 1 ? "{1}" : "{1," + Text::toString(max) + "}" :
00071 max == min ? "{" + Text::toString(min) + "}" : max == -1 ? "{" + Text::toString(min) + ",}" :
00072 "{" + Text::toString(min) + "," + Text::toString(max) + "}");
00073 return p <= 2 ? s : "(" + s + ")";
00074 }
00075
00076 string Regex::String::print(int p) const
00077 {
00078 const int l = s.size();
00079 ustring t;
00080 t.reserve(l+l/5);
00081 static const string specialChars("|(){\\$?*+.^[");
00082 for(int i=0; i<l; ++i)
00083 {
00084 const uchar c = s[i];
00085 if(c<128 && specialChars.find(static_cast<char>(c))!=string::npos) t+='\\';
00086 t += c;
00087 }
00088 const string u = Text::toString(t);
00089 return p <= (l==0 ? -1 : l==1 ? 3 : 1) ? u : "(" + u + ")";
00090 }
00091
00107 string Regex::Class::print(int) const
00108 {
00109 string s = "[";
00110 if(invert) s+= "^";
00111 bool lbrack = false;
00112 for(unsigned i=0; i < ranges.size(); ++i)
00113 {
00114 const pair<uchar, uchar> &r = ranges[i];
00115 string f = Text::toString(ustring(1, r.first));
00116 if(r.first=='^' && i==0 && !invert ||
00117 r.first==']' && i>0 ||
00118 r.first=='-' && i>0 && (r.second!='-' || i<ranges.size()-1) ||
00119 (r.first=='.' || r.first=='=' || r.first==':') && lbrack)
00120 f = "[." + f + ".]";
00121 if(r.first == r.second) s += f;
00122 else
00123 {
00124 string t = Text::toString(ustring(1, r.second));
00125 if(r.second==']') t = "[." + t + ".]";
00126 s += f + "-" + t;
00127 }
00128 lbrack = r.second=='[';
00129 }
00130 for(unsigned i=0; i <= name_xdigit; ++i)
00131 if(namedClasses[i])
00132 {
00133 const string n =
00134 i == name_alnum ? "alnum" :
00135 i == name_alpha ? "alpha" :
00136 i == name_blank ? "blank" :
00137 i == name_cntrl ? "cntrl" :
00138 i == name_digit ? "digit" :
00139 i == name_graph ? "graph" :
00140 i == name_lower ? "lower" :
00141 i == name_print ? "print" :
00142 i == name_punct ? "punct" :
00143 i == name_space ? "space" :
00144 i == name_upper ? "upper" : "xdigit";
00145 s += "[:" + n + ":]";
00146 }
00147 return s == "[^" ? "." : s + "]";
00148 }
00149
00150 string Regex::LineBegin::print(int) const
00151 {
00152 return "^";
00153 }
00154
00155 string Regex::LineEnd::print(int) const
00156 {
00157 return "$";
00158 }
00159
00160 string Regex::WordBegin::print(int) const
00161 {
00162 return "[[:<:]]";
00163 }
00164
00165 string Regex::WordEnd::print(int) const
00166 {
00167 return "[[:>:]]";
00168 }
00169
00170 typedef RefObject<ustring> UAttr;
00171 typedef RefObject<pair<int, int> > IPAttr;
00172
00173 struct Regex::ParserContext: LrParserBase::Context
00174 {
00175 LexerBase *lexer;
00176 const Environment *env;
00177 Logger *logger;
00178
00179 Ref<Class> bracket;
00180
00181 ParserContext(LexerBase *lexer, const Environment *env, Logger *logger):
00182 lexer(lexer), env(env), logger(logger) {}
00183
00184 void warning(string s)
00185 {
00186 if(logger) logger->log(s);
00187 else ARCHON_THROW1(ArgumentException, s);
00188 }
00189
00190 void parserError()
00191 {
00192 if(lexer->getType() < 0)
00193 warning("Syntax error. Unexpected end of string.");
00194 else
00195 warning("Syntax error. Unexpected character " +
00196 Text::toString(Text::escapeNonprintable(lexer->getText())) +
00197 ".");
00198 }
00199
00200 Ref<const Exp> altern(const Ref<const Exp> &l,
00201 const Ref<const Exp> &r)
00202 {
00203 if(!l) return r;
00204 if(!r) return l;
00205 return new Altern(l, r);
00206 }
00207
00208 Ref<const Exp> empty() { return new String(); }
00209
00210 Ref<const Exp> str(const Ref<const UAttr> &s)
00211 {
00212 return new String(s->value);
00213 }
00214
00215 Ref<const Exp> any() { return new Class(); }
00216 Ref<const Exp> line_begin() { return new LineBegin(); }
00217 Ref<const Exp> line_end() { return new LineEnd(); }
00218
00219 Ref<const Exp> word_begin()
00220 {
00221 bracket = 0;
00222 return new WordBegin();
00223 }
00224
00225 Ref<const Exp> word_end()
00226 {
00227 bracket = 0;
00228 return new WordEnd();
00229 }
00230
00231 Ref<const Exp> juxta(const Ref<const Exp> &l,
00232 const Ref<const Exp> &r)
00233 {
00234 if(!l) return r;
00235 if(!r) return l;
00236 return new Juxta(l, r);
00237 }
00238
00239 Ref<const Exp> repeat(const Ref<const Exp> &e,
00240 const Ref<const IPAttr> &r)
00241 {
00242 if(!e || !r) return 0;
00243 return new Repeat(e, r->value.first, r->value.second);
00244 }
00245
00246 Ref<const IPAttr> plus()
00247 {
00248 return new IPAttr(make_pair(1, -1));
00249 }
00250
00251 Ref<const IPAttr> star()
00252 {
00253 return new IPAttr(make_pair(0, -1));
00254 }
00255
00256 Ref<const IPAttr> option()
00257 {
00258 return new IPAttr(make_pair(0, 1));
00259 }
00260
00261 static int parseInt(ustring s)
00262 {
00263 int v = 0;
00264 for(unsigned i=0; i<s.size(); ++i)
00265 {
00266 v *= 10;
00267 v += s[i] - '0';
00268 }
00269 return v;
00270 }
00271
00272 Ref<const IPAttr> repeat_range(const Ref<const UAttr> &f,
00273 const Ref<const UAttr> &t)
00274 {
00275 if(!f) return 0;
00276 int a = parseInt(f->value);
00277 int b = t ? f==t ? a : parseInt(t->value) : -1;
00278 if(t && a > b)
00279 {
00280 warning("Illegal repeat range {" + Text::toString(a) + "," +
00281 Text::toString(b) + "}: Lower bound is greater "
00282 "than upper bound");
00283 swap(a, b);
00284 }
00285 return new IPAttr(make_pair(a, b));
00286 }
00287
00288 void new_bracket()
00289 {
00290 bracket = new Class();
00291 }
00292
00293 Ref<const Class> pos_bracket()
00294 {
00295 Ref<Class> b = bracket;
00296 bracket = 0;
00297 bool f = true;
00298 for(unsigned i=0; i<=Class::name_xdigit; ++i)
00299 if(b->namedClasses[i])
00300 {
00301 f = false;
00302 break;
00303 }
00304 if(b->ranges.size()==0 && f) return 0;
00305 b->invert = false;
00306 return b;
00307 }
00308
00309 Ref<const Class> neg_bracket()
00310 {
00311 Ref<Class> b = bracket;
00312 bracket = 0;
00313 return b;
00314 }
00315
00316 void named_class(const Ref<const UAttr> &n)
00317 {
00318 if(!n)
00319 {
00320 warning("Empty character class name");
00321 return;
00322 }
00323
00324 const string s = Text::toString(n->value);
00325 const int i =
00326 s == "alnum" ? Class::name_alnum :
00327 s == "alpha" ? Class::name_alpha :
00328 s == "blank" ? Class::name_blank :
00329 s == "cntrl" ? Class::name_cntrl :
00330 s == "digit" ? Class::name_digit :
00331 s == "graph" ? Class::name_graph :
00332 s == "lower" ? Class::name_lower :
00333 s == "print" ? Class::name_print :
00334 s == "punct" ? Class::name_punct :
00335 s == "space" ? Class::name_space :
00336 s == "upper" ? Class::name_upper :
00337 s == "xdigit" ? Class::name_xdigit : -1;
00338 if(i==-1)
00339 {
00340 warning("Illegal character class [:" + s + ":]: Unknown name");
00341 return;
00342 }
00343 bracket->namedClasses[i] = true;
00344 }
00345
00346 void bracket_range(const Ref<const UAttr> &f, const Ref<const UAttr> &t)
00347 {
00348 if(!f || !t) return;
00349 uchar u = f->value[0];
00350 uchar v = t->value[0];
00351 if(u>v)
00352 {
00353 warning("Illegal range " + Text::toString(ustring(1, u)) + "-" +
00354 Text::toString(ustring(1, v)) + ": Lower endpoint is greater "
00355 "than upper endpoint");
00356 swap(u, v);
00357 }
00358 bracket->ranges.push_back(make_pair(u, v));
00359 }
00360
00361 Ref<const UAttr> collate(const Ref<const UAttr> &e)
00362 {
00363 const ustring s(e->value, 0, e->value.size()-1);
00364 if(s.size() == 0)
00365 {
00366 warning("Empty collation element");
00367 return 0;
00368 }
00369 if(s.size() == 1) return new UAttr(s);
00370 warning("Illegal collation element [." + Text::toString(s) +
00371 ".]: Collation elements of more than one character are not "
00372 "supported yet");
00373 return 0;
00374 }
00375
00376 void equiv(const Ref<const UAttr> &e)
00377 {
00378 const ustring s(e->value, 0, e->value.size()-1);
00379 if(s.size() == 0)
00380 {
00381 warning("Empty collation element in equivalince class");
00382 return;
00383 }
00384 warning("Illegal equivalence class [=" + Text::toString(s) +
00385 "=]: Equivalence classes are not supported yet");
00386 }
00387
00388 Ref<const Exp> named_exp(const Ref<const UAttr> &n)
00389 {
00390 if(!n) return 0;
00391 const string s = Text::toString(n->value);
00392 if(!env)
00393 {
00394 warning("Illegal exression reference {" + s + "}: No environment");
00395 return 0;
00396 }
00397 map<string, Ref<const Exp> >::const_iterator i = env->m.find(s);
00398 if(i == env->m.end())
00399 {
00400 warning("Illegal expression reference {" + s + "}: Undefined name");
00401 return 0;
00402 }
00403 return i->second;
00404 }
00405 };
00406
00407 struct Regex::Parser
00408 {
00409 struct Printer: LrParserBase::Printer
00410 {
00411 virtual string print(const RefAnyConst &a) const
00412 {
00413 if(!a) return "<null>";
00414 if(const UAttr *b = dynamic_cast<const UAttr *>(a.get()))
00415 return "\"" + Text::toString(b->value) + "\"";
00416 if(const IPAttr *b = dynamic_cast<const IPAttr *>(a.get()))
00417 return "(" + Text::toString(b->value.first) + "," +
00418 Text::toString(b->value.second) + ")";
00419 if(const Exp *b = dynamic_cast<const Exp *>(a.get()))
00420 return b->print();
00421 return "<unknown>";
00422 }
00423 };
00424
00425 auto_ptr<LrParserBase::Actor<ParserContext> > actor;
00426 auto_ptr<Printer> printer;
00427 auto_ptr<SlrParser> lrParser;
00428 int terminalMap[128];
00429
00430 Parser():
00431 actor(new LrParserBase::Actor<ParserContext>())
00432 {
00433 CFG g(actor.get());
00434
00435 const int dollar = g.defineTerminal("$");
00436 const int lpar = g.defineTerminal("(");
00437 const int rpar = g.defineTerminal(")");
00438 const int star = g.defineTerminal("*");
00439 const int plus = g.defineTerminal("+");
00440 const int comma = g.defineTerminal(",");
00441 const int hyphen = g.defineTerminal("-");
00442 const int flstop = g.defineTerminal(".");
00443 const int colon = g.defineTerminal(":");
00444 const int less = g.defineTerminal("<");
00445 const int equal = g.defineTerminal("=");
00446 const int greater = g.defineTerminal(">");
00447 const int question = g.defineTerminal("?");
00448 const int lbrack = g.defineTerminal("[");
00449 const int backsl = g.defineTerminal("\\");
00450 const int rbrack = g.defineTerminal("]");
00451 const int caret = g.defineTerminal("^");
00452 const int undersc = g.defineTerminal("_");
00453 const int lbrace = g.defineTerminal("{");
00454 const int vbar = g.defineTerminal("|");
00455 const int rbrace = g.defineTerminal("}");
00456 const int ALPHA = g.defineTerminal("ALPHA");
00457 const int DIGIT = g.defineTerminal("DIGIT");
00458 const int OTHER = g.defineTerminal("OTHER");
00459
00460 const int exp = g.defineNonterminal("exp");
00461 const int branch = g.defineNonterminal("branch");
00462 const int atom = g.defineNonterminal("atom");
00463 const int bracket = g.defineNonterminal("bracket");
00464 const int br_pos = g.defineNonterminal("br_pos");
00465 const int br_char_pos = g.defineNonterminal("br_char_pos");
00466 const int br_char_common = g.defineNonterminal("br_char_common");
00467 const int common_char = g.defineNonterminal("common_char");
00468 const int br_opt1 = g.defineNonterminal("br_opt1");
00469 const int br_start1 = g.defineNonterminal("br_start1");
00470 const int br_char1 = g.defineNonterminal("br_char1");
00471 const int br_opt2 = g.defineNonterminal("br_opt2");
00472 const int br_start2 = g.defineNonterminal("br_start2");
00473 const int br_char2 = g.defineNonterminal("br_char2");
00474 const int br_end = g.defineNonterminal("br_end");
00475 const int br_char_end = g.defineNonterminal("br_char_end");
00476 const int br_start3 = g.defineNonterminal("br_start3");
00477 const int br_start4 = g.defineNonterminal("br_start4");
00478 const int br_collate = g.defineNonterminal("br_collate");
00479 const int br_collate1 = g.defineNonterminal("br_collate1");
00480 const int br_collate_ch = g.defineNonterminal("br_collate_ch");
00481 const int br_collate2 = g.defineNonterminal("br_collate2");
00482 const int br_equiv = g.defineNonterminal("br_equiv");
00483 const int br_equiv1 = g.defineNonterminal("br_equiv1");
00484 const int br_equiv_ch = g.defineNonterminal("br_equiv_ch");
00485 const int br_equiv2 = g.defineNonterminal("br_equiv2");
00486 const int br_class = g.defineNonterminal("br_class");
00487 const int id_char_seq = g.defineNonterminal("id_char_seq");
00488 const int id_char = g.defineNonterminal("id_char");
00489 const int id_char_init = g.defineNonterminal("id_char_init");
00490 const int br_neg = g.defineNonterminal("br_neg");
00491 const int br_char_neg = g.defineNonterminal("br_char_neg");
00492 const int exp_name = g.defineNonterminal("exp_name");
00493 const int any_char = g.defineNonterminal("any_char");
00494 const int atom_char = g.defineNonterminal("atom_char");
00495 const int branch2 = g.defineNonterminal("branch2");
00496 const int repeat = g.defineNonterminal("repeat");
00497 const int repeat_range = g.defineNonterminal("repeat_range");
00498 const int integer = g.defineNonterminal("integer");
00499
00500 const int a_altern =
00501 actor->registerMethod("altern", &ParserContext::altern);
00502 const int a_empty =
00503 actor->registerMethod("empty", &ParserContext::empty);
00504 const int a_string =
00505 actor->registerMethod("string", &ParserContext::str);
00506 const int a_any =
00507 actor->registerMethod("any", &ParserContext::any);
00508 const int a_line_begin =
00509 actor->registerMethod("line_begin", &ParserContext::line_begin);
00510 const int a_line_end =
00511 actor->registerMethod("line_end", &ParserContext::line_end);
00512 const int a_word_begin =
00513 actor->registerMethod("word_begin", &ParserContext::word_begin);
00514 const int a_word_end =
00515 actor->registerMethod("word_end", &ParserContext::word_end);
00516 const int a_juxta =
00517 actor->registerMethod("juxta", &ParserContext::juxta);
00518 const int a_repeat =
00519 actor->registerMethod("repeat", &ParserContext::repeat);
00520 const int a_option =
00521 actor->registerMethod("option", &ParserContext::option);
00522 const int a_star =
00523 actor->registerMethod("star", &ParserContext::star);
00524 const int a_plus =
00525 actor->registerMethod("plus", &ParserContext::plus);
00526 const int a_repeat_range =
00527 actor->registerMethod("repeat_range", &ParserContext::repeat_range);
00528 const int a_new_bracket =
00529 actor->registerMethod("new_bracket", &ParserContext::new_bracket);
00530 const int a_pos_bracket =
00531 actor->registerMethod("pos_bracket", &ParserContext::pos_bracket);
00532 const int a_neg_bracket =
00533 actor->registerMethod("neg_bracket", &ParserContext::neg_bracket);
00534 const int a_named_class =
00535 actor->registerMethod("named_class", &ParserContext::named_class);
00536 const int a_bracket_range =
00537 actor->registerMethod("bracket_range", &ParserContext::bracket_range);
00538 const int a_collate =
00539 actor->registerMethod("collate", &ParserContext::collate);
00540 const int a_equiv =
00541 actor->registerMethod("equiv", &ParserContext::equiv);
00542 const int a_named_exp =
00543 actor->registerMethod("named_exp", &ParserContext::named_exp);
00544
00545
00546 g.addProd(exp, CFG::nont(branch));
00547 g.addProd(exp, CFG::nont(exp), CFG::term(vbar), CFG::nont(branch),
00548 CFG::act(a_altern, 2, 0));
00549
00550
00551 g.addProd(branch, CFG::nont(atom));
00552 g.addProd(branch, CFG::nont(atom), CFG::nont(branch2), CFG::copy(0));
00553
00554
00555 g.addProd(atom, CFG::term(lpar), CFG::term(rpar), CFG::act(a_empty));
00556 g.addProd(atom, CFG::term(lpar), CFG::nont(exp),
00557 CFG::term(rpar), CFG::copy(1));
00558 g.addProd(atom, CFG::term(lbrack), CFG::act(a_new_bracket),
00559 CFG::nont(bracket), CFG::term(rbrack), CFG::copy(1));
00560 g.addProd(atom, CFG::term(lbrace), CFG::nont(exp_name), CFG::term(rbrace),
00561 CFG::act(a_named_exp, 1));
00562 g.addProd(atom, CFG::term(backsl), CFG::nont(any_char),
00563 CFG::act(a_string, 0));
00564 g.addProd(atom, CFG::term(flstop), CFG::act(a_any));
00565 g.addProd(atom, CFG::term(caret), CFG::act(a_line_begin));
00566 g.addProd(atom, CFG::term(dollar), CFG::act(a_line_end));
00567 g.addProd(atom, CFG::nont(atom_char), CFG::act(a_string, 0));
00568
00569
00570 g.addProd(bracket, CFG::nont(br_pos), CFG::act(a_pos_bracket));
00571 g.addProd(bracket, CFG::term(caret), CFG::nont(br_neg),
00572 CFG::act(a_neg_bracket));
00573 g.addProd(bracket, CFG::term(lbrack), CFG::term(colon), CFG::term(less),
00574 CFG::term(colon), CFG::term(rbrack), CFG::act(a_word_begin));
00575 g.addProd(bracket, CFG::term(lbrack), CFG::term(colon), CFG::term(greater),
00576 CFG::term(colon), CFG::term(rbrack), CFG::act(a_word_end));
00577
00578
00579 g.addProd(br_pos, CFG::nont(br_char_pos), CFG::nont(br_opt1));
00580 g.addProd(br_pos, CFG::nont(br_opt2));
00581
00582
00583 g.addProd(br_char_pos, CFG::nont(br_char_common));
00584 g.addProd(br_char_pos, CFG::term(rbrack));
00585 g.addProd(br_char_pos, CFG::term(hyphen));
00586 g.addProd(br_char_pos, CFG::term(flstop));
00587 g.addProd(br_char_pos, CFG::term(equal));
00588 g.addProd(br_char_pos, CFG::term(colon));
00589
00590
00591 g.addProd(br_char_common, CFG::nont(common_char));
00592 g.addProd(br_char_common, CFG::term(vbar));
00593 g.addProd(br_char_common, CFG::term(lpar));
00594 g.addProd(br_char_common, CFG::term(rpar));
00595 g.addProd(br_char_common, CFG::term(lbrace));
00596 g.addProd(br_char_common, CFG::term(backsl));
00597 g.addProd(br_char_common, CFG::term(dollar));
00598 g.addProd(br_char_common, CFG::term(question));
00599 g.addProd(br_char_common, CFG::term(star));
00600 g.addProd(br_char_common, CFG::term(plus));
00601
00602
00603 g.addProd(common_char, CFG::term(rbrace));
00604 g.addProd(common_char, CFG::term(undersc));
00605 g.addProd(common_char, CFG::term(comma));
00606 g.addProd(common_char, CFG::term(less));
00607 g.addProd(common_char, CFG::term(greater));
00608 g.addProd(common_char, CFG::term(DIGIT));
00609 g.addProd(common_char, CFG::term(ALPHA));
00610 g.addProd(common_char, CFG::term(OTHER));
00611
00612
00613 g.addProd(br_opt1, CFG::act(a_bracket_range, 0, 0), CFG::nont(br_start1));
00614 g.addProd(br_opt1, CFG::term(hyphen), CFG::nont(br_end));
00615
00616
00617 g.addProd(br_start1);
00618 g.addProd(br_start1, CFG::nont(br_char1), CFG::nont(br_opt1));
00619 g.addProd(br_start1, CFG::nont(br_opt2));
00620
00621
00622 g.addProd(br_char1, CFG::nont(br_char_common));
00623 g.addProd(br_char1, CFG::term(flstop));
00624 g.addProd(br_char1, CFG::term(equal));
00625 g.addProd(br_char1, CFG::term(colon));
00626 g.addProd(br_char1, CFG::term(caret));
00627
00628
00629 g.addProd(br_opt2, CFG::term(lbrack), CFG::act(a_bracket_range, 0, 0),
00630 CFG::nont(br_start2));
00631 g.addProd(br_opt2, CFG::term(lbrack), CFG::term(hyphen),
00632 CFG::nont(br_end));
00633 g.addProd(br_opt2, CFG::nont(br_collate), CFG::nont(br_opt1));
00634 g.addProd(br_opt2, CFG::nont(br_equiv), CFG::nont(br_start3));
00635 g.addProd(br_opt2, CFG::nont(br_class), CFG::nont(br_start3));
00636
00637
00638 g.addProd(br_start2);
00639 g.addProd(br_start2, CFG::nont(br_char2), CFG::nont(br_opt1));
00640 g.addProd(br_start2, CFG::nont(br_opt2));
00641
00642
00643 g.addProd(br_char2, CFG::nont(br_char_common));
00644 g.addProd(br_char2, CFG::term(caret));
00645
00646
00647 g.addProd(br_end, CFG::act(a_bracket_range, 1, 1),
00648 CFG::act(a_bracket_range, 1, 1));
00649 g.addProd(br_end, CFG::nont(br_char_end), CFG::act(a_bracket_range, 2, 0),
00650 CFG::nont(br_start3));
00651 g.addProd(br_end, CFG::term(lbrack), CFG::act(a_bracket_range, 2, 0),
00652 CFG::nont(br_start4));
00653 g.addProd(br_end, CFG::nont(br_collate), CFG::act(a_bracket_range, 2, 0),
00654 CFG::nont(br_start3));
00655
00656
00657 g.addProd(br_char_end, CFG::nont(br_char1));
00658 g.addProd(br_char_end, CFG::term(hyphen));
00659
00660
00661 g.addProd(br_start3, CFG::term(hyphen), CFG::act(a_bracket_range, 0, 0));
00662 g.addProd(br_start3, CFG::nont(br_start1));
00663
00664
00665 g.addProd(br_start4, CFG::term(hyphen), CFG::act(a_bracket_range, 0, 0));
00666 g.addProd(br_start4, CFG::nont(br_start2));
00667
00668
00669 g.addProd(br_collate, CFG::term(lbrack), CFG::term(flstop),
00670 CFG::nont(br_collate1), CFG::act(a_collate, 0));
00671
00672
00673 g.addProd(br_collate1, CFG::nont(br_collate_ch), CFG::nont(br_collate1),
00674 CFG::concat(1, 0));
00675 g.addProd(br_collate1, CFG::term(flstop), CFG::nont(br_collate2),
00676 CFG::concat(1, 0));
00677 g.addProd(br_collate1, CFG::term(rbrack), CFG::nont(br_collate1),
00678 CFG::concat(1, 0));
00679
00680
00681 g.addProd(br_collate_ch, CFG::nont(br_char_common));
00682 g.addProd(br_collate_ch, CFG::term(lbrack));
00683 g.addProd(br_collate_ch, CFG::term(hyphen));
00684 g.addProd(br_collate_ch, CFG::term(equal));
00685 g.addProd(br_collate_ch, CFG::term(colon));
00686 g.addProd(br_collate_ch, CFG::term(caret));
00687
00688
00689 g.addProd(br_collate2, CFG::nont(br_collate_ch), CFG::nont(br_collate1),
00690 CFG::concat(1, 0));
00691 g.addProd(br_collate2, CFG::term(flstop), CFG::nont(br_collate2),
00692 CFG::concat(1, 0));
00693 g.addProd(br_collate2, CFG::term(rbrack), CFG::null());
00694
00695
00696 g.addProd(br_equiv, CFG::term(lbrack), CFG::term(equal),
00697 CFG::nont(br_equiv1), CFG::act(a_equiv, 0));
00698
00699
00700 g.addProd(br_equiv1, CFG::nont(br_equiv_ch), CFG::nont(br_equiv1),
00701 CFG::concat(1, 0));
00702 g.addProd(br_equiv1, CFG::term(equal), CFG::nont(br_equiv2),
00703 CFG::concat(1, 0));
00704 g.addProd(br_equiv1, CFG::term(rbrack), CFG::nont(br_equiv1),
00705 CFG::concat(1, 0));
00706
00707
00708 g.addProd(br_equiv_ch, CFG::nont(br_char_common));
00709 g.addProd(br_equiv_ch, CFG::term(lbrack));
00710 g.addProd(br_equiv_ch, CFG::term(hyphen));
00711 g.addProd(br_equiv_ch, CFG::term(flstop));
00712 g.addProd(br_equiv_ch, CFG::term(colon));
00713 g.addProd(br_equiv_ch, CFG::term(caret));
00714
00715
00716 g.addProd(br_equiv2, CFG::nont(br_equiv_ch), CFG::nont(br_equiv1),
00717 CFG::concat(1, 0));
00718 g.addProd(br_equiv2, CFG::term(equal), CFG::nont(br_equiv2),
00719 CFG::concat(1, 0));
00720 g.addProd(br_equiv2, CFG::term(rbrack), CFG::null());
00721
00722
00723 g.addProd(br_class, CFG::term(lbrack), CFG::term(colon),
00724 CFG::nont(id_char_seq), CFG::term(colon), CFG::term(rbrack),
00725 CFG::act(a_named_class, 2));
00726
00727
00728 g.addProd(id_char_seq);
00729 g.addProd(id_char_seq, CFG::nont(id_char_seq), CFG::nont(id_char),
00730 CFG::concat(1, 0));
00731
00732
00733 g.addProd(id_char, CFG::nont(id_char_init));
00734 g.addProd(id_char, CFG::term(DIGIT));
00735
00736
00737 g.addProd(id_char_init, CFG::term(undersc));
00738 g.addProd(id_char_init, CFG::term(ALPHA));
00739
00740
00741 g.addProd(br_neg, CFG::nont(br_char_neg), CFG::nont(br_opt1));
00742 g.addProd(br_neg, CFG::nont(br_opt2));
00743
00744
00745 g.addProd(br_char_neg, CFG::nont(br_char_pos));
00746 g.addProd(br_char_neg, CFG::term(caret));
00747
00748
00749 g.addProd(exp_name, CFG::nont(id_char_init), CFG::nont(id_char_seq),
00750 CFG::concat(1, 0));
00751
00752
00753 g.addProd(any_char, CFG::nont(br_char_neg));
00754 g.addProd(any_char, CFG::term(lbrack));
00755
00756
00757 g.addProd(atom_char, CFG::nont(common_char));
00758 g.addProd(atom_char, CFG::term(rbrack));
00759 g.addProd(atom_char, CFG::term(hyphen));
00760 g.addProd(atom_char, CFG::term(colon));
00761 g.addProd(atom_char, CFG::term(equal));
00762
00763
00764 g.addProd(branch2, CFG::nont(branch), CFG::act(a_juxta, 1, 0));
00765 g.addProd(branch2, CFG::nont(repeat), CFG::act(a_repeat, 1, 0));
00766 g.addProd(branch2, CFG::nont(repeat), CFG::act(a_repeat, 1, 0),
00767 CFG::nont(branch), CFG::act(a_juxta, 1, 0));
00768
00769
00770 g.addProd(repeat, CFG::term(question), CFG::act(a_option));
00771 g.addProd(repeat, CFG::term(star), CFG::act(a_star));
00772 g.addProd(repeat, CFG::term(plus), CFG::act(a_plus));
00773 g.addProd(repeat, CFG::term(lbrace), CFG::nont(repeat_range),
00774 CFG::term(rbrace), CFG::copy(1));
00775
00776
00777 g.addProd(repeat_range, CFG::nont(integer),
00778 CFG::act(a_repeat_range, 0, 0));
00779 g.addProd(repeat_range, CFG::nont(integer), CFG::term(comma),
00780 CFG::act(a_repeat_range, 1, -1));
00781 g.addProd(repeat_range, CFG::nont(integer), CFG::term(comma),
00782 CFG::nont(integer), CFG::act(a_repeat_range, 2, 0));
00783
00784
00785 g.addProd(integer, CFG::nont(integer), CFG::term(DIGIT),
00786 CFG::concat(1, 0));
00787 g.addProd(integer, CFG::term(DIGIT));
00788
00789 printer = auto_ptr<Printer>(new Printer);
00790 lrParser = auto_ptr<SlrParser>(new SlrParser(g, actor.get(), printer.get()));
00791
00792 for(int i=0; i<128; ++i) terminalMap[i] = OTHER;
00793 for(int i='0'; i<='9'; ++i) terminalMap[i] = DIGIT;
00794 for(int i='A'; i<='Z'; ++i) terminalMap[i] = ALPHA;
00795 for(int i='a'; i<='z'; ++i) terminalMap[i] = ALPHA;
00796
00797 terminalMap[static_cast<int>('$')] = dollar;
00798 terminalMap[static_cast<int>('(')] = lpar;
00799 terminalMap[static_cast<int>(')')] = rpar;
00800 terminalMap[static_cast<int>('*')] = star;
00801 terminalMap[static_cast<int>('+')] = plus;
00802 terminalMap[static_cast<int>(',')] = comma;
00803 terminalMap[static_cast<int>('-')] = hyphen;
00804 terminalMap[static_cast<int>('.')] = flstop;
00805 terminalMap[static_cast<int>(':')] = colon;
00806 terminalMap[static_cast<int>('<')] = less;
00807 terminalMap[static_cast<int>('=')] = equal;
00808 terminalMap[static_cast<int>('>')] = greater;
00809 terminalMap[static_cast<int>('?')] = question;
00810 terminalMap[static_cast<int>('[')] = lbrack;
00811 terminalMap[static_cast<int>('\\')] = backsl;
00812 terminalMap[static_cast<int>(']')] = rbrack;
00813 terminalMap[static_cast<int>('^')] = caret;
00814 terminalMap[static_cast<int>('_')] = undersc;
00815 terminalMap[static_cast<int>('{')] = lbrace;
00816 terminalMap[static_cast<int>('|')] = vbar;
00817 terminalMap[static_cast<int>('}')] = rbrace;
00818 }
00819 };
00820
00821 struct Regex::Lexer: LexerBase
00822 {
00823 ustring s;
00824 unsigned i;
00825 const Parser &p;
00826 ustring text;
00827 int type;
00828
00829 Lexer(ustring s, const Parser &p): s(s), i(0), p(p) {}
00830
00831 void getNext(LexerBase::Lexeme &l)
00832 {
00833 if(i == s.size())
00834 {
00835 l.type = type = -1;
00836 return;
00837 }
00838 uchar c = s[i++];
00839 text = ustring(1, c);
00840 l.type = type = p.terminalMap[c < 128 ? c : 0];
00841 l.value = new RefObject<ustring>(text);
00842 }
00843
00844 ustring getText() const
00845 {
00846 return text;
00847 }
00848
00849 int getType() const
00850 {
00851 return type;
00852 }
00853 };
00854
00855 const Regex::Parser &Regex::getParser()
00856 {
00857 static Parser p;
00858 return p;
00859 }
00860
00861 void Regex::parse(ustring s, Logger *logger, const Environment *e)
00862 {
00863 const Parser &parser = getParser();
00864 Lexer lexer(s, parser);
00865 ParserContext context(&lexer, e, logger);
00866 parser.lrParser->parse(lexer, context, exp, logger);
00867 if(!exp) ARCHON_THROW1(ArgumentException,
00868 "Could not make any sense of input. "
00869 "Problems were reported to the logger.");
00870 }
00871
00872 Regex Regex::repeat(Regex r, int min, int max)
00873 {
00874 if(min < 0) ARCHON_THROW1(ArgumentException, "'min' was negative");
00875 if(max < min) ARCHON_THROW1(ArgumentException, "'max' was less than 'min'");
00876 return Regex(new Repeat(r.exp, min, max));
00877 }
00878
00879 Regex Regex::repeat(Regex r, int n, bool orMore)
00880 {
00881 if(n < 0) ARCHON_THROW1(ArgumentException, "'n' was negative");
00882 return Regex(new Repeat(r.exp, n, orMore ? -1 : n));
00883 }
00884
00885 Regex Regex::bracket(const vector<pair<uchar, uchar> > &ranges,
00886 const vector<string> &namedClasses,
00887 bool invert)
00888 {
00889 for(unsigned i=0; i<ranges.size(); ++i)
00890 {
00891 const pair<uchar, uchar> &p = ranges[i];
00892 if(p.second < p.first)
00893 ARCHON_THROW1(ArgumentException,
00894 "A range has a first component greater that "
00895 "its second component");
00896 }
00897 vector<bool> n(Class::name_xdigit+1);
00898 for(unsigned i=0; i<namedClasses.size(); ++i)
00899 {
00900 const string &s = namedClasses[i];
00901 const int j =
00902 s == "alnum" ? Class::name_alnum :
00903 s == "alpha" ? Class::name_alpha :
00904 s == "blank" ? Class::name_blank :
00905 s == "cntrl" ? Class::name_cntrl :
00906 s == "digit" ? Class::name_digit :
00907 s == "graph" ? Class::name_graph :
00908 s == "lower" ? Class::name_lower :
00909 s == "print" ? Class::name_print :
00910 s == "punct" ? Class::name_punct :
00911 s == "space" ? Class::name_space :
00912 s == "upper" ? Class::name_upper :
00913 s == "xdigit" ? Class::name_xdigit : -1;
00914 if(j==-1)
00915 ARCHON_THROW1(ArgumentException, "Unknown class name \"" + s + "\"");
00916 n[j] = true;
00917 }
00918 if(!invert && ranges.size() == 0 && namedClasses.size() == 0)
00919 ARCHON_THROW1(ArgumentException, "Empty positive bracket");
00920 return Regex(new Class(ranges, n, invert));
00921 }
00922
00926 void Regex::makeNfa(NFA &a, const Exp *e, int context) const
00927 {
00928 if(const Altern *f = dynamic_cast<const Altern *>(e))
00929 {
00930 if(context != 2)
00931 {
00932 makeNfa(a, f->e1.get(), context);
00933 makeNfa(a, f->e2.get(), 1);
00934 return;
00935 }
00936
00937 NFA b;
00938 makeNfa(b, f->e1.get(), 0);
00939 makeNfa(b, f->e2.get(), 1);
00940 a.concat(b);
00941 return;
00942 }
00943
00944 if(const Juxta *f = dynamic_cast<const Juxta *>(e))
00945 {
00946 if(context != 1)
00947 {
00948 makeNfa(a, f->e1.get(), context);
00949 makeNfa(a, f->e2.get(), 2);
00950 return;
00951 }
00952
00953 NFA b;
00954 makeNfa(b, f->e1.get(), 0);
00955 makeNfa(b, f->e2.get(), 2);
00956 a.altern(b);
00957 return;
00958 }
00959
00960 if(const Repeat *f = dynamic_cast<const Repeat *>(e))
00961 {
00962
00963
00964
00965 if(f->min > 0)
00966 {
00967 Ref<Juxta> g =
00968 new Juxta(f->e, new Repeat(f->e, f->min-1, f->max == -1 ? -1 : f->max-1));
00969 makeNfa(a, g.get(), context);
00970 return;
00971 }
00972
00973
00974 if(f->max > 0)
00975 {
00976 Ref<Altern> g =
00977 new Altern(new String(), new Juxta(f->e, new Repeat(f->e, 0, f->max-1)));
00978 makeNfa(a, g.get(), context);
00979 return;
00980 }
00981
00982
00983 if(f->max == 0) return;
00984
00985
00986
00987
00988 if(context == 0)
00989 {
00990 makeNfa(a, f->e.get(), 0);
00991 a.repeat(true);
00992 return;
00993 }
00994
00995 NFA b;
00996 makeNfa(b, f->e.get(), 0);
00997 b.repeat(true);
00998 if(context == 1) a.altern(b);
00999 else a.concat(b);
01000 return;
01001 }
01002
01003 if(const String *f = dynamic_cast<const String *>(e))
01004 {
01005 NFA b(f->s);
01006 switch(context)
01007 {
01008 case 0: a = b; break;
01009 case 1: a.altern(b); break;
01010 case 2: a.concat(b); break;
01011 }
01012 return;
01013 }
01014
01015 if(const Class *f = dynamic_cast<const Class *>(e))
01016 {
01017 for(unsigned i=0; i<f->namedClasses.size(); ++i)
01018 if(f->namedClasses[i])
01019 ARCHON_THROW1(ArgumentException, "Named classes supported yet");
01020
01021 vector<pair<unsigned, unsigned> > ranges;
01022
01023 if(f->invert)
01024 {
01025 ranges.push_back(make_pair(0u, 0xDFFFu));
01026 #if !defined __GNUC__ || __GNUC__ > 2
01027 ranges.push_back(make_pair(0xF900, numeric_limits<unsigned>::max()));
01028 #else
01029 ranges.push_back(make_pair(0xF900, UINT_MAX));
01030 #endif
01031 for(unsigned i=0; i<f->ranges.size(); ++i)
01032 {
01033 const pair<uchar, uchar> &r = f->ranges[i];
01034 unsigned j=0;
01035 while(j<ranges.size() && ranges[j].second < r.first) ++j;
01036 if(j == ranges.size()) continue;
01037 if(ranges[j].first < r.first)
01038 {
01039 if(ranges[j].second > r.second)
01040 {
01041 ranges.insert(ranges.begin()+j+1, make_pair(r.second+1, ranges[j].second));
01042 ranges[j].second = r.first-1;
01043 continue;
01044 }
01045 ranges[j].second = r.first-1;
01046 ++j;
01047 if(j == ranges.size()) continue;
01048 }
01049 while(j<ranges.size() && ranges[j].second <= r.second)
01050 ranges.erase(ranges.begin()+j);
01051 if(j == ranges.size()) continue;
01052 if(ranges[j].first > r.second) continue;
01053 ranges[j].first = r.second+1;
01054 }
01055 }
01056 else
01057 {
01058 for(unsigned i=0; i<f->ranges.size(); ++i)
01059 {
01060 const pair<uchar, uchar> &r = f->ranges[i];
01061 ranges.push_back(make_pair(r.first, r.second));
01062 }
01063 }
01064
01065 NFA b(ranges);
01066
01067 switch(context)
01068 {
01069 case 0: a = b; break;
01070 case 1: a.altern(b); break;
01071 case 2: a.concat(b); break;
01072 }
01073 return;
01074 }
01075
01076 if(dynamic_cast<const LineBegin *>(e))
01077 {
01078 NFA b(0xE000, 0xE000);
01079 switch(context)
01080 {
01081 case 0: a = b; break;
01082 case 1: a.altern(b); break;
01083 case 2: a.concat(b); break;
01084 }
01085 return;
01086 }
01087
01088
01089 if(dynamic_cast<const LineEnd *>(e))
01090 {
01091 NFA b(0xE001, 0xE001);
01092 switch(context)
01093 {
01094 case 0: a = b; break;
01095 case 1: a.altern(b); break;
01096 case 2: a.concat(b); break;
01097 }
01098 return;
01099 }
01100
01101 if(dynamic_cast<const WordBegin *>(e))
01102 {
01103 NFA b(0xE002, 0xE002);
01104 switch(context)
01105 {
01106 case 0: a = b; break;
01107 case 1: a.altern(b); break;
01108 case 2: a.concat(b); break;
01109 }
01110 return;
01111 }
01112
01113
01114 if(dynamic_cast<const WordEnd *>(e))
01115 {
01116 NFA b(0xE003, 0xE003);
01117 switch(context)
01118 {
01119 case 0: a = b; break;
01120 case 1: a.altern(b); break;
01121 case 2: a.concat(b); break;
01122 }
01123 return;
01124 }
01125
01126 ARCHON_THROW1(ArgumentException, "Unsupported elements in Regex");
01127 }
01128
01129 void Regex::makeNfa(NFA &a) const
01130 {
01131 makeNfa(a, exp.get(), 0);
01132 }
01133 }
01134 }
01135