00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033 #ifndef ARCHON_UTILITIES_REGEX_H
00034 #define ARCHON_UTILITIES_REGEX_H
00035
00036 #include <string>
00037 #include <vector>
00038 #include <map>
00039
00040 #include <archon/util/unicode.H>
00041 #include <archon/util/ref.H>
00042 #include <archon/util/logger.H>
00043
00044 namespace Archon
00045 {
00046 namespace Utilities
00047 {
00048 using namespace std;
00049
00050 class NFA;
00051
00070 class Regex
00071 {
00072 friend struct NFA;
00073 struct Lexer;
00074 struct Parser;
00075 struct ParserContext;
00076
00077 struct Exp: virtual RefObjectBase
00078 {
00079 virtual string print(int contextPrecedence) const = 0;
00080 string print() const { return print(0); }
00081 };
00082
00083 struct Altern: Exp
00084 {
00085 Ref<const Exp> e1, e2;
00086
00087 Altern(Ref<const Exp> e1, Ref<const Exp> e2): e1(e1), e2(e2) {}
00088 string print(int) const;
00089 };
00090
00091 struct Juxta: Exp
00092 {
00093 Ref<const Exp> e1, e2;
00094
00095 Juxta(Ref<const Exp> e1, Ref<const Exp> e2): e1(e1), e2(e2) {}
00096 string print(int) const;
00097 };
00098
00099 struct Repeat: Exp
00100 {
00101 Ref<const Exp> e;
00102 int min;
00103 int max;
00104
00105 Repeat(Ref<const Exp> e, int min, int max): e(e), min(min), max(max) {}
00106 string print(int) const;
00107 };
00108
00109 struct String: Exp
00110 {
00111 ustring s;
00112
00113 String(ustring s): s(s) {}
00114
00118 String() {}
00119
00120 string print(int) const;
00121 };
00122
00123 struct Class: Exp
00124 {
00125 enum Name
00126 {
00127 name_alnum = 0,
00128 name_alpha,
00129 name_blank,
00130 name_cntrl,
00131 name_digit,
00132 name_graph,
00133 name_lower,
00134 name_print,
00135 name_punct,
00136 name_space,
00137 name_upper,
00138 name_xdigit
00139 };
00140
00141 vector<pair<uchar, uchar> > ranges;
00142 vector<bool> namedClasses;
00143 bool invert;
00144
00145 Class(const vector<pair<uchar, uchar> > ranges,
00146 const vector<bool> namedClasses, bool invert):
00147 ranges(ranges), namedClasses(namedClasses), invert(invert) {}
00148
00152 Class(): namedClasses(name_xdigit+1), invert(true) {}
00153
00154 string print(int) const;
00155 };
00156
00160 struct LineBegin: Exp
00161 {
00162 string print(int) const;
00163 };
00164
00168 struct LineEnd: Exp
00169 {
00170 string print(int) const;
00171 };
00172
00176 struct WordBegin: Exp
00177 {
00178 string print(int) const;
00179 };
00180
00184 struct WordEnd: Exp
00185 {
00186 string print(int) const;
00187 };
00188
00189 public:
00193 static Regex altern(Regex r1, Regex r2)
00194 {
00195 return Regex(new Altern(r1.exp, r2.exp));
00196 }
00197
00201 static Regex juxta(Regex r1, Regex r2)
00202 {
00203 return Regex(new Juxta(r1.exp, r2.exp));
00204 }
00205
00209 static Regex repeat(Regex r, int min, int max);
00210
00216 static Regex repeat(Regex r, int n, bool orMore);
00217
00218 static Regex star(Regex r) { return Regex(new Repeat(r.exp, 0, -1)); }
00219 static Regex plus(Regex r) { return Regex(new Repeat(r.exp, 1, -1)); }
00220 static Regex option(Regex r) { return Regex(new Repeat(r.exp, 0, 1)); }
00221
00222 static Regex str(ustring s) { return Regex(new String(s)); }
00223
00227 static Regex str(string s) { return Regex(new String(Unicode::decodeUtf8(s))); }
00228
00232 static Regex empty() { return Regex(new String()); }
00233
00234
00235
00236
00237
00238
00239
00240
00241
00242
00243
00244
00245
00246 static Regex bracket(const vector<pair<uchar, uchar> > &ranges,
00247 const vector<string> &namedClasses,
00248 bool invert=false);
00249
00253 static Regex range(uchar from, uchar to, bool invert=false)
00254 {
00255 vector<pair<uchar, uchar> > r;
00256 r.push_back(make_pair(from, to));
00257 vector<string> n;
00258 return bracket(r, n, invert);
00259 }
00260
00265 static Regex namedClass(string name, bool invert=false)
00266 {
00267 vector<pair<uchar, uchar> > r;
00268 vector<string> n;
00269 n.push_back(name);
00270 return bracket(r, n, invert);
00271 }
00272
00276 static Regex anyChar() { return Regex(new Class()); }
00277
00278 static Regex lineBegin() { return Regex(new LineBegin()); }
00279 static Regex lineEnd() { return Regex(new LineEnd()); }
00280 static Regex wordBegin() { return Regex(new WordBegin()); }
00281 static Regex wordEnd() { return Regex(new WordEnd()); }
00282
00283
00284 class Environment
00285 {
00286 friend class ParserContext;
00287 map<string, Ref<const Exp> > m;
00288
00289 public:
00290 void define(string name, const Regex &r)
00291 {
00292 m[name] = r.exp;
00293 }
00294 };
00295
00304 Regex(ustring s, Logger *l=0)
00305 {
00306 parse(s, l, 0);
00307 }
00308
00316 Regex(ustring s, const Environment &e, Logger *l=0)
00317 {
00318 parse(s, l, &e);
00319 }
00320
00327 Regex(string s, Logger *l=0)
00328 {
00329 parse(Unicode::decodeUtf8(s), l, 0);
00330 }
00331
00339 Regex(string s, const Environment &e, Logger *l=0)
00340 {
00341 parse(Unicode::decodeUtf8(s), l, &e);
00342 }
00343
00348 string print() const { return exp->print(); }
00349
00350 private:
00351 Ref<const Exp> exp;
00352
00353 Regex(Ref<const Exp> exp): exp(exp) {}
00354
00355 static const Parser &getParser();
00356
00357 void parse(ustring, Logger *, const Environment *);
00358
00359 void makeNfa(NFA &, const Exp *, int context) const;
00360 void makeNfa(NFA &) const;
00361 };
00362 }
00363 }
00364
00365 #endif // ARCHON_UTILITIES_REGEX_H