regex.H

00001 /*
00002  * This file is part of the "Archon" framework.
00003  * (http://files3d.sourceforge.net)
00004  *
00005  * Copyright © 2002 by Kristian Spangsege and Brian Kristiansen.
00006  *
00007  * Permission to use, copy, modify, and distribute this software and
00008  * its documentation under the terms of the GNU General Public License is
00009  * hereby granted. No representations are made about the suitability of
00010  * this software for any purpose. It is provided "as is" without express
00011  * or implied warranty. See the GNU General Public License
00012  * (http://www.gnu.org/copyleft/gpl.html) for more details.
00013  *
00014  * The characters in this file are ISO8859-1 encoded.
00015  *
00016  * The documentation in this file is in "Doxygen" style
00017  * (http://www.doxygen.org).
00018  */
00019 
00020 /*
00021 
00022 Features:
00023 
00024 - Syntactically identical to POSIX 1003.2 regular expressions, except
00025 for the added ability to refer to a previously named expression in
00026 'Lex' style - e.g. '({foo}|{bar})*', where 'foo' and 'bar' are
00027 previously defined expressions.
00028 
00029 - Is Unicode-enabled.
00030 
00031 */
00032 
00033 #ifndef ARCHON_UTILITIES_REGEX_H
00034 #define ARCHON_UTILITIES_REGEX_H
00035 
00036 #include <string>
00037 #include <vector>
00038 #include <map>
00039 
00040 #include <archon/util/unicode.H>
00041 #include <archon/util/ref.H>
00042 #include <archon/util/logger.H>
00043 
00044 namespace Archon
00045 {
00046   namespace Utilities
00047   {
00048     using namespace std;
00049 
00050     class NFA; // See "nfa.H"
00051 
00070     class Regex
00071     {
00072       friend struct NFA;
00073       struct Lexer;
00074       struct Parser;
00075       struct ParserContext;
00076 
00077       struct Exp: virtual RefObjectBase
00078       {
00079         virtual string print(int contextPrecedence) const = 0;
00080         string print() const { return print(0); }
00081       };
00082 
00083       struct Altern: Exp
00084       {
00085         Ref<const Exp> e1, e2;
00086 
00087         Altern(Ref<const Exp> e1, Ref<const Exp> e2): e1(e1), e2(e2) {}
00088         string print(int) const;
00089       };
00090 
00091       struct Juxta: Exp
00092       {
00093         Ref<const Exp> e1, e2;
00094 
00095         Juxta(Ref<const Exp> e1, Ref<const Exp> e2): e1(e1), e2(e2) {}
00096         string print(int) const;
00097       };
00098 
00099       struct Repeat: Exp
00100       {
00101         Ref<const Exp> e;
00102         int min;
00103         int max; // -1 if no max
00104 
00105         Repeat(Ref<const Exp> e, int min, int max): e(e), min(min), max(max) {}
00106         string print(int) const;
00107       };
00108 
00109       struct String: Exp
00110       {
00111         ustring s;
00112 
00113         String(ustring s): s(s) {}
00114 
00118         String() {}
00119 
00120         string print(int) const;
00121       };
00122 
00123       struct Class: Exp
00124       {
00125         enum Name
00126         {
00127           name_alnum = 0,
00128           name_alpha,
00129           name_blank,
00130           name_cntrl,
00131           name_digit,
00132           name_graph,
00133           name_lower,
00134           name_print,
00135           name_punct,
00136           name_space,
00137           name_upper,
00138           name_xdigit
00139         };
00140 
00141         vector<pair<uchar, uchar> > ranges;
00142         vector<bool> namedClasses;
00143         bool invert;
00144 
00145         Class(const vector<pair<uchar, uchar> > ranges,
00146               const vector<bool> namedClasses, bool invert):
00147           ranges(ranges), namedClasses(namedClasses), invert(invert) {}
00148 
00152         Class(): namedClasses(name_xdigit+1), invert(true) {}
00153 
00154         string print(int) const;
00155       };
00156 
00160       struct LineBegin: Exp
00161       {
00162         string print(int) const;
00163       };
00164 
00168       struct LineEnd: Exp
00169       {
00170         string print(int) const;
00171       };
00172 
00176       struct WordBegin: Exp
00177       {
00178         string print(int) const;
00179       };
00180 
00184       struct WordEnd: Exp
00185       {
00186         string print(int) const;
00187       };
00188 
00189     public:
00193       static Regex altern(Regex r1, Regex r2)
00194       {
00195         return Regex(new Altern(r1.exp, r2.exp));
00196       }
00197 
00201       static Regex juxta(Regex r1, Regex r2)
00202       {
00203         return Regex(new Juxta(r1.exp, r2.exp));
00204       }
00205 
00209       static Regex repeat(Regex r, int min, int max);
00210 
00216       static Regex repeat(Regex r, int n, bool orMore);
00217 
00218       static Regex star(Regex r) { return Regex(new Repeat(r.exp, 0, -1)); }
00219       static Regex plus(Regex r) { return Regex(new Repeat(r.exp, 1, -1)); }
00220       static Regex option(Regex r) { return Regex(new Repeat(r.exp, 0, 1)); }
00221 
00222       static Regex str(ustring s) { return Regex(new String(s)); }
00223 
00227       static Regex str(string s) { return Regex(new String(Unicode::decodeUtf8(s))); }
00228 
00232       static Regex empty() { return Regex(new String()); }
00233 
00234       /*
00235        * Recognized class names are:
00236        *   alnum, alpha, blank, cntrl,
00237        *   digit, graph, lower, print,
00238        *   punct, space, upper, xdigit
00239        *
00240        * Each range must have a first component less than or equal to
00241        * its second component.
00242        *
00243        * It is illegal to request a non-inverted bracket with no ranges
00244        * and no named classes.
00245        */
00246       static Regex bracket(const vector<pair<uchar, uchar> > &ranges,
00247                            const vector<string> &namedClasses,
00248                            bool invert=false);
00249 
00253       static Regex range(uchar from, uchar to, bool invert=false)
00254       {
00255         vector<pair<uchar, uchar> > r;
00256         r.push_back(make_pair(from, to));
00257         vector<string> n;
00258         return bracket(r, n, invert);
00259       }
00260 
00265       static Regex namedClass(string name, bool invert=false)
00266       {
00267         vector<pair<uchar, uchar> > r;
00268         vector<string> n;
00269         n.push_back(name);
00270         return bracket(r, n, invert);
00271       }
00272 
00276       static Regex anyChar() { return Regex(new Class()); }
00277 
00278       static Regex lineBegin() { return Regex(new LineBegin()); }
00279       static Regex lineEnd()   { return Regex(new LineEnd()); }
00280       static Regex wordBegin() { return Regex(new WordBegin()); }
00281       static Regex wordEnd()   { return Regex(new WordEnd()); }
00282 
00283 
00284       class Environment
00285       {
00286         friend class ParserContext;
00287         map<string, Ref<const Exp> > m;
00288 
00289       public:
00290         void define(string name, const Regex &r)
00291         {
00292           m[name] = r.exp;
00293         }
00294       };
00295 
00304       Regex(ustring s, Logger *l=0)
00305       {
00306         parse(s, l, 0);
00307       }
00308 
00316       Regex(ustring s, const Environment &e, Logger *l=0)
00317       {
00318         parse(s, l, &e);
00319       }
00320 
00327       Regex(string s, Logger *l=0)
00328       {
00329         parse(Unicode::decodeUtf8(s), l, 0);
00330       }
00331 
00339       Regex(string s, const Environment &e, Logger *l=0)
00340       {
00341         parse(Unicode::decodeUtf8(s), l, &e);
00342       }
00343 
00348       string print() const { return exp->print(); }
00349 
00350     private:
00351       Ref<const Exp> exp;
00352 
00353       Regex(Ref<const Exp> exp): exp(exp) {}
00354 
00355       static const Parser &getParser();
00356 
00357       void parse(ustring, Logger *, const Environment *);
00358 
00359       void makeNfa(NFA &, const Exp *, int context) const;
00360       void makeNfa(NFA &) const;
00361     };
00362   }
00363 }
00364 
00365 #endif // ARCHON_UTILITIES_REGEX_H

Generated on Sun Jul 30 22:55:45 2006 for Archon by  doxygen 1.4.4