00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020 #ifndef ARCHON_UTILITIES_LEXER_H
00021 #define ARCHON_UTILITIES_LEXER_H
00022
00023 #include <memory>
00024 #include <vector>
00025
00026 #include <archon/util/ref.H>
00027 #include <archon/util/unicode.H>
00028 #include <archon/util/stream.H>
00029 #include <archon/util/dfa.H>
00030
00031 #include <archon/util/regex.H>
00032
00033 namespace Archon
00034 {
00035 namespace Utilities
00036 {
00037 using namespace std;
00038
00050 struct LexerBase
00051 {
00052 struct InputException: virtual Utilities::Exception
00053 {
00054 int lineNumber;
00055 ustring text;
00056 InputException(int lineNumber, ustring text):
00057 lineNumber(lineNumber), text(text) {}
00058 virtual ~InputException() throw() {}
00059 string getMessage() const throw();
00060 };
00061
00062 struct Lexeme
00063 {
00064 int type;
00065 RefAny value;
00066 Lexeme(int type = -1, RefAny value = 0): type(type), value(value) {}
00067 };
00068
00069 virtual ~LexerBase() {}
00070
00077 virtual void getNext(Lexeme &l) = 0;
00078
00084 virtual ustring getText() const = 0;
00085
00090 virtual int getType() const = 0;
00091 };
00092
00120 class Lexer: public LexerBase
00121 {
00122 public:
/**
 * Base type of the object that lexer action methods are invoked on.
 *
 * Actions receive a pointer to this type and down-cast it to the
 * concrete context class before calling the registered member function.
 */
struct Context
{
  virtual ~Context()
  {
  }

  /**
   * Error hook that every concrete context has to implement.
   *
   * NOTE(review): the call site is not in this header; presumably the
   * lexer calls it when the input matches no rule - confirm.
   */
  virtual void lexerError() = 0;
};
00137
00143 class ActorBase
00144 {
00145 friend class Lexer;
00146
00147 protected:
00148 struct MethodBase: virtual RefObjectBase
00149 {
00150 virtual ~MethodBase() {}
00151 virtual void call(Context *, Lexeme &) const = 0;
00152 };
00153
00154 vector<Ref<const MethodBase> > methods;
00155
00156 virtual ~ActorBase() {}
00157
00158 void call(int method, Context *c, Lexeme &l) const
00159 {
00160 methods[method]->call(c, l);
00161 }
00162
00163 virtual bool verifyContext(const Context &) const = 0;
00164 };
00165
00177 template<class C>
00178 class Actor: public ActorBase
00179 {
00180 bool verifyContext(const Context &c) const
00181 {
00182 return dynamic_cast<const C *>(&c);
00183 }
00184
00185 class Method0: public MethodBase
00186 {
00187 void (C::*method)();
00188
00189 public:
00190 Method0(void (C::*method)()): method(method) {}
00191
00192 void call(Lexer::Context *c, Lexer::Lexeme &l) const
00193 {
00194 (static_cast<C *>(c)->*method)();
00195 l.type = -1;
00196 }
00197 };
00198
00199 class Method1: public MethodBase
00200 {
00201 void (C::*method)(Lexeme &);
00202
00203 public:
00204 Method1(void (C::*method)(Lexeme &)): method(method) {}
00205
00206 void call(Context *c, Lexeme &l) const
00207 {
00208 (static_cast<C *>(c)->*method)(l);
00209 }
00210 };
00211
00212 template<typename R>
00213 class Method2: public MethodBase
00214 {
00215 R (C::*method)();
00216
00217 public:
00218 Method2(R (C::*method)()): method(method) {}
00219
00220 void call(Context *c, Lexeme &l) const
00221 {
00222 l.value = new RefObject<R>((static_cast<C *>(c)->*method)());
00223 }
00224 };
00225
00226 public:
00227 int registerMethod(void (C::*method)())
00228 {
00229 methods.push_back(new Method0(method));
00230 return methods.size()-1;
00231 }
00232
00233 int registerMethod(void (C::*method)(Lexeme &))
00234 {
00235 methods.push_back(new Method1(method));
00236 return methods.size()-1;
00237 }
00238
00239 template<typename R>
00240 int registerMethod(R (C::*method)())
00241 {
00242 methods.push_back(new Method2<R>(method));
00243 return methods.size()-1;
00244 }
00245 };
00246
00247 class Engine;
00248
00258 class RuleSet
00259 {
00260 friend class Engine;
00261
00262 struct Rule
00263 {
00264 Regex regex;
00265 int method;
00266 Lexeme lexeme;
00267 Rule(const Regex &r, int method, Lexeme lexeme):
00268 regex(r), method(method), lexeme(lexeme) {}
00269 };
00270
00271 vector<Rule> rules;
00272
00273 public:
00285 template<typename L>
00286 void add(const Regex &r, int method, int terminal, L lexemeValue)
00287 {
00288 rules.push_back(Rule(r, method,
00289 Lexeme(terminal,
00290 new RefObject<L>(lexemeValue))));
00291 }
00292
00293 template<typename L>
00294 void add(const Regex &r, int method, int terminal, Ref<L> lexemeValue)
00295 {
00296 rules.push_back(Rule(r, method, Lexeme(terminal, lexemeValue)));
00297 }
00298
00299 void add(const Regex &r, int method, int terminal, RefAny lexemeValue)
00300 {
00301 rules.push_back(Rule(r, method, Lexeme(terminal, lexemeValue)));
00302 }
00303
00304 void add(const Regex &r, int method = -1, int terminal = -1)
00305 {
00306 rules.push_back(Rule(r, method, Lexeme(terminal, 0)));
00307 }
00308 };
00309
00315 class Engine
00316 {
00317 friend class Lexer;
00318
00319 struct Rule
00320 {
00321 int method;
00322 Lexeme lexeme;
00323
00324 Rule(): method(-1) {}
00325 Rule(int method, Lexeme lexeme): method(method), lexeme(lexeme) {}
00326 };
00327
00328 const ActorBase *actor;
00329 auto_ptr<DFA> dfa;
00330 vector<Rule> rules;
00331
00332
00333 vector<unsigned> anchorMasks;
00334
00335
00336 Engine(const Engine &);
00337 Engine &operator=(const Engine &);
00338
00339 public:
00349 Engine(const RuleSet &, const ActorBase * =0);
00350 };
00351
00352 private:
00353 const Engine &engine;
00354 Ref<Stream::UReader> &reader;
00355 Context *context;
00356 ustring buffer;
00357 int bufferSize;
00358 int charCount;
00359 int type;
00360 int anchorState;
00361 int lineNumber;
00362
00363
00364 public:
00377 Lexer(const Engine &e, Ref<Stream::UReader> &r, Context *c=0):
00378 engine(e), reader(r), context(c), bufferSize(0), charCount(0),
00379 type(-1), anchorState(0), lineNumber(1) {}
00380
00387 void getNext(Lexeme &);
00388
00394 ustring getText() const
00395 {
00396 return buffer.substr(0, charCount);
00397 }
00398
00403 int getType() const
00404 {
00405 return type;
00406 }
00407
00412 int getLineNumber() const { return lineNumber; }
00413 };
00414 }
00415 }
00416
00417 #endif // ARCHON_UTILITIES_LEXER_H