00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020 #include <string>
00021
00022 #include <archon/util/file.H>
00023 #include <archon/util/text.H>
00024
00025 #include <archon/util/uri.H>
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043
00044
00045
00046
00047
00048
00049
00050 namespace Archon
00051 {
00052 namespace Utilities
00053 {
00054 using namespace std;
00055
00056
/**
 * The URI schemes that the validation routines in this file
 * distinguish between.
 */
enum Scheme
{
  scheme_file, ///< Reported as "FILE" by schemeName().
  scheme_http, ///< Reported as "HTTP" by schemeName().
  scheme_ftp   ///< Reported as "FTP" by schemeName().
};


/**
 * Map a scheme to the upper case name used when composing error
 * messages.
 *
 * \param scheme The scheme to name.
 *
 * \return "FILE", "HTTP" or "FTP" for the known enumerators, and
 * "<unknown>" for any other value.
 */
std::string schemeName(Scheme scheme)
{
  const char *name;
  switch(scheme)
  {
  case scheme_file: name = "FILE"; break;
  case scheme_http: name = "HTTP"; break;
  case scheme_ftp:  name = "FTP";  break;
  default:          name = "<unknown>"; break;
  }
  return name;
}
00075
00076
00077
00078
00079
00080
00081
00082
00083
00084
00085
00086
00087
00088
00089
00090
00091
/*
 * Primitive character classes. Each class occupies one bit so that
 * composite classes can be expressed as a bitwise OR of primitives.
 */
const unsigned lowalphaMask     = 1u <<  0; // 'a'..'z'
const unsigned upalphaMask      = 1u <<  1; // 'A'..'Z'
const unsigned digitMask        = 1u <<  2; // '0'..'9'
const unsigned markMask         = 1u <<  3; // - _ . ! ~ * ' ( )
const unsigned commonMask       = 1u <<  4; // $ & + ,
const unsigned slashMask        = 1u <<  5; // /
const unsigned colonMask        = 1u <<  6; // :
const unsigned semicolonMask    = 1u <<  7; // ;
const unsigned equalMask        = 1u <<  8; // =
const unsigned questionmarkMask = 1u <<  9; // ?
const unsigned commercialAtMask = 1u << 10; // @
const unsigned otherMask        = 1u << 11; // every other printable character

/*
 * Composite character classes built from the primitives above.
 */
const unsigned alphaMask      = lowalphaMask | upalphaMask;
const unsigned alphanumMask   = alphaMask | digitMask;
const unsigned unreservedMask = alphanumMask | markMask;
const unsigned reservedMask   = commonMask | slashMask | colonMask |
                                semicolonMask | equalMask |
                                questionmarkMask | commercialAtMask;

const unsigned userMask     = unreservedMask | commonMask | semicolonMask |
                              equalMask;
const unsigned userInfoMask = userMask | colonMask;

const unsigned pcharNoEqualMask = unreservedMask | commonMask | colonMask |
                                  commercialAtMask;
const unsigned pcharMask        = pcharNoEqualMask | equalMask;
const unsigned uricNoSlashMask  = pcharMask | semicolonMask | questionmarkMask;
const unsigned uricMask         = uricNoSlashMask | slashMask;

const unsigned anyMask = uricMask | otherMask;

/*
 * Primitive class of each printable ASCII character. The entry for
 * character c (0x20 <= c <= 0x7e) is found at index c - 0x20.
 */
unsigned charClassTable[127-32] =
{
  // space, '!', '"', '#'
  otherMask, markMask, otherMask, otherMask,
  // '$', '%', '&', apostrophe
  commonMask, otherMask, commonMask, markMask,
  // '(', ')', '*', '+'
  markMask, markMask, markMask, commonMask,
  // ',', '-', '.', '/'
  commonMask, markMask, markMask, slashMask,
  // '0'..'9'
  digitMask, digitMask, digitMask, digitMask, digitMask,
  digitMask, digitMask, digitMask, digitMask, digitMask,
  // ':', ';'
  colonMask, semicolonMask,
  // '<', '=', '>', '?'
  otherMask, equalMask, otherMask, questionmarkMask,
  // '@'
  commercialAtMask,
  // 'A'..'Z'
  upalphaMask, upalphaMask, upalphaMask, upalphaMask, upalphaMask,
  upalphaMask, upalphaMask, upalphaMask, upalphaMask, upalphaMask,
  upalphaMask, upalphaMask, upalphaMask, upalphaMask, upalphaMask,
  upalphaMask, upalphaMask, upalphaMask, upalphaMask, upalphaMask,
  upalphaMask, upalphaMask, upalphaMask, upalphaMask, upalphaMask,
  upalphaMask,
  // '[', backslash, ']', '^', '_'
  otherMask, otherMask, otherMask, otherMask, markMask,
  // backquote
  otherMask,
  // 'a'..'z'
  lowalphaMask, lowalphaMask, lowalphaMask, lowalphaMask, lowalphaMask,
  lowalphaMask, lowalphaMask, lowalphaMask, lowalphaMask, lowalphaMask,
  lowalphaMask, lowalphaMask, lowalphaMask, lowalphaMask, lowalphaMask,
  lowalphaMask, lowalphaMask, lowalphaMask, lowalphaMask, lowalphaMask,
  lowalphaMask, lowalphaMask, lowalphaMask, lowalphaMask, lowalphaMask,
  lowalphaMask,
  // '{', '|', '}', '~'
  otherMask, otherMask, otherMask, markMask
};
00155
00156
00157 static void validatePart(const string &v, const string &uri, int offset,
00158 unsigned charClassMask, bool allowEscape,
00159 const string &partName)
00160 {
00161 for(string::size_type i=0; i<v.size(); ++i)
00162 {
00163 unsigned char c = static_cast<unsigned char>(v[i]);
00164 if(allowEscape && c == '%')
00165 {
00166 if(v.size() <= i+2)
00167 ARCHON_THROW4(Uri::SyntaxException,
00168 "Un-terminated escape sequence "
00169 "within " + partName, uri, offset+i,
00170 offset + v.size());
00171 if(!isxdigit(static_cast<unsigned char>(v[i+1])) ||
00172 !isxdigit(static_cast<unsigned char>(v[i+2])))
00173 ARCHON_THROW4(Uri::SyntaxException,
00174 "Invalid escape sequence "
00175 "within " + partName, uri,
00176 offset+i, offset+i+3);
00177 i += 2;
00178 }
00179 else if(c < '\x20' || c > '\x7e' ||
00180 !(charClassTable[c-'\x20']&charClassMask))
00181 ARCHON_THROW4(Uri::SyntaxException,
00182 "Illegal character within " +
00183 partName, uri, offset+i, offset+i+1);
00184 }
00185 }
00186
00187
00188 static void validateUser(const string &v, const string &uri,
00189 int offset, Scheme scheme)
00190 {
00191 validatePart(v, uri, offset, userMask, true, "username");
00192 }
00193
00194
00195 static void validatePassword(const string &v, const string &uri,
00196 int offset, Scheme scheme)
00197 {
00198
00199
00200
00201
00202 validatePart(v, uri, offset, userMask, true, "password");
00203 }
00204
00205
00206 static void validateUserinfo(const string &v, const string &uri,
00207 int offset, Scheme scheme)
00208 {
00209
00210
00211
00212
00213
00214
00215
00216
00217
00218
00219 string::size_type i = v.find(':');
00220 if(i != string::npos)
00221 {
00222 validateUser(v.substr(0, i), uri, offset, scheme);
00223 validatePassword(v.substr(i+1), uri, offset+i+1, scheme);
00224 }
00225 else validateUser(v, uri, offset, scheme);
00226 }
00227
00228
00229 static void validateHost(const string &v, const string &uri,
00230 int offset, Scheme scheme)
00231 {
00232 string::size_type i = v.rfind('.');
00233 if(i != string::npos && i+1 < v.size() && isdigit(static_cast<unsigned char>(v[i+1])))
00234 {
00235
00236 const string::size_type i1 =
00237 v.find('.');
00238 const string::size_type i2 =
00239 i1 == string::npos ? string::npos : v.find('.', i1+1);
00240 const string::size_type i3 =
00241 i2 == string::npos ? string::npos : v.find('.', i2+1);
00242 const string::size_type i4 =
00243 i3 == string::npos ? string::npos : v.find('.', i3+1);
00244 if(i4 != string::npos || i3 == string::npos)
00245 ARCHON_THROW4(Uri::SyntaxException,
00246 "Wrong number of fields in IPv4 address",
00247 uri, offset, offset+v.size());
00248 if(i1 == 0 || i1+1 == i2 || i2+1 == i3 || i3+1 == v.size())
00249 ARCHON_THROW4(Uri::SyntaxException,
00250 "Empty fields in IPv4 address",
00251 uri, offset, offset+v.size());
00252 for(i=0; i<v.size(); ++i)
00253 if(!isdigit(v[i]) && v[i] != '.')
00254 ARCHON_THROW4(Uri::SyntaxException,
00255 "Illegal character within IPv4 address",
00256 uri, offset+i, offset+i+1);
00257 }
00258 else
00259 {
00260
00261
00262 i = 0;
00263 string::size_type j;
00264
00265
00266
00267
00268
00269 for(;;)
00270 {
00271 j = v.find('.', i);
00272 if(j == string::npos) j = v.size();
00273
00274 if(j == i && j < v.size())
00275 ARCHON_THROW4(Uri::SyntaxException,
00276 "Empty label within domain name",
00277 uri, offset+i, offset+i+1);
00278 for(string::size_type k = i; k<j; ++k)
00279 if(!isalnum(v[k]) && (v[k] != '-' || k == i || k == j-1))
00280 ARCHON_THROW4(Uri::SyntaxException,
00281 "Illegal character within domain name",
00282 uri, offset+k, offset+k+1);
00283 if(j == v.size()) break;
00284
00285 i = j+1;
00286 }
00287 }
00288 }
00289
00290
00291 static void validatePort(const string &v, const string &uri,
00292 int offset, Scheme scheme)
00293 {
00294 validatePart(v, uri, offset, digitMask, false, "port number");
00295 }
00296
00297
00298 static void validateHostPort(const string &v, const string &uri,
00299 int offset, Scheme scheme)
00300 {
00301 string::size_type i = v.rfind(':');
00302 if(i != string::npos)
00303 {
00304 validateHost(v.substr(0, i), uri, offset, scheme);
00305
00306 if(scheme == scheme_file)
00307 ARCHON_THROW4(Uri::SyntaxException,
00308 "Port number not allowed in " +
00309 schemeName(scheme) + " URI",
00310 uri, offset+i, offset+v.size());
00311
00312 validatePort(v.substr(i+1), uri, offset+i+1, scheme);
00313 }
00314 else validateHost(v, uri, offset, scheme);
00315 }
00316
00317
00318 static void validateAuthority(string v, const string &uri,
00319 int offset, Scheme scheme)
00320 {
00321 if(v.empty()) return;
00322 v.erase(0, 2);
00323 offset += 2;
00324 string::size_type i = v.rfind('@');
00325 if(i != string::npos)
00326 {
00327 if(scheme != scheme_ftp)
00328 ARCHON_THROW4(Uri::SyntaxException,
00329 "User info not allowed in " +
00330 schemeName(scheme) + " URI",
00331 uri, offset, offset+i+1);
00332
00333 validateUserinfo(v.substr(0, i), uri, offset, scheme);
00334 validateHostPort(v.substr(i+1), uri, offset+i+1, scheme);
00335 }
00336 else validateHostPort(v, uri, offset, scheme);
00337 }
00338
00339
00340 static void validatePathSegmentParameter(const string &v, const string &uri,
00341 int offset, Scheme scheme)
00342 {
00343 if(scheme == scheme_ftp)
00344 {
00345 if(v.size() == 0)
00346 ARCHON_THROW4(Uri::SyntaxException,
00347 "Path segment parameter expexted",
00348 uri, offset, offset+1);
00349
00350 if(v.substr(0, 5) == "type=")
00351 {
00352 const string t = v.substr(5);
00353 if(t == "a" || t == "A" ||
00354 t == "i" || t == "I" ||
00355 t == "d" || t == "D") return;
00356 }
00357 ARCHON_THROW4(Uri::SyntaxException,
00358 "A path segment parameter must "
00359 "have the form 'type=[aidAID]' in the " +
00360 schemeName(scheme) +
00361 " URI scheme", uri, offset,
00362 offset+v.size());
00363 }
00364 else
00365 {
00366 validatePart(v, uri, offset, pcharMask, true, "path segment parameter ");
00367 }
00368 }
00369
00370
00371 static void validatePathSegment(const string &v, const string &uri,
00372 int offset, Scheme scheme,
00373 bool lastSegment)
00374 {
00375 string::size_type i = v.find(';');
00376 if(i != string::npos)
00377 {
00378 if(scheme == scheme_http ||
00379 scheme == scheme_file)
00380 ARCHON_THROW4(Uri::SyntaxException,
00381 "Path segment parameters not "
00382 "allowed in " + schemeName(scheme) +
00383 " URI scheme", uri, offset+i,
00384 offset+v.size());
00385
00386 if(scheme == scheme_ftp && !lastSegment)
00387 ARCHON_THROW4(Uri::SyntaxException,
00388 "Path segment parameters may "
00389 "only be applied to the final path segemnt "
00390 "in the " + schemeName(scheme) +
00391 " URI scheme", uri, offset+i,
00392 offset+v.size());
00393
00394
00395
00396 string::size_type j=i+1, k;
00397
00398 for(;;)
00399 {
00400 k = v.find(';', j);
00401 if(k == string::npos) k = v.size();
00402
00403 validatePathSegmentParameter(v.substr(j, k-j), uri, offset+j,
00404 scheme);
00405
00406 if(k == v.size()) break;
00407
00408 if(scheme == scheme_ftp)
00409 ARCHON_THROW4(Uri::SyntaxException,
00410 "Only one path segemnt parameter "
00411 "allowed in the " + schemeName(scheme) +
00412 " URI scheme", uri, offset+k,
00413 offset+v.size());
00414
00415 j = k+1;
00416 }
00417 }
00418 else i = v.size();
00419
00420
00421
00422
00423
00424
00425
00426
00427
00428
00429
00430
00431
00432
00433
00434
00435 validatePart(v.substr(0, i), uri, offset, uricMask, true,
00436 "path segment");
00437 }
00438
00439
00440 static void validatePath(const string &v, const string &uri,
00441 int offset, Scheme scheme)
00442 {
00443
00444
00445 string::size_type i=0, j;
00446
00447 for(;;)
00448 {
00449 j = v.find('/', i);
00450 if(j == string::npos) j = v.size();
00451
00452 validatePathSegment(v.substr(i, j-i), uri, offset+i, scheme,
00453 j == v.size());
00454
00455 if(j == v.size()) break;
00456
00457 i = j+1;
00458 }
00459 }
00460
00461
00462 static void validateQuery(string v, const string &uri,
00463 int offset, Scheme scheme)
00464 {
00465 if(v.empty()) return;
00466 v.erase(0, 1);
00467 ++offset;
00468 validatePart(v, uri, offset, uricMask, true, "query");
00469 }
00470
00471
00472 static void validateFragmentIdentifier(string v, const string &uri,
00473 int offset)
00474 {
00475 if(v.empty()) return;
00476 v.erase(0, 1);
00477 ++offset;
00478 validatePart(v, uri, offset, uricMask, true, "fragment identifier");
00479 }
00480
00481
/**
 * Build a line of text that underlines the character range
 * [from, to) of the line above it: \a from spaces followed by
 * \a to - \a from carets and a newline.
 *
 * A negative \a from is treated as zero, and a range whose end does
 * not lie beyond its start produces no carets. Without this
 * clamping the negative count would be converted to a huge unsigned
 * length.
 *
 * \param from Index of the first character to mark.
 * \param to One past the index of the last character to mark.
 *
 * \return The pointer line, always terminated by a newline.
 */
static std::string makePointerLine(int from, int to)
{
  const int indent = from < 0 ? 0 : from;
  const int width  = to > indent ? to - indent : 0;
  return std::string(static_cast<std::string::size_type>(indent), ' ') +
         std::string(static_cast<std::string::size_type>(width), '^') + "\n";
}
00486
00487
00488
00534 void Uri::canonicalizePath()
00535 {
00536 string::size_type i = 0;
00537 for(;;)
00538 {
00539 i = path.find("/.", i);
00540 if(i == string::npos) break;
00541 if(i == path.size()-2 || path[i+2] == '/') path.erase(i+1, 2);
00542 else if(path[i+2] == '.' && (i == path.size()-3 || path[i+3] == '/') &&
00543 i>0)
00544 {
00545 string::size_type j = path.rfind('/', i-1)+1;
00546
00547 path.erase(j, i+4-j);
00548 i = j-1;
00549 }
00550 else i += 2;
00551 }
00552 }
00553
00554
00562 void Uri::resolveRelative(const Uri &baseUri)
00563 {
00564 scheme = baseUri.scheme;
00565 if(!authority.empty()) return;
00566 authority = baseUri.authority;
00567 if(!path.empty() && path[0] == '/') return;
00568 if(!path.empty())
00569 {
00570
00571 path = (baseUri.path.empty() ? "/" :
00572 baseUri.path.substr(0, baseUri.path.rfind('/')+1)) + path;
00573 canonicalizePath();
00574 return;
00575 }
00576 query = baseUri.query;
00577 }
00578
00579
00580 Uri::Uri()
00581 {
00582 scheme = "file:";
00583
00584 string cwd = File::getCWD();
00585 if(cwd[cwd.size()-1] != '/') cwd += '/';
00586
00587
00588
00589 string::size_type i=0, j;
00590
00591 for(;;)
00592 {
00593 j = cwd.find('/', i);
00594 if(j == string::npos) j = cwd.size();
00595
00596 path.append(encode(cwd.substr(i, j-i), false));
00597
00598 if(j == cwd.size()) break;
00599
00600 path += '/';
00601
00602 i = j+1;
00603 }
00604 }
00605
00606
00607 Uri::Uri(const string &uri, const Uri &baseUri)
00608 {
00609
00610
00611
00612 string::size_type i = uri.find_first_of(":/?#");
00613 if(i != string::npos && uri[i] == ':') scheme = uri.substr(0, ++i);
00614 else i = 0;
00615
00616
00617 const string::size_type iAuthority = i;
00618 if(uri.size() >= i+2 && uri.substr(i, 2) == "//")
00619 {
00620 string::size_type j = uri.find_first_of("/?#", i+2);
00621 if(j == string::npos) j = uri.size();
00622 authority = uri.substr(i, j-i);
00623 i = j;
00624 }
00625
00626
00627 const string::size_type iPath = i;
00628 {
00629 string::size_type j = uri.find_first_of("?#", i);
00630 if(j == string::npos) j = uri.size();
00631 path = uri.substr(i, j-i);
00632 i = j;
00633 }
00634
00635
00636 const string::size_type iQuery = i;
00637 if(uri.size() >= i+1 && uri[i] == '?')
00638 {
00639 string::size_type j = uri.find('#', i+1);
00640 if(j == string::npos) j = uri.size();
00641 query = uri.substr(i, j-i);
00642 i = j;
00643 }
00644
00645 if(i < uri.size())
00646 ARCHON_THROW4(SyntaxException,
00647 "Illegal character in URI (this looks more "
00648 "like a URI Reference)", uri, i, i+1);
00649
00650 string effectiveScheme = scheme.empty() ? baseUri.scheme : scheme;
00651 effectiveScheme.erase(effectiveScheme.size()-1);
00652 if(Text::compareIgnoreCase(effectiveScheme, "file") == 0)
00653 {
00654 validateAuthority(authority, uri, iAuthority, scheme_file);
00655 validatePath(path, uri, iPath, scheme_file);
00656
00657 if(!query.empty())
00658 ARCHON_THROW4(SyntaxException,
00659 "Queries are not supported by the " +
00660 schemeName(scheme_file) + " URI scheme",
00661 uri, iQuery, uri.size());
00662
00663 if(scheme.empty()) resolveRelative(baseUri);
00664 }
00665 else if(Text::compareIgnoreCase(effectiveScheme, "http") == 0 ||
00666 Text::compareIgnoreCase(effectiveScheme, "https") == 0)
00667 {
00668 validateAuthority(authority, uri, iAuthority, scheme_http);
00669 validatePath(path, uri, iPath, scheme_http);
00670 validateQuery(query, uri, iQuery, scheme_http);
00671
00672 if(scheme.empty()) resolveRelative(baseUri);
00673 else if(authority.size() < 3)
00674 ARCHON_THROW4(SyntaxException,
00675 "Authority (hostname) required "
00676 "for absolute " + schemeName(scheme_http) +
00677 " URI", uri,
00678 iAuthority, iAuthority+1);
00679 }
00680 else if(Text::compareIgnoreCase(effectiveScheme, "ftp") == 0)
00681 {
00682 validateAuthority(authority, uri, iAuthority, scheme_ftp);
00683 validatePath(path, uri, iPath, scheme_ftp);
00684
00685 if(!query.empty())
00686 ARCHON_THROW4(SyntaxException,
00687 "Queries are not supported by the " +
00688 schemeName(scheme_ftp) + " URI scheme",
00689 uri, iQuery, uri.size());
00690
00691 if(scheme.empty()) resolveRelative(baseUri);
00692 else if(authority.size() < 3)
00693 ARCHON_THROW4(SyntaxException,
00694 "Authority (hostname) required "
00695 "for absolute " + schemeName(scheme_ftp) +
00696 " URI", uri,
00697 iAuthority, iAuthority+1);
00698 }
00699 else
00700 ARCHON_THROW4(SyntaxException,
00701 "Invalid or unsupported URI scheme",
00702 uri, 0, iAuthority);
00703 }
00704
00705 bool Uri::isFileScheme() const
00706 {
00707 return Text::compareIgnoreCase(scheme, "file:") == 0;
00708 }
00709
00710 string Uri::getFile() const
00711 {
00712 const string::size_type i = path.rfind('/');
00713 return path.substr(i == string::npos ? 0 : i+1);
00714 }
00715
00716
00717 static const string hexDigits = "0123456789abcdef";
00718
00719 string Uri::encode(const string &v, bool plusForSpace)
00720 {
00721 string result;
00722 result.reserve(string::size_type(v.size()*1.5));
00723 for(unsigned i=0; i<v.size(); ++i)
00724 {
00725 unsigned char c = static_cast<unsigned char>(v[i]);
00726 if(c == ' ' && plusForSpace) result += '+';
00727 else if(c < '\x20' || c > '\x7e' ||
00728 !(charClassTable[c-'\x20']&unreservedMask))
00729 {
00730 result += "%";
00731 result += hexDigits[c >> 4];
00732 result += hexDigits[c & 15];
00733 }
00734 else result += c;
00735 }
00736
00737 return result;
00738 }
00739
00740
00741 string Uri::decode(const string &v, bool plusForSpace)
00742 {
00743 string result;
00744 result.reserve(v.size());
00745 for(unsigned i=0; i<v.size(); ++i)
00746 {
00747 unsigned char c = static_cast<unsigned char>(v[i]);
00748 if(c == '+' && plusForSpace) c = ' ';
00749 else if(c == '%')
00750 {
00751 if(v.size() <= i+2)
00752 ARCHON_THROW1(ArgumentException,
00753 "Un-terminated escape sequence '" + v + "'");
00754
00755 int p;
00756 c = static_cast<unsigned char>(v[++i]);
00757 if(c >= '0' && c <= '9') p = c - '0';
00758 else if(c >= 'A' && c <= 'F') p = c - 'A' + 10;
00759 else if(c >= 'a' && c <= 'f') p = c - 'a' + 10;
00760 else ARCHON_THROW1(ArgumentException,
00761 "Invalid escape sequence '" + v + "'");
00762 p *= 16;
00763 c = static_cast<unsigned char>(v[++i]);
00764 if(c >= '0' && c <= '9') p += c - '0';
00765 else if(c >= 'A' && c <= 'F') p += c - 'A' + 10;
00766 else if(c >= 'a' && c <= 'f') p += c - 'a' + 10;
00767 else ARCHON_THROW1(ArgumentException,
00768 "Invalid escape sequence '" + v + "'");
00769
00770 c = static_cast<unsigned char>(p);
00771 }
00772
00773 result += static_cast<char>(c);
00774 }
00775
00776 return result;
00777 }
00778
00779
00780 string Uri::explain(const SyntaxException &e)
00781 {
00782 return
00783 "Malformed URI: " + e.getMessage() + "\n" +
00784 e.val + "\n" +
00785 makePointerLine(e.indexFrom, e.indexTo);
00786 }
00787
00788
00789 string Uri::toString() const
00790 {
00791 return scheme + authority + path + query;
00792 }
00793
00794 ostream &operator<<(ostream &out, const Uri &u)
00795 {
00796 out << u.toString();
00797 return out;
00798 }
00799
00800
00801 UriReference::UriReference(const string &uriReference,
00802 const UriReference &baseReference)
00803 {
00804 if(uriReference.empty())
00805 {
00806 uri = baseReference.uri;
00807 fragmentIdentifier = baseReference.fragmentIdentifier;
00808 return;
00809 }
00810
00811 string::size_type i = uriReference.find('#');
00812 if(i == string::npos) uri = Uri(uriReference, baseReference.uri);
00813 else
00814 {
00815 uri = Uri(uriReference.substr(0, i), baseReference.uri);
00816 fragmentIdentifier = uriReference.substr(i);
00817
00818 validateFragmentIdentifier(fragmentIdentifier, uriReference, i);
00819 }
00820 }
00821
00822
00823 string UriReference::toString() const
00824 {
00825 return uri.toString() + fragmentIdentifier;
00826 }
00827
00828
00829 ostream &operator<<(ostream &out, const UriReference &r)
00830 {
00831 out << r.toString();
00832 return out;
00833 }
00834 }
00835 }