00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020 #include <errno.h>
00021 #include <string.h>
00022 #include <iconv.h>
00023 #include <iostream>
00024
00025 #include <archon/util/exception.H>
00026 #include <archon/util/text.H>
00027 #include <archon/util/charenc.H>
00028
00029 using namespace std;
00030
00031 namespace
00032 {
/**
 * Tell whether the lowest addressed byte of a multi-byte integer is
 * also its least significant byte on this machine.
 *
 * \return True if a short holding the value 1 has a non-zero first
 * byte in memory.
 */
inline bool isLittleEndianArch()
{
  const short probe = 1;
  const char *const firstByte = reinterpret_cast<const char *>(&probe);
  return *firstByte != 0;
}
00038
00039 inline string getUcsEncoding()
00040 {
00041 if(sizeof(wchar_t) == 2) return isLittleEndianArch() ? "UCS-2LE" : "UCS-2BE";
00042 if(sizeof(wchar_t) == 4) return isLittleEndianArch() ? "UCS-4LE" : "UCS-4BE";
00043 ARCHON_THROW1(Archon::Utilities::ResourceException,
00044 "Unsupported number of bits in wchar_t");
00045 }
00046 }
00047
00048 namespace Archon
00049 {
00050 namespace Utilities
00051 {
00052 const string CharEnc::US_ASCII = "US-ASCII";
00053 const string CharEnc::ISO_8859_1 = "ISO-8859-1";
00054 const string CharEnc::ISO_8859_15 = "ISO-8859-15";
00055 const string CharEnc::UTF_8 = "UTF-8";
00056 const string CharEnc::UTF_16LE = "UTF-16LE";
00057 const string CharEnc::UTF_16BE = "UTF-16BE";
00058 const string CharEnc::UTF_32LE = "UTF-32LE";
00059 const string CharEnc::UTF_32BE = "UTF-32BE";
00060 const string CharEnc::WINDOWS_1252 = "WINDOWS-1252";
00061
00062 string CharEnc::encode(wstring s, string encoding, bool fail) throw(TranscodeException)
00063 {
00064 if(s.empty()) return string();
00065
00066 CharEnc transcoder(getUcsEncoding(), encoding);
00067 const char *inbuf = reinterpret_cast<const char *>(s.data());
00068 size_t inbytesleft = s.size()*sizeof(wchar_t);
00069 const size_t buffersize = 512;
00070 char buffer[buffersize];
00071
00072 string t;
00073 while(inbytesleft)
00074 {
00075 char *outbuf = buffer;
00076 size_t outbytesleft = buffersize;
00077 transcoder.transcode(inbuf, inbytesleft, outbuf, outbytesleft, fail);
00078 t.append(buffer, buffersize - outbytesleft);
00079 }
00080
00081 return t;
00082 }
00083
00084 wstring CharEnc::decode(string s, string encoding, bool fail) throw(TranscodeException)
00085 {
00086 if(s.empty()) return wstring();
00087
00088 CharEnc transcoder(encoding, getUcsEncoding());
00089 const char *inbuf = s.data();
00090 size_t inbytesleft = s.size();
00091 const size_t buffersize = 512;
00092 char buffer[buffersize];
00093
00094 wstring t;
00095 while(inbytesleft)
00096 {
00097 char *outbuf = buffer;
00098 size_t outbytesleft = buffersize;
00099 transcoder.transcode(inbuf, inbytesleft, outbuf, outbytesleft, fail);
00100 t.append(reinterpret_cast<wchar_t *>(buffer), (buffersize - outbytesleft)/sizeof(wchar_t));
00101 }
00102
00103 return t;
00104 }
00105
00106 string CharEnc::transcode(string s, string fromEnc, string toEnc, bool fail) throw(TranscodeException)
00107 {
00108 if(s.empty()) return string();
00109
00110 CharEnc transcoder(fromEnc, toEnc);
00111 const char *inbuf = s.data();
00112 size_t inbytesleft = s.size();
00113 const size_t buffersize = 512;
00114 char buffer[buffersize];
00115
00116 string t;
00117 while(inbytesleft)
00118 {
00119 char *outbuf = buffer;
00120 size_t outbytesleft = buffersize;
00121 transcoder.transcode(inbuf, inbytesleft, outbuf, outbytesleft, fail);
00122 t.append(buffer, buffersize - outbytesleft);
00123 }
00124
00125 return t;
00126 }
00127
00128 CharEnc::CharEnc(string fromEnc, string toEnc):
00129 targetEncoding(toEnc), needReplacementCharacter(true)
00130 {
00131 if(numeric_limits<unsigned char>::digits != 8)
00132 ARCHON_THROW1(ResourceException,
00133 "Unsupported number of bits in char: " +
00134 Text::toString(numeric_limits<unsigned char>::digits));
00135 iconvState = iconv_open(toEnc.c_str(), fromEnc.c_str());
00136 if(iconvState==reinterpret_cast<iconv_t>(-1))
00137 ARCHON_THROW1(ResourceException,
00138 string("Unexpected error from 'iconv_open': ") +
00139 strerror(errno));
00140 }
00141
00142 CharEnc::~CharEnc()
00143 {
00144 iconv_close(iconvState);
00145 }
00146
00167 void CharEnc::transcode(const char *&in, size_t &inbytes, char *&out, size_t &outbytes, bool fail) throw(TranscodeException)
00168 {
00169 char *inbuf = const_cast<char *>(in);
00170 size_t inbytesleft = inbytes;
00171 char *outbuf = out;
00172 size_t outbytesleft = outbytes;
00173
00174 while(inbytesleft)
00175 {
00176 size_t n = iconv(iconvState, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
00177 if(n==static_cast<size_t>(-1))
00178 {
00179 if(errno == E2BIG || errno == EINVAL) break;
00180
00181 if(fail) ARCHON_THROW(TranscodeException);
00182
00183 if(needReplacementCharacter)
00184 {
00185 try
00186 {
00187 replacementCharacter = CharEnc::transcode("\xEF\xBF\xBD", UTF_8, targetEncoding, true);
00188 }
00189 catch(TranscodeException &)
00190 {
00191 try
00192 {
00193 replacementCharacter = CharEnc::transcode("?", US_ASCII, targetEncoding, true);
00194 }
00195 catch(TranscodeException &)
00196 {
00197 try
00198 {
00199 replacementCharacter = CharEnc::transcode(" ", US_ASCII, targetEncoding, true);
00200 }
00201 catch(TranscodeException &)
00202 {
00203 replacementCharacter = "";
00204 }
00205 }
00206 }
00207 needReplacementCharacter = false;
00208 }
00209
00210 n = replacementCharacter.size();
00211 if(outbytesleft < n) break;
00212 if(n) replacementCharacter.copy(outbuf, string::npos);
00213 outbytesleft -= n;
00214 outbuf += n;
00215 --inbytesleft;
00216 ++inbuf;
00217 }
00218 }
00219
00220 in = inbuf;
00221 inbytes = inbytesleft;
00222 out = outbuf;
00223 outbytes = outbytesleft;
00224 }
00225 }
00226 }