charenc.C

00001 /*
00002  * This file is part of the "Archon" framework.
00003  * (http://files3d.sourceforge.net)
00004  *
00005  * Copyright © 2002 by Kristian Spangsege and Brian Kristiansen.
00006  *
00007  * Permission to use, copy, modify, and distribute this software and
00008  * its documentation under the terms of the GNU General Public License is
00009  * hereby granted. No representations are made about the suitability of
00010  * this software for any purpose. It is provided "as is" without express
00011  * or implied warranty. See the GNU General Public License
00012  * (http://www.gnu.org/copyleft/gpl.html) for more details.
00013  *
00014  * The characters in this file are ISO8859-1 encoded.
00015  *
00016  * The documentation in this file is in "Doxygen" style
00017  * (http://www.doxygen.org).
00018  */
00019 
00020 #include <errno.h>
00021 #include <string.h>
00022 #include <iconv.h>
00023 #include <iostream>
00024 
00025 #include <archon/util/exception.H>
00026 #include <archon/util/text.H>
00027 #include <archon/util/charenc.H>
00028 
00029 using namespace std;
00030 
00031 namespace
00032 {
00033   inline bool isLittleEndianArch()
00034   {
00035     short s = 1;
00036     return *reinterpret_cast<char *>(&s);
00037   }
00038 
00039   inline string getUcsEncoding()
00040   {
00041       if(sizeof(wchar_t) == 2) return isLittleEndianArch() ? "UCS-2LE" : "UCS-2BE";
00042       if(sizeof(wchar_t) == 4) return isLittleEndianArch() ? "UCS-4LE" : "UCS-4BE";
00043       ARCHON_THROW1(Archon::Utilities::ResourceException,
00044                     "Unsupported number of bits in wchar_t");
00045   }
00046 }
00047 
00048 namespace Archon
00049 {
00050   namespace Utilities
00051   {
00052     const string CharEnc::US_ASCII     = "US-ASCII";
00053     const string CharEnc::ISO_8859_1   = "ISO-8859-1";
00054     const string CharEnc::ISO_8859_15  = "ISO-8859-15";
00055     const string CharEnc::UTF_8        = "UTF-8";
00056     const string CharEnc::UTF_16LE     = "UTF-16LE";
00057     const string CharEnc::UTF_16BE     = "UTF-16BE";
00058     const string CharEnc::UTF_32LE     = "UTF-32LE";
00059     const string CharEnc::UTF_32BE     = "UTF-32BE";
00060     const string CharEnc::WINDOWS_1252 = "WINDOWS-1252";
00061 
00062     string CharEnc::encode(wstring s, string encoding, bool fail) throw(TranscodeException)
00063     {
00064       if(s.empty()) return string();
00065 
00066       CharEnc transcoder(getUcsEncoding(), encoding);
00067       const char *inbuf  = reinterpret_cast<const char *>(s.data());
00068       size_t inbytesleft = s.size()*sizeof(wchar_t);
00069       const size_t buffersize = 512;
00070       char buffer[buffersize];
00071 
00072       string t;
00073       while(inbytesleft)
00074       {
00075         char *outbuf = buffer;
00076         size_t outbytesleft = buffersize;       
00077         transcoder.transcode(inbuf, inbytesleft, outbuf, outbytesleft, fail);
00078         t.append(buffer, buffersize - outbytesleft);
00079       }
00080 
00081       return t;
00082     }
00083 
00084     wstring CharEnc::decode(string s, string encoding, bool fail) throw(TranscodeException)
00085     {
00086       if(s.empty()) return wstring();
00087 
00088       CharEnc transcoder(encoding, getUcsEncoding());
00089       const char *inbuf  = s.data();
00090       size_t inbytesleft = s.size();
00091       const size_t buffersize = 512;
00092       char buffer[buffersize];
00093 
00094       wstring t;
00095       while(inbytesleft)
00096       {
00097         char *outbuf = buffer;
00098         size_t outbytesleft = buffersize;       
00099         transcoder.transcode(inbuf, inbytesleft, outbuf, outbytesleft, fail);
00100         t.append(reinterpret_cast<wchar_t *>(buffer), (buffersize - outbytesleft)/sizeof(wchar_t));
00101       }
00102 
00103       return t;
00104     }
00105 
00106     string CharEnc::transcode(string s, string fromEnc, string toEnc, bool fail) throw(TranscodeException)
00107     {
00108       if(s.empty()) return string();
00109 
00110       CharEnc transcoder(fromEnc, toEnc);
00111       const char *inbuf  = s.data();
00112       size_t inbytesleft = s.size();
00113       const size_t buffersize = 512;
00114       char buffer[buffersize];
00115 
00116       string t;
00117       while(inbytesleft)
00118       {
00119         char *outbuf = buffer;
00120         size_t outbytesleft = buffersize;       
00121         transcoder.transcode(inbuf, inbytesleft, outbuf, outbytesleft, fail);
00122         t.append(buffer, buffersize - outbytesleft);
00123       }
00124 
00125       return t;
00126     }
00127 
00128     CharEnc::CharEnc(string fromEnc, string toEnc):
00129       targetEncoding(toEnc), needReplacementCharacter(true)
00130     {
00131       if(numeric_limits<unsigned char>::digits != 8)
00132         ARCHON_THROW1(ResourceException,
00133                       "Unsupported number of bits in char: " +
00134                       Text::toString(numeric_limits<unsigned char>::digits));
00135       iconvState = iconv_open(toEnc.c_str(), fromEnc.c_str());
00136       if(iconvState==reinterpret_cast<iconv_t>(-1))
00137         ARCHON_THROW1(ResourceException,
00138                       string("Unexpected error from 'iconv_open': ") +
00139                       strerror(errno));
00140     }
00141 
00142     CharEnc::~CharEnc()
00143     {
00144       iconv_close(iconvState);
00145     }
00146 
00167     void CharEnc::transcode(const char *&in, size_t &inbytes, char *&out, size_t &outbytes, bool fail) throw(TranscodeException)
00168     {
00169       char *inbuf = const_cast<char *>(in);
00170       size_t inbytesleft = inbytes;
00171       char *outbuf = out;
00172       size_t outbytesleft = outbytes;
00173 
00174       while(inbytesleft)
00175       {
00176         size_t n = iconv(iconvState, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
00177         if(n==static_cast<size_t>(-1))
00178         {
00179           if(errno == E2BIG || errno == EINVAL) break;
00180 
00181           if(fail) ARCHON_THROW(TranscodeException);
00182 
00183           if(needReplacementCharacter)
00184           {
00185             try
00186             {
00187               replacementCharacter = CharEnc::transcode("\xEF\xBF\xBD", UTF_8, targetEncoding, true);
00188             }
00189             catch(TranscodeException &)
00190             {
00191               try
00192               {
00193                 replacementCharacter = CharEnc::transcode("?", US_ASCII, targetEncoding, true);
00194               }
00195               catch(TranscodeException &)
00196               {
00197                 try
00198                 {
00199                   replacementCharacter = CharEnc::transcode(" ", US_ASCII, targetEncoding, true);
00200                 }
00201                 catch(TranscodeException &)
00202                 {
00203                   replacementCharacter = "";
00204                 }
00205               }
00206             }
00207             needReplacementCharacter = false;
00208           }
00209 
00210           n = replacementCharacter.size();
00211           if(outbytesleft < n) break;
00212           if(n) replacementCharacter.copy(outbuf, string::npos);
00213           outbytesleft -= n;
00214           outbuf += n;
00215           --inbytesleft;
00216           ++inbuf;
00217         }
00218       }
00219 
00220       in       = inbuf;
00221       inbytes  = inbytesleft;
00222       out      = outbuf;
00223       outbytes = outbytesleft;
00224     }
00225   }
00226 }

Generated on Sun Jul 30 22:55:44 2006 for Archon by  doxygen 1.4.4