root/lib/util/charset/charcnv.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. charset_name
  2. close_iconv_convenience
  3. smb_iconv_convenience_init
  4. get_conv_handle
  5. iconv_talloc
  6. convert_string_convenience
  7. convert_string_talloc_convenience
  8. next_codepoint_convenience
  9. push_codepoint

   1 /* 
   2    Unix SMB/CIFS implementation.
   3    Character set conversion Extensions
   4    Copyright (C) Igor Vergeichik <iverg@mail.ru> 2001
   5    Copyright (C) Andrew Tridgell 2001
   6    Copyright (C) Simo Sorce 2001
   7    Copyright (C) Jelmer Vernooij 2007
   8    
   9    This program is free software; you can redistribute it and/or modify
  10    it under the terms of the GNU General Public License as published by
  11    the Free Software Foundation; either version 3 of the License, or
  12    (at your option) any later version.
  13    
  14    This program is distributed in the hope that it will be useful,
  15    but WITHOUT ANY WARRANTY; without even the implied warranty of
  16    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17    GNU General Public License for more details.
  18    
  19    You should have received a copy of the GNU General Public License
  20    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  21 
  22 */
  23 #include "includes.h"
  24 #include "system/iconv.h"
  25 
  26 /**
  27  * @file
  28  *
  29  * @brief Character-set conversion routines built on our iconv.
  30  * 
  31  * @note Samba's internal character set (at least in the 3.0 series)
  32  * is always the same as the one for the Unix filesystem.  It is
  33  * <b>not</b> necessarily UTF-8 and may be different on machines that
  34  * need i18n filenames to be compatible with Unix software.  It does
  35  * have to be a superset of ASCII.  All multibyte sequences must start
  36  * with a byte with the high bit set.
  37  *
  38  * @sa lib/iconv.c
  39  */
  40 
  41 struct smb_iconv_convenience {
  42         const char *unix_charset;
  43         const char *dos_charset;
  44         bool native_iconv;
  45         smb_iconv_t conv_handles[NUM_CHARSETS][NUM_CHARSETS];
  46 };
  47 
  48 
  49 /**
  50  * Return the name of a charset to give to iconv().
  51  **/
  52 static const char *charset_name(struct smb_iconv_convenience *ic, charset_t ch)
     /* [<][>][^][v][top][bottom][index][help] */
  53 {
  54         switch (ch) {
  55         case CH_UTF16: return "UTF-16LE";
  56         case CH_UNIX: return ic->unix_charset;
  57         case CH_DOS: return ic->dos_charset;
  58         case CH_UTF8: return "UTF8";
  59         case CH_UTF16BE: return "UTF-16BE";
  60         case CH_UTF16MUNGED: return "UTF16_MUNGED";
  61         default:
  62         return "ASCII";
  63         }
  64 }
  65 
  66 /**
  67  re-initialize iconv conversion descriptors
  68 **/
  69 static int close_iconv_convenience(struct smb_iconv_convenience *data)
     /* [<][>][^][v][top][bottom][index][help] */
  70 {
  71         unsigned c1, c2;
  72         for (c1=0;c1<NUM_CHARSETS;c1++) {
  73                 for (c2=0;c2<NUM_CHARSETS;c2++) {
  74                         if (data->conv_handles[c1][c2] != NULL) {
  75                                 if (data->conv_handles[c1][c2] != (smb_iconv_t)-1) {
  76                                         smb_iconv_close(data->conv_handles[c1][c2]);
  77                                 }
  78                                 data->conv_handles[c1][c2] = NULL;
  79                         }
  80                 }
  81         }
  82 
  83         return 0;
  84 }
  85 
  86 _PUBLIC_ struct smb_iconv_convenience *smb_iconv_convenience_init(TALLOC_CTX *mem_ctx,
     /* [<][>][^][v][top][bottom][index][help] */
  87                                                          const char *dos_charset,
  88                                                          const char *unix_charset,
  89                                                          bool native_iconv)
  90 {
  91         struct smb_iconv_convenience *ret = talloc_zero(mem_ctx, 
  92                                         struct smb_iconv_convenience);
  93 
  94         if (ret == NULL) {
  95                 return NULL;
  96         }
  97 
  98         talloc_set_destructor(ret, close_iconv_convenience);
  99 
 100         ret->dos_charset = talloc_strdup(ret, dos_charset);
 101         ret->unix_charset = talloc_strdup(ret, unix_charset);
 102         ret->native_iconv = native_iconv;
 103 
 104         return ret;
 105 }
 106 
 107 /*
 108   on-demand initialisation of conversion handles
 109 */
 110 static smb_iconv_t get_conv_handle(struct smb_iconv_convenience *ic,
     /* [<][>][^][v][top][bottom][index][help] */
 111                                    charset_t from, charset_t to)
 112 {
 113         const char *n1, *n2;
 114         static bool initialised;
 115 
 116         if (initialised == false) {
 117                 initialised = true;
 118                 
 119 #ifdef LC_ALL
 120                 /* we set back the locale to C to get ASCII-compatible
 121                    toupper/lower functions.  For now we do not need
 122                    any other POSIX localisations anyway. When we
 123                    should really need localized string functions one
 124                    day we need to write our own ascii_tolower etc.
 125                 */
 126                 setlocale(LC_ALL, "C");
 127 #endif
 128         }
 129 
 130         if (ic->conv_handles[from][to]) {
 131                 return ic->conv_handles[from][to];
 132         }
 133 
 134         n1 = charset_name(ic, from);
 135         n2 = charset_name(ic, to);
 136 
 137         ic->conv_handles[from][to] = smb_iconv_open_ex(ic, n2, n1, 
 138                                                        ic->native_iconv);
 139         
 140         if (ic->conv_handles[from][to] == (smb_iconv_t)-1) {
 141                 if ((from == CH_DOS || to == CH_DOS) &&
 142                     strcasecmp(charset_name(ic, CH_DOS), "ASCII") != 0) {
 143                         DEBUG(0,("dos charset '%s' unavailable - using ASCII\n",
 144                                  charset_name(ic, CH_DOS)));
 145                         ic->dos_charset = "ASCII";
 146 
 147                         n1 = charset_name(ic, from);
 148                         n2 = charset_name(ic, to);
 149                         
 150                         ic->conv_handles[from][to] = 
 151                                 smb_iconv_open_ex(ic, n2, n1, ic->native_iconv);
 152                 }
 153         }
 154 
 155         return ic->conv_handles[from][to];
 156 }
 157 
 158 /**
 159  * Convert string from one encoding to another, making error checking etc
 160  *
 161  * @param mem_ctx Memory context
 162  * @param cd Iconv handle
 163  * @param src pointer to source string (multibyte or singlebyte)
 164  * @param srclen length of the source string in bytes
 165  * @param dest pointer to destination string (multibyte or singlebyte)
 166  * @param destlen maximal length allowed for string
 167  * @returns the number of bytes occupied in the destination
 168  **/
 169 _PUBLIC_ ssize_t iconv_talloc(TALLOC_CTX *ctx, 
     /* [<][>][^][v][top][bottom][index][help] */
 170                                        smb_iconv_t cd,
 171                                        void const *src, size_t srclen, 
 172                                        void *dst)
 173 {
 174         size_t i_len, o_len, destlen;
 175         void **dest = (void **)dst;
 176         size_t retval;
 177         const char *inbuf = (const char *)src;
 178         char *outbuf, *ob;
 179 
 180         *dest = NULL;
 181 
 182         /* it is _very_ rare that a conversion increases the size by
 183            more than 3x */
 184         destlen = srclen;
 185         outbuf = NULL;
 186 convert:
 187         destlen = 2 + (destlen*3);
 188         ob = talloc_realloc(ctx, outbuf, char, destlen);
 189         if (!ob) {
 190                 DEBUG(0, ("iconv_talloc: realloc failed!\n"));
 191                 talloc_free(outbuf);
 192                 return (size_t)-1;
 193         } else {
 194                 outbuf = ob;
 195         }
 196 
 197         /* we give iconv 2 less bytes to allow us to terminate at the
 198            end */
 199         i_len = srclen;
 200         o_len = destlen-2;
 201         retval = smb_iconv(cd,
 202                            &inbuf, &i_len,
 203                            &outbuf, &o_len);
 204         if(retval == (size_t)-1)                {
 205                 const char *reason="unknown error";
 206                 switch(errno) {
 207                         case EINVAL:
 208                                 reason="Incomplete multibyte sequence";
 209                                 break;
 210                         case E2BIG:
 211                                 goto convert;           
 212                         case EILSEQ:
 213                                 reason="Illegal multibyte sequence";
 214                                 break;
 215                 }
 216                 DEBUG(0,("Conversion error: %s(%s)\n",reason,inbuf));
 217                 talloc_free(ob);
 218                 return (size_t)-1;
 219         }
 220         
 221         destlen = (destlen-2) - o_len;
 222 
 223         /* guarantee null termination in all charsets */
 224         SSVAL(ob, destlen, 0);
 225 
 226         *dest = ob;
 227 
 228         return destlen;
 229 
 230 }
 231 
 232 /**
 233  * Convert string from one encoding to another, making error checking etc
 234  *
 235  * @param src pointer to source string (multibyte or singlebyte)
 236  * @param srclen length of the source string in bytes
 237  * @param dest pointer to destination string (multibyte or singlebyte)
 238  * @param destlen maximal length allowed for string
 239  * @returns the number of bytes occupied in the destination
 240  **/
 241 _PUBLIC_ bool convert_string_convenience(struct smb_iconv_convenience *ic,
     /* [<][>][^][v][top][bottom][index][help] */
 242                                 charset_t from, charset_t to,
 243                                 void const *src, size_t srclen, 
 244                                 void *dest, size_t destlen, size_t *converted_size,
 245                                 bool allow_badcharcnv)
 246 {
 247         size_t i_len, o_len;
 248         size_t retval;
 249         const char* inbuf = (const char*)src;
 250         char* outbuf = (char*)dest;
 251         smb_iconv_t descriptor;
 252 
 253         if (allow_badcharcnv) {
 254                 /* Not implemented yet */
 255                 return false;
 256         }
 257 
 258         if (srclen == (size_t)-1)
 259                 srclen = strlen(inbuf)+1;
 260 
 261         descriptor = get_conv_handle(ic, from, to);
 262 
 263         if (descriptor == (smb_iconv_t)-1 || descriptor == (smb_iconv_t)0) {
 264                 /* conversion not supported, use as is */
 265                 size_t len = MIN(srclen,destlen);
 266                 memcpy(dest,src,len);
 267                 *converted_size = len;
 268                 return true;
 269         }
 270 
 271         i_len=srclen;
 272         o_len=destlen;
 273         retval = smb_iconv(descriptor,  &inbuf, &i_len, &outbuf, &o_len);
 274         if(retval==(size_t)-1) {
 275                 const char *reason;
 276                 switch(errno) {
 277                         case EINVAL:
 278                                 reason="Incomplete multibyte sequence";
 279                                 return false;
 280                         case E2BIG:
 281                                 reason="No more room"; 
 282                                 if (from == CH_UNIX) {
 283                                         DEBUG(0,("E2BIG: convert_string(%s,%s): srclen=%d destlen=%d - '%s'\n",
 284                                                  charset_name(ic, from), charset_name(ic, to),
 285                                                  (int)srclen, (int)destlen, 
 286                                                  (const char *)src));
 287                                 } else {
 288                                         DEBUG(0,("E2BIG: convert_string(%s,%s): srclen=%d destlen=%d\n",
 289                                                  charset_name(ic, from), charset_name(ic, to),
 290                                                  (int)srclen, (int)destlen));
 291                                 }
 292                                return false;
 293                         case EILSEQ:
 294                                reason="Illegal multibyte sequence";
 295                                return false;
 296                 }
 297                 /* smb_panic(reason); */
 298         }
 299         if (converted_size != NULL)
 300                 *converted_size = destlen-o_len;
 301         return true;
 302 }
 303         
 304 /**
 305  * Convert between character sets, allocating a new buffer using talloc for the result.
 306  *
 307  * @param srclen length of source buffer.
 308  * @param dest always set at least to NULL
 309  * @note -1 is not accepted for srclen.
 310  *
 311  * @returns Size in bytes of the converted string; or -1 in case of error.
 312  **/
 313 
 314 _PUBLIC_ bool convert_string_talloc_convenience(TALLOC_CTX *ctx, 
     /* [<][>][^][v][top][bottom][index][help] */
 315                                        struct smb_iconv_convenience *ic, 
 316                                        charset_t from, charset_t to, 
 317                                        void const *src, size_t srclen, 
 318                                        void *dst, size_t *converted_size, 
 319                                            bool allow_badcharcnv)
 320 {
 321         void **dest = (void **)dst;
 322         smb_iconv_t descriptor;
 323         ssize_t ret;
 324 
 325         if (allow_badcharcnv)
 326                 return false; /* Not implemented yet */
 327 
 328         *dest = NULL;
 329 
 330         if (src == NULL || srclen == (size_t)-1 || srclen == 0)
 331                 return false;
 332 
 333         descriptor = get_conv_handle(ic, from, to);
 334 
 335         if (descriptor == (smb_iconv_t)-1 || descriptor == (smb_iconv_t)0) {
 336                 /* conversion not supported, return -1*/
 337                 DEBUG(3, ("convert_string_talloc: conversion from %s to %s not supported!\n",
 338                           charset_name(ic, from), 
 339                           charset_name(ic, to)));
 340                 return false;
 341         }
 342 
 343         ret = iconv_talloc(ctx, descriptor, src, srclen, dest);
 344         if (ret == -1)
 345                 return false;
 346         if (converted_size != NULL)
 347                 *converted_size = ret;
 348         return true;
 349 }
 350 
 351 /*
 352   return the unicode codepoint for the next multi-byte CH_UNIX character
 353   in the string
 354 
 355   also return the number of bytes consumed (which tells the caller
 356   how many bytes to skip to get to the next CH_UNIX character)
 357 
 358   return INVALID_CODEPOINT if the next character cannot be converted
 359 */
 360 _PUBLIC_ codepoint_t next_codepoint_convenience(struct smb_iconv_convenience *ic, 
     /* [<][>][^][v][top][bottom][index][help] */
 361                                     const char *str, size_t *size)
 362 {
 363         /* it cannot occupy more than 4 bytes in UTF16 format */
 364         uint8_t buf[4];
 365         smb_iconv_t descriptor;
 366         size_t ilen_orig;
 367         size_t ilen;
 368         size_t olen;
 369         char *outbuf;
 370 
 371         if ((str[0] & 0x80) == 0) {
 372                 *size = 1;
 373                 return (codepoint_t)str[0];
 374         }
 375 
 376         /* we assume that no multi-byte character can take
 377            more than 5 bytes. This is OK as we only
 378            support codepoints up to 1M */
 379         ilen_orig = strnlen(str, 5);
 380         ilen = ilen_orig;
 381 
 382         descriptor = get_conv_handle(ic, CH_UNIX, CH_UTF16);
 383         if (descriptor == (smb_iconv_t)-1) {
 384                 *size = 1;
 385                 return INVALID_CODEPOINT;
 386         }
 387 
 388         /* this looks a little strange, but it is needed to cope
 389            with codepoints above 64k */
 390         olen = 2;
 391         outbuf = (char *)buf;
 392         smb_iconv(descriptor, &str, &ilen, &outbuf, &olen);
 393         if (olen == 2) {
 394                 olen = 4;
 395                 outbuf = (char *)buf;
 396                 smb_iconv(descriptor,  &str, &ilen, &outbuf, &olen);
 397                 if (olen == 4) {
 398                         /* we didn't convert any bytes */
 399                         *size = 1;
 400                         return INVALID_CODEPOINT;
 401                 }
 402                 olen = 4 - olen;
 403         } else {
 404                 olen = 2 - olen;
 405         }
 406 
 407         *size = ilen_orig - ilen;
 408 
 409         if (olen == 2) {
 410                 return (codepoint_t)SVAL(buf, 0);
 411         }
 412         if (olen == 4) {
 413                 /* decode a 4 byte UTF16 character manually */
 414                 return (codepoint_t)0x10000 + 
 415                         (buf[2] | ((buf[3] & 0x3)<<8) | 
 416                          (buf[0]<<10) | ((buf[1] & 0x3)<<18));
 417         }
 418 
 419         /* no other length is valid */
 420         return INVALID_CODEPOINT;
 421 }
 422 
 423 /*
 424   push a single codepoint into a CH_UNIX string the target string must
 425   be able to hold the full character, which is guaranteed if it is at
 426   least 5 bytes in size. The caller may pass less than 5 bytes if they
 427   are sure the character will fit (for example, you can assume that
 428   uppercase/lowercase of a character will not add more than 1 byte)
 429 
 430   return the number of bytes occupied by the CH_UNIX character, or
 431   -1 on failure
 432 */
 433 _PUBLIC_ ssize_t push_codepoint(struct smb_iconv_convenience *ic, 
     /* [<][>][^][v][top][bottom][index][help] */
 434                                 char *str, codepoint_t c)
 435 {
 436         smb_iconv_t descriptor;
 437         uint8_t buf[4];
 438         size_t ilen, olen;
 439         const char *inbuf;
 440         
 441         if (c < 128) {
 442                 *str = c;
 443                 return 1;
 444         }
 445 
 446         descriptor = get_conv_handle(ic, 
 447                                      CH_UTF16, CH_UNIX);
 448         if (descriptor == (smb_iconv_t)-1) {
 449                 return -1;
 450         }
 451 
 452         if (c < 0x10000) {
 453                 ilen = 2;
 454                 olen = 5;
 455                 inbuf = (char *)buf;
 456                 SSVAL(buf, 0, c);
 457                 smb_iconv(descriptor, &inbuf, &ilen, &str, &olen);
 458                 if (ilen != 0) {
 459                         return -1;
 460                 }
 461                 return 5 - olen;
 462         }
 463 
 464         c -= 0x10000;
 465 
 466         buf[0] = (c>>10) & 0xFF;
 467         buf[1] = (c>>18) | 0xd8;
 468         buf[2] = c & 0xFF;
 469         buf[3] = ((c>>8) & 0x3) | 0xdc;
 470 
 471         ilen = 4;
 472         olen = 5;
 473         inbuf = (char *)buf;
 474 
 475         smb_iconv(descriptor, &inbuf, &ilen, &str, &olen);
 476         if (ilen != 0) {
 477                 return -1;
 478         }
 479         return 5 - olen;
 480 }

/* [<][>][^][v][top][bottom][index][help] */