root/lib/util/charset/iconv.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes following definitions.
  1. charset_register_backend
  2. sys_iconv
  3. smb_iconv
  4. is_utf16
  5. smb_iconv_open_ex
  6. smb_iconv_open
  7. smb_iconv_close
  8. ascii_pull
  9. ascii_push
  10. latin1_push
  11. ucs2hex_pull
  12. ucs2hex_push
  13. iconv_swab
  14. iconv_copy
  15. utf8_pull
  16. utf8_push
  17. utf16_munged_pull

   1 /* 
   2    Unix SMB/CIFS implementation.
   3    minimal iconv implementation
   4    Copyright (C) Andrew Tridgell 2001
   5    Copyright (C) Jelmer Vernooij 2002
   6    
   7    This program is free software; you can redistribute it and/or modify
   8    it under the terms of the GNU General Public License as published by
   9    the Free Software Foundation; either version 3 of the License, or
  10    (at your option) any later version.
  11    
  12    This program is distributed in the hope that it will be useful,
  13    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15    GNU General Public License for more details.
  16    
  17    You should have received a copy of the GNU General Public License
  18    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  19 */
  20 
  21 #include "includes.h"
  22 #include "../lib/util/dlinklist.h"
  23 #include "system/iconv.h"
  24 #include "system/filesys.h"
  25 #undef strcasecmp
  26 
  27 
  28 /**
  29  * @file
  30  *
  31  * @brief Samba wrapper/stub for iconv character set conversion.
  32  *
  33  * iconv is the XPG2 interface for converting between character
  34  * encodings.  This file provides a Samba wrapper around it, and also
  35  * a simple reimplementation that is used if the system does not
  36  * implement iconv.
  37  *
  38  * Samba only works with encodings that are supersets of ASCII: ascii
  39  * characters like whitespace can be tested for directly, multibyte
  40  * sequences start with a byte with the high bit set, and strings are
  41  * terminated by a nul byte.
  42  *
  43  * Note that the only function provided by iconv is conversion between
  44  * characters.  It doesn't directly support operations like
  45  * uppercasing or comparison.  We have to convert to UTF-16LE and
  46  * compare there.
  47  *
  48  * @sa Samba Developers Guide
  49  **/
  50 
  51 static size_t ascii_pull  (void *,const char **, size_t *, char **, size_t *);
  52 static size_t ascii_push  (void *,const char **, size_t *, char **, size_t *);
  53 static size_t latin1_push (void *,const char **, size_t *, char **, size_t *);
  54 static size_t utf8_pull   (void *,const char **, size_t *, char **, size_t *);
  55 static size_t utf8_push   (void *,const char **, size_t *, char **, size_t *);
  56 static size_t utf16_munged_pull(void *,const char **, size_t *, char **, size_t *);
  57 static size_t ucs2hex_pull(void *,const char **, size_t *, char **, size_t *);
  58 static size_t ucs2hex_push(void *,const char **, size_t *, char **, size_t *);
  59 static size_t iconv_copy  (void *,const char **, size_t *, char **, size_t *);
  60 static size_t iconv_swab  (void *,const char **, size_t *, char **, size_t *);
  61 
  62 static const struct charset_functions builtin_functions[] = {
  63         /* windows is closest to UTF-16 */
  64         {"UCS-2LE",  iconv_copy, iconv_copy},
  65         {"UTF-16LE",  iconv_copy, iconv_copy},
  66         {"UCS-2BE",  iconv_swab, iconv_swab},
  67         {"UTF-16BE",  iconv_swab, iconv_swab},
  68 
  69         /* we include the UTF-8 alias to cope with differing locale settings */
  70         {"UTF8",   utf8_pull,  utf8_push},
  71         {"UTF-8",   utf8_pull,  utf8_push},
  72 
  73         /* this handles the munging needed for String2Key */
  74         {"UTF16_MUNGED",   utf16_munged_pull,  iconv_copy},
  75 
  76         {"ASCII", ascii_pull, ascii_push},
  77         {"646", ascii_pull, ascii_push},
  78         {"ISO-8859-1", ascii_pull, latin1_push},
  79         {"UCS2-HEX", ucs2hex_pull, ucs2hex_push}
  80 };
  81 
  82 static struct charset_functions *charsets = NULL;
  83 
  84 bool charset_register_backend(const void *_funcs) 
     /* [<][>][^][v][top][bottom][index][help] */
  85 {
  86         struct charset_functions *funcs = (struct charset_functions *)memdup(_funcs,sizeof(struct charset_functions));
  87         struct charset_functions *c;
  88 
  89         /* Check whether we already have this charset... */
  90         for (c = charsets; c != NULL; c = c->next) {
  91                 if(!strcasecmp(c->name, funcs->name)) { 
  92                         DEBUG(2, ("Duplicate charset %s, not registering\n", funcs->name));
  93                         return false;
  94                 }
  95         }
  96 
  97         funcs->next = funcs->prev = NULL;
  98         DLIST_ADD(charsets, funcs);
  99         return true;
 100 }
 101 
 102 #ifdef HAVE_NATIVE_ICONV
 103 /* if there was an error then reset the internal state,
 104    this ensures that we don't have a shift state remaining for
 105    character sets like SJIS */
 106 static size_t sys_iconv(void *cd, 
     /* [<][>][^][v][top][bottom][index][help] */
 107                         const char **inbuf, size_t *inbytesleft,
 108                         char **outbuf, size_t *outbytesleft)
 109 {
 110         size_t ret = iconv((iconv_t)cd, 
 111                            discard_const_p(char *, inbuf), inbytesleft, 
 112                            outbuf, outbytesleft);
 113         if (ret == (size_t)-1) iconv(cd, NULL, NULL, NULL, NULL);
 114         return ret;
 115 }
 116 #endif
 117 
 118 /**
 119  * This is a simple portable iconv() implementaion.
 120  *
 121  * It only knows about a very small number of character sets - just
 122  * enough that Samba works on systems that don't have iconv.
 123  **/
 124 _PUBLIC_ size_t smb_iconv(smb_iconv_t cd, 
     /* [<][>][^][v][top][bottom][index][help] */
 125                  const char **inbuf, size_t *inbytesleft,
 126                  char **outbuf, size_t *outbytesleft)
 127 {
 128         char cvtbuf[2048];
 129         size_t bufsize;
 130 
 131         /* in many cases we can go direct */
 132         if (cd->direct) {
 133                 return cd->direct(cd->cd_direct, 
 134                                   inbuf, inbytesleft, outbuf, outbytesleft);
 135         }
 136 
 137 
 138         /* otherwise we have to do it chunks at a time */
 139         while (*inbytesleft > 0) {
 140                 char *bufp1 = cvtbuf;
 141                 const char *bufp2 = cvtbuf;
 142 
 143                 bufsize = sizeof(cvtbuf);
 144                 
 145                 if (cd->pull(cd->cd_pull, 
 146                              inbuf, inbytesleft, &bufp1, &bufsize) == -1
 147                     && errno != E2BIG) return -1;
 148 
 149                 bufsize = sizeof(cvtbuf) - bufsize;
 150 
 151                 if (cd->push(cd->cd_push, 
 152                              &bufp2, &bufsize, 
 153                              outbuf, outbytesleft) == -1) return -1;
 154         }
 155 
 156         return 0;
 157 }
 158 
 /*
  * Tell whether a charset name is one of the two names this file uses
  * for little-endian UTF-16 ("UCS-2LE" or "UTF-16LE").  The comparison
  * ignores case.
  */
 static bool is_utf16(const char *name)
 {
         if (strcasecmp(name, "UCS-2LE") == 0) {
                 return true;
         }
         return strcasecmp(name, "UTF-16LE") == 0;
 }
 164 
 165 
 166 
 167 _PUBLIC_ smb_iconv_t smb_iconv_open_ex(TALLOC_CTX *mem_ctx, const char *tocode, 
     /* [<][>][^][v][top][bottom][index][help] */
 168                               const char *fromcode, bool native_iconv)
 169 {
 170         smb_iconv_t ret;
 171         const struct charset_functions *from=NULL, *to=NULL;
 172         int i;
 173 
 174         ret = (smb_iconv_t)talloc_named(mem_ctx,
 175                                         sizeof(*ret), 
 176                                         "iconv(%s,%s)", tocode, fromcode);
 177         if (!ret) {
 178                 errno = ENOMEM;
 179                 return (smb_iconv_t)-1;
 180         }
 181         memset(ret, 0, sizeof(*ret));
 182 
 183         /* check for the simplest null conversion */
 184         if (strcmp(fromcode, tocode) == 0) {
 185                 ret->direct = iconv_copy;
 186                 return ret;
 187         }
 188 
 189         for (i=0;i<ARRAY_SIZE(builtin_functions);i++) {
 190                 if (strcasecmp(fromcode, builtin_functions[i].name) == 0) {
 191                         from = &builtin_functions[i];
 192                 }
 193                 if (strcasecmp(tocode, builtin_functions[i].name) == 0) {
 194                         to = &builtin_functions[i];
 195                 }
 196         }
 197 
 198         if (from == NULL) {
 199                 for (from=charsets; from; from=from->next) {
 200                         if (strcasecmp(from->name, fromcode) == 0) break;
 201                 }
 202         }
 203 
 204         if (to == NULL) {
 205                 for (to=charsets; to; to=to->next) {
 206                         if (strcasecmp(to->name, tocode) == 0) break;
 207                 }
 208         }
 209 
 210 #ifdef HAVE_NATIVE_ICONV
 211         if ((!from || !to) && !native_iconv) {
 212                 goto failed;
 213         }
 214         if (!from) {
 215                 ret->pull = sys_iconv;
 216                 ret->cd_pull = iconv_open("UTF-16LE", fromcode);
 217                 if (ret->cd_pull == (iconv_t)-1)
 218                         ret->cd_pull = iconv_open("UCS-2LE", fromcode);
 219                 if (ret->cd_pull == (iconv_t)-1) goto failed;
 220         }
 221 
 222         if (!to) {
 223                 ret->push = sys_iconv;
 224                 ret->cd_push = iconv_open(tocode, "UTF-16LE");
 225                 if (ret->cd_push == (iconv_t)-1)
 226                         ret->cd_push = iconv_open(tocode, "UCS-2LE");
 227                 if (ret->cd_push == (iconv_t)-1) goto failed;
 228         }
 229 #else
 230         if (!from || !to) {
 231                 goto failed;
 232         }
 233 #endif
 234 
 235         /* check for conversion to/from ucs2 */
 236         if (is_utf16(fromcode) && to) {
 237                 ret->direct = to->push;
 238                 return ret;
 239         }
 240         if (is_utf16(tocode) && from) {
 241                 ret->direct = from->pull;
 242                 return ret;
 243         }
 244 
 245 #ifdef HAVE_NATIVE_ICONV
 246         if (is_utf16(fromcode)) {
 247                 ret->direct = sys_iconv;
 248                 ret->cd_direct = ret->cd_push;
 249                 ret->cd_push = NULL;
 250                 return ret;
 251         }
 252         if (is_utf16(tocode)) {
 253                 ret->direct = sys_iconv;
 254                 ret->cd_direct = ret->cd_pull;
 255                 ret->cd_pull = NULL;
 256                 return ret;
 257         }
 258 #endif
 259 
 260         /* the general case has to go via a buffer */
 261         if (!ret->pull) ret->pull = from->pull;
 262         if (!ret->push) ret->push = to->push;
 263         return ret;
 264 
 265 failed:
 266         talloc_free(ret);
 267         errno = EINVAL;
 268         return (smb_iconv_t)-1;
 269 }
 270 
 271 /*
 272   simple iconv_open() wrapper
 273  */
 274 _PUBLIC_ smb_iconv_t smb_iconv_open(const char *tocode, const char *fromcode)
     /* [<][>][^][v][top][bottom][index][help] */
 275 {
 276         return smb_iconv_open_ex(NULL, tocode, fromcode, true);
 277 }
 278 
 279 /*
 280   simple iconv_close() wrapper
 281 */
 282 _PUBLIC_ int smb_iconv_close(smb_iconv_t cd)
     /* [<][>][^][v][top][bottom][index][help] */
 283 {
 284 #ifdef HAVE_NATIVE_ICONV
 285         if (cd->cd_direct) iconv_close((iconv_t)cd->cd_direct);
 286         if (cd->cd_pull) iconv_close((iconv_t)cd->cd_pull);
 287         if (cd->cd_push) iconv_close((iconv_t)cd->cd_push);
 288 #endif
 289 
 290         talloc_free(cd);
 291         return 0;
 292 }
 293 
 294 
 295 /**********************************************************************
 296  the following functions implement the builtin character sets in Samba
 297  and also the "test" character sets that are designed to test
 298  multi-byte character set support for English users
 299 ***********************************************************************/
 /*
  * Widen single bytes to two-byte units: each input byte is copied to
  * the low byte of the output unit and the high byte is set to zero.
  *
  * Returns 0 when all input was consumed; otherwise -1 with errno set
  * to E2BIG (fewer than two bytes of output space were left).
  */
 static size_t ascii_pull(void *cd, const char **inbuf, size_t *inbytesleft,
                          char **outbuf, size_t *outbytesleft)
 {
         const char *src = *inbuf;
         char *dst = *outbuf;
         size_t in_left = *inbytesleft;
         size_t out_left = *outbytesleft;

         for (; in_left >= 1 && out_left >= 2; in_left -= 1, out_left -= 2) {
                 dst[0] = src[0];
                 dst[1] = 0;
                 src += 1;
                 dst += 2;
         }

         *inbuf = src;
         *outbuf = dst;
         *inbytesleft = in_left;
         *outbytesleft = out_left;

         if (in_left > 0) {
                 errno = E2BIG;
                 return -1;
         }

         return 0;
 }
 319 
 /*
  * Narrow two-byte units to 7-bit bytes: the low byte of each unit is
  * masked with 0x7F and stored; the high byte is dropped.
  *
  * Returns the number of units whose high byte was non-zero.  On
  * failure returns -1 with errno set to EINVAL (a single trailing
  * input byte) or E2BIG (output space ran out).
  */
 static size_t ascii_push(void *cd, const char **inbuf, size_t *inbytesleft,
                          char **outbuf, size_t *outbytesleft)
 {
         int lossy = 0;

         for (;;) {
                 const char *unit = *inbuf;

                 if (*inbytesleft < 2 || *outbytesleft < 1) {
                         break;
                 }

                 (*outbuf)[0] = unit[0] & 0x7F;
                 if (unit[1] != 0) {
                         lossy++;
                 }

                 *inbuf = unit + 2;
                 *outbuf += 1;
                 *inbytesleft -= 2;
                 *outbytesleft -= 1;
         }

         if (*inbytesleft == 0) {
                 return lossy;
         }

         errno = (*inbytesleft == 1) ? EINVAL : E2BIG;
         return -1;
 }
 346 
 /*
  * Narrow two-byte units to single bytes: the low byte of each unit is
  * stored unchanged and the high byte is dropped.
  *
  * Returns the number of units whose high byte was non-zero.  On
  * failure returns -1 with errno set to EINVAL (a single trailing
  * input byte) or E2BIG (output space ran out).
  */
 static size_t latin1_push(void *cd, const char **inbuf, size_t *inbytesleft,
                          char **outbuf, size_t *outbytesleft)
 {
         const char *src = *inbuf;
         char *dst = *outbuf;
         size_t in_left = *inbytesleft;
         size_t out_left = *outbytesleft;
         int high_bytes = 0;

         while (in_left >= 2 && out_left >= 1) {
                 *dst = src[0];
                 if (src[1] != 0) {
                         high_bytes++;
                 }
                 src += 2;
                 dst += 1;
                 in_left -= 2;
                 out_left -= 1;
         }

         *inbuf = src;
         *outbuf = dst;
         *inbytesleft = in_left;
         *outbytesleft = out_left;

         switch (in_left) {
         case 0:
                 return high_bytes;
         case 1:
                 errno = EINVAL;
                 return -1;
         default:
                 errno = E2BIG;
                 return -1;
         }
 }
 373 
 /* Value of one hexadecimal digit, or -1 if ch is not a hex digit. */
 static int ucs2hex_digit(char ch)
 {
         if (ch >= '0' && ch <= '9') {
                 return ch - '0';
         }
         if (ch >= 'a' && ch <= 'f') {
                 return ch - 'a' + 10;
         }
         if (ch >= 'A' && ch <= 'F') {
                 return ch - 'A' + 10;
         }
         return -1;
 }

 /*
  * Decode the "UCS2-HEX" test encoding into two-byte little-endian
  * units.  Any byte other than '@' is widened as is; '@' must be
  * followed by exactly four hex digits giving the 16-bit value.
  *
  * The digits are parsed by hand rather than with sscanf(): the input
  * is a counted buffer that need not be NUL-terminated, and "%04x"
  * would also accept short, signed, whitespace- or 0x-prefixed fields
  * while five bytes were still consumed.
  *
  * Returns 0 when all input was consumed; otherwise -1 with errno set
  * to EINVAL (escape cut short by the end of input), EILSEQ (escape
  * is not four hex digits) or E2BIG (output space ran out).
  */
 static size_t ucs2hex_pull(void *cd, const char **inbuf, size_t *inbytesleft,
                          char **outbuf, size_t *outbytesleft)
 {
         while (*inbytesleft >= 1 && *outbytesleft >= 2) {
                 unsigned int v = 0;
                 int i;

                 if ((*inbuf)[0] != '@') {
                         /* seven bit ascii case */
                         (*outbuf)[0] = (*inbuf)[0];
                         (*outbuf)[1] = 0;
                         (*inbytesleft)  -= 1;
                         (*outbytesleft) -= 2;
                         (*inbuf)  += 1;
                         (*outbuf) += 2;
                         continue;
                 }
                 /* it's a hex character */
                 if (*inbytesleft < 5) {
                         errno = EINVAL;
                         return -1;
                 }

                 for (i = 1; i <= 4; i++) {
                         int digit = ucs2hex_digit((*inbuf)[i]);
                         if (digit < 0) {
                                 errno = EILSEQ;
                                 return -1;
                         }
                         v = (v << 4) | (unsigned int)digit;
                 }

                 (*outbuf)[0] = v&0xff;
                 (*outbuf)[1] = v>>8;
                 (*inbytesleft)  -= 5;
                 (*outbytesleft) -= 2;
                 (*inbuf)  += 5;
                 (*outbuf) += 2;
         }

         if (*inbytesleft > 0) {
                 errno = E2BIG;
                 return -1;
         }

         return 0;
 }
 416 
 /*
  * Encode two-byte little-endian units in the "UCS2-HEX" test
  * encoding.  A unit below 0x80 other than '@' is written as one
  * byte; every other unit is written as '@' followed by four
  * lower-case hex digits.
  *
  * Returns 0 when all input was consumed; otherwise -1 with errno set
  * to EINVAL (a single trailing input byte) or E2BIG (output space
  * ran out).
  */
 static size_t ucs2hex_push(void *cd, const char **inbuf, size_t *inbytesleft,
                            char **outbuf, size_t *outbytesleft)
 {
         for (;;) {
                 const unsigned char *unit;
                 unsigned int value;
                 char escaped[6];

                 if (*inbytesleft < 2 || *outbytesleft < 1) {
                         break;
                 }

                 unit = (const unsigned char *)*inbuf;

                 if (unit[1] == 0 && unit[0] < 0x80 && unit[0] != '@') {
                         /* plain seven bit character */
                         (*outbuf)[0] = (char)unit[0];
                         *inbuf += 2;
                         *outbuf += 1;
                         *inbytesleft -= 2;
                         *outbytesleft -= 1;
                         continue;
                 }

                 /* the escape needs five bytes of output */
                 if (*outbytesleft < 5) {
                         errno = E2BIG;
                         return -1;
                 }

                 /* 16-bit value, low byte first */
                 value = unit[0] | ((unsigned int)unit[1] << 8);
                 snprintf(escaped, sizeof(escaped), "@%04x", value);
                 memcpy(*outbuf, escaped, 5);
                 *inbuf += 2;
                 *outbuf += 5;
                 *inbytesleft -= 2;
                 *outbytesleft -= 5;
         }

         if (*inbytesleft == 0) {
                 return 0;
         }

         errno = (*inbytesleft == 1) ? EINVAL : E2BIG;
         return -1;
 }
 457 
 /*
  * Copy input to output with the two bytes of every 16-bit unit
  * exchanged.  n = min(*inbytesleft, *outbytesleft) bytes are
  * processed; if n is odd the last output byte is set to zero and the
  * odd input byte is still consumed.
  *
  * The length is kept in a size_t (an int would truncate large
  * buffers) and the swap is done with an explicit loop instead of
  * swab(), whose behaviour is undefined for overlapping buffers.  The
  * loop is safe for disjoint buffers and for exact in-place
  * conversion (*outbuf == *inbuf).
  *
  * Returns 0 when all input was consumed, otherwise -1 with errno set
  * to E2BIG.
  */
 static size_t iconv_swab(void *cd, const char **inbuf, size_t *inbytesleft,
                          char **outbuf, size_t *outbytesleft)
 {
         size_t n = (*inbytesleft < *outbytesleft) ? *inbytesleft : *outbytesleft;
         size_t paired = n & ~(size_t)1;
         size_t i;

         for (i = 0; i < paired; i += 2) {
                 /* read both bytes before writing: allows in-place use */
                 char first = (*inbuf)[i];
                 char second = (*inbuf)[i+1];

                 (*outbuf)[i] = second;
                 (*outbuf)[i+1] = first;
         }
         if (n&1) {
                 (*outbuf)[n-1] = 0;
         }

         (*inbytesleft) -= n;
         (*outbytesleft) -= n;
         (*inbuf) += n;
         (*outbuf) += n;

         if (*inbytesleft > 0) {
                 errno = E2BIG;
                 return -1;
         }

         return 0;
 }
 482 
 483 
 /*
  * Identity conversion: move min(*inbytesleft, *outbytesleft) bytes
  * from input to output.  memmove() is used, so the buffers may
  * overlap.
  *
  * The byte count is kept in a size_t; storing it in an int would
  * truncate (or turn negative) for buffers larger than INT_MAX.
  *
  * Returns 0 when all input was consumed, otherwise -1 with errno set
  * to E2BIG.
  */
 static size_t iconv_copy(void *cd, const char **inbuf, size_t *inbytesleft,
                          char **outbuf, size_t *outbytesleft)
 {
         size_t n = (*inbytesleft < *outbytesleft) ? *inbytesleft : *outbytesleft;

         memmove(*outbuf, *inbuf, n);

         (*inbytesleft) -= n;
         (*outbytesleft) -= n;
         (*inbuf) += n;
         (*outbuf) += n;

         if (*inbytesleft > 0) {
                 errno = E2BIG;
                 return -1;
         }

         return 0;
 }
 505 
 /*
  * This takes a UTF-8 sequence and produces a UTF-16LE sequence.
  *
  * One- to three-byte sequences yield one 16-bit unit.  Four-byte
  * sequences yield a surrogate pair, except that a four-byte sequence
  * encoding a value below 0x10000 (not minimally packed) is accepted
  * and stored as a single unit.  Four-byte sequences encoding a value
  * above U+10FFFF are rejected: they cannot be expressed as a
  * surrogate pair and would otherwise corrupt the high surrogate.
  *
  * The in/out pointers and counters are updated to the position
  * reached, on success and on error alike.
  *
  * Returns 0 when all input was consumed; otherwise -1 with errno:
  *   EILSEQ  bad or missing continuation byte, or value > U+10FFFF
  *   E2BIG   output space ran out
  *   EINVAL  lead byte that does not start a 1-4 byte sequence
  */
 static size_t utf8_pull(void *cd, const char **inbuf, size_t *inbytesleft,
                          char **outbuf, size_t *outbytesleft)
 {
         size_t in_left=*inbytesleft, out_left=*outbytesleft;
         const uint8_t *c = (const uint8_t *)*inbuf;
         uint8_t *uc = (uint8_t *)*outbuf;

         while (in_left >= 1 && out_left >= 2) {
                 if ((c[0] & 0x80) == 0) {
                         /* one byte: plain ASCII */
                         uc[0] = c[0];
                         uc[1] = 0;
                         c  += 1;
                         in_left  -= 1;
                         out_left -= 2;
                         uc += 2;
                         continue;
                 }

                 if ((c[0] & 0xe0) == 0xc0) {
                         /* two bytes: 11 bits of payload */
                         if (in_left < 2 ||
                             (c[1] & 0xc0) != 0x80) {
                                 errno = EILSEQ;
                                 goto error;
                         }
                         uc[1] = (c[0]>>2) & 0x7;
                         uc[0] = (c[0]<<6) | (c[1]&0x3f);
                         c  += 2;
                         in_left  -= 2;
                         out_left -= 2;
                         uc += 2;
                         continue;
                 }

                 if ((c[0] & 0xf0) == 0xe0) {
                         /* three bytes: 16 bits of payload */
                         if (in_left < 3 ||
                             (c[1] & 0xc0) != 0x80 ||
                             (c[2] & 0xc0) != 0x80) {
                                 errno = EILSEQ;
                                 goto error;
                         }
                         uc[1] = ((c[0]&0xF)<<4) | ((c[1]>>2)&0xF);
                         uc[0] = (c[1]<<6) | (c[2]&0x3f);
                         c  += 3;
                         in_left  -= 3;
                         out_left -= 2;
                         uc += 2;
                         continue;
                 }

                 if ((c[0] & 0xf8) == 0xf0) {
                         unsigned int codepoint;
                         if (in_left < 4 ||
                             (c[1] & 0xc0) != 0x80 ||
                             (c[2] & 0xc0) != 0x80 ||
                             (c[3] & 0xc0) != 0x80) {
                                 errno = EILSEQ;
                                 goto error;
                         }
                         codepoint =
                                 (c[3]&0x3f) |
                                 ((c[2]&0x3f)<<6) |
                                 ((c[1]&0x3f)<<12) |
                                 ((c[0]&0x7)<<18);
                         if (codepoint < 0x10000) {
                                 /* accept UTF-8 characters that are not
                                    minimally packed, but pack the result */
                                 uc[0] = (codepoint & 0xFF);
                                 uc[1] = (codepoint >> 8);
                                 c += 4;
                                 in_left -= 4;
                                 out_left -= 2;
                                 uc += 2;
                                 continue;
                         }
                         if (codepoint > 0x10ffff) {
                                 /* beyond the range a surrogate pair
                                    can represent */
                                 errno = EILSEQ;
                                 goto error;
                         }

                         codepoint -= 0x10000;

                         if (out_left < 4) {
                                 errno = E2BIG;
                                 goto error;
                         }

                         /* high surrogate, then low surrogate, each
                            stored low byte first */
                         uc[0] = (codepoint>>10) & 0xFF;
                         uc[1] = (codepoint>>18) | 0xd8;
                         uc[2] = codepoint & 0xFF;
                         uc[3] = ((codepoint>>8) & 0x3) | 0xdc;
                         c  += 4;
                         in_left  -= 4;
                         out_left -= 4;
                         uc += 4;
                         continue;
                 }

                 /* we don't handle 5 byte sequences */
                 errno = EINVAL;
                 goto error;
         }

         if (in_left > 0) {
                 errno = E2BIG;
                 goto error;
         }

         *inbytesleft = in_left;
         *outbytesleft = out_left;
         *inbuf = (const char *)c;
         *outbuf = (char *)uc;
         return 0;

 error:
         *inbytesleft = in_left;
         *outbytesleft = out_left;
         *inbuf = (const char *)c;
         *outbuf = (char *)uc;
         return -1;
 }
 625 
 626 
 /*
  * This takes a UTF-16LE sequence and produces a UTF-8 sequence.
  *
  * Units below 0x80 become one byte, units below 0x800 two bytes, any
  * other non-surrogate unit three bytes, and a high surrogate followed
  * by a low surrogate four bytes.
  *
  * The in/out pointers and counters are updated to the position
  * reached, on success and on error alike.
  *
  * Returns 0 when all input was consumed; otherwise -1 with errno:
  *   EINVAL  a single trailing byte, or a surrogate with fewer than
  *           four input bytes left
  *   EILSEQ  a low surrogate first, or a high surrogate not followed
  *           by a low surrogate (with at least four bytes left)
  *   E2BIG   output space ran out
  */
 static size_t utf8_push(void *cd, const char **inbuf, size_t *inbytesleft,
                         char **outbuf, size_t *outbytesleft)
 {
         const uint8_t *src = (const uint8_t *)*inbuf;
         uint8_t *dst = (uint8_t *)*outbuf;
         size_t src_left = *inbytesleft;
         size_t dst_left = *outbytesleft;
         size_t status = 0;

         while (src_left >= 2 && dst_left >= 1) {
                 unsigned int unit = src[0] | (src[1] << 8);
                 unsigned int scalar;

                 if (unit < 0x80) {
                         /* one byte */
                         dst[0] = unit;
                         src += 2;
                         dst += 1;
                         src_left -= 2;
                         dst_left -= 1;
                         continue;
                 }

                 if (unit < 0x800) {
                         /* two bytes */
                         if (dst_left < 2) {
                                 errno = E2BIG;
                                 goto fail;
                         }
                         dst[0] = 0xc0 | (unit >> 6);
                         dst[1] = 0x80 | (unit & 0x3f);
                         src += 2;
                         dst += 2;
                         src_left -= 2;
                         dst_left -= 2;
                         continue;
                 }

                 if ((unit & 0xfc00) == 0xdc00) {
                         /* a low surrogate may not come first */
                         errno = (src_left < 4) ? EINVAL : EILSEQ;
                         goto fail;
                 }

                 if ((unit & 0xfc00) != 0xd800) {
                         /* three bytes */
                         if (dst_left < 3) {
                                 errno = E2BIG;
                                 goto fail;
                         }
                         dst[0] = 0xe0 | (unit >> 12);
                         dst[1] = 0x80 | ((unit >> 6) & 0x3f);
                         dst[2] = 0x80 | (unit & 0x3f);
                         src += 2;
                         dst += 3;
                         src_left -= 2;
                         dst_left -= 3;
                         continue;
                 }

                 /* high surrogate: the low half must follow */
                 if (src_left < 4) {
                         errno = EINVAL;
                         goto fail;
                 }
                 if ((src[3] & 0xfc) != 0xdc) {
                         errno = EILSEQ;
                         goto fail;
                 }
                 scalar = 0x10000 + (((unit & 0x3ff) << 10) |
                                     src[2] | ((src[3] & 0x3) << 8));

                 if (dst_left < 4) {
                         errno = E2BIG;
                         goto fail;
                 }
                 dst[0] = 0xf0 | (scalar >> 18);
                 dst[1] = 0x80 | ((scalar >> 12) & 0x3f);
                 dst[2] = 0x80 | ((scalar >> 6) & 0x3f);
                 dst[3] = 0x80 | (scalar & 0x3f);
                 src += 4;
                 dst += 4;
                 src_left -= 4;
                 dst_left -= 4;
         }

         if (src_left == 1) {
                 errno = EINVAL;
                 goto fail;
         }
         if (src_left > 1) {
                 errno = E2BIG;
                 goto fail;
         }
         goto done;

 fail:
         status = (size_t)-1;
 done:
         *inbytesleft = src_left;
         *outbytesleft = dst_left;
         *inbuf  = (const char *)src;
         *outbuf = (char *)dst;
         return status;
 }
 742 }
 743 
 744 
 /*
  * This takes a UTF-16LE "munged" sequence, modifies it according to
  * the string2key rules, and produces a UTF-16LE sequence.
  *
  * The rules are:
  *
  *   1) any 0x0000 unit is mapped to 0x0001
  *
  *   2) any high surrogate (0xD800 - 0xDBFF) that is not immediately
  *      followed by a low surrogate (0xDC00 - 0xDFFF) is replaced by
  *      U+FFFD (REPLACEMENT CHARACTER)
  *
  *   3) the same for any low surrogate that was not preceded by a
  *      high surrogate
  *
  * Well-formed surrogate pairs are copied through unchanged.  The
  * in/out pointers and counters are updated to the position reached,
  * on success and on error alike.
  *
  * Returns 0 when all input was consumed; otherwise -1 with errno set
  * to EINVAL (a single trailing input byte) or E2BIG (output space
  * ran out).
  */
 static size_t utf16_munged_pull(void *cd, const char **inbuf, size_t *inbytesleft,
                                char **outbuf, size_t *outbytesleft)
 {
         const uint8_t *src = (const uint8_t *)*inbuf;
         uint8_t *dst = (uint8_t *)*outbuf;
         size_t src_left = *inbytesleft;
         size_t dst_left = *outbytesleft;
         size_t status = 0;

         while (src_left >= 2 && dst_left >= 2) {
                 unsigned int unit = src[0] | (src[1] << 8);

                 /* a complete surrogate pair is passed through as is */
                 if ((unit & 0xfc00) == 0xd800 &&
                     src_left >= 4 &&
                     ((src[2] | (src[3] << 8)) & 0xfc00) == 0xdc00) {
                         if (dst_left < 4) {
                                 errno = E2BIG;
                                 goto fail;
                         }
                         memcpy(dst, src, 4);
                         src += 4;
                         dst += 4;
                         src_left -= 4;
                         dst_left -= 4;
                         continue;
                 }

                 if (unit == 0) {
                         unit = 1;
                 } else if ((unit & 0xf800) == 0xd800) {
                         /* unpaired high or low surrogate */
                         unit = 0xfffd;
                 }

                 dst[0] = unit & 0xFF;
                 dst[1] = (unit >> 8) & 0xFF;
                 src += 2;
                 dst += 2;
                 src_left -= 2;
                 dst_left -= 2;
         }

         if (src_left == 1) {
                 errno = EINVAL;
                 goto fail;
         }
         if (src_left > 1) {
                 errno = E2BIG;
                 goto fail;
         }
         goto done;

 fail:
         status = (size_t)-1;
 done:
         *inbytesleft = src_left;
         *outbytesleft = dst_left;
         *inbuf  = (const char *)src;
         *outbuf = (char *)dst;
         return status;
 }
 841 
 842 
 843 

/* [<][>][^][v][top][bottom][index][help] */