root/source3/lib/iconv.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. find_charset_functions
  2. smb_register_charset
  3. lazy_initialize_iconv
  4. sys_iconv
  5. smb_iconv
  6. is_utf16
  7. smb_iconv_open
  8. smb_iconv_close
  9. ascii_pull
  10. ascii_push
  11. latin1_push
  12. ucs2hex_pull
  13. ucs2hex_push
  14. iconv_swab
  15. iconv_copy
  16. utf8_pull
  17. utf8_push

   1 /* 
   2    Unix SMB/CIFS implementation.
   3    minimal iconv implementation
   4    Copyright (C) Andrew Tridgell 2001
   5    Copyright (C) Jelmer Vernooij 2002,2003
   6    
   7    This program is free software; you can redistribute it and/or modify
   8    it under the terms of the GNU General Public License as published by
   9    the Free Software Foundation; either version 3 of the License, or
  10    (at your option) any later version.
  11    
  12    This program is distributed in the hope that it will be useful,
  13    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15    GNU General Public License for more details.
  16    
  17    You should have received a copy of the GNU General Public License
  18    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  19 */
  20 
  21 #include "includes.h"
  22 
  23 /*
  24  * We have to use strcasecmp here as the character conversions
  25  * haven't been initialised yet. JRA.
  26  */
  27 
  28 #undef strcasecmp
  29 
  30 /**
  31  * @file
  32  *
  33  * @brief Samba wrapper/stub for iconv character set conversion.
  34  *
  35  * iconv is the XPG2 interface for converting between character
  36  * encodings.  This file provides a Samba wrapper around it, and also
  37  * a simple reimplementation that is used if the system does not
  38  * implement iconv.
  39  *
  40  * Samba only works with encodings that are supersets of ASCII: ascii
  41  * characters like whitespace can be tested for directly, multibyte
  42  * sequences start with a byte with the high bit set, and strings are
  43  * terminated by a nul byte.
  44  *
  45  * Note that the only function provided by iconv is conversion between
  46  * characters.  It doesn't directly support operations like
  47  * uppercasing or comparison.  We have to convert to UCS-2 and compare
  48  * there.
  49  *
  50  * @sa Samba Developers Guide
  51  **/
  52 
  53 static_decl_charset;
  54 
  55 static size_t ascii_pull(void *,const char **, size_t *, char **, size_t *);
  56 static size_t ascii_push(void *,const char **, size_t *, char **, size_t *);
  57 static size_t latin1_push(void *,const char **, size_t *, char **, size_t *);
  58 static size_t  utf8_pull(void *,const char **, size_t *, char **, size_t *);
  59 static size_t  utf8_push(void *,const char **, size_t *, char **, size_t *);
  60 static size_t ucs2hex_pull(void *,const char **, size_t *, char **, size_t *);
  61 static size_t ucs2hex_push(void *,const char **, size_t *, char **, size_t *);
  62 static size_t iconv_copy(void *,const char **, size_t *, char **, size_t *);
  63 static size_t iconv_swab  (void *,const char **, size_t *, char **, size_t *);
  64 
  65 static struct charset_functions builtin_functions[] = {
  66         /* windows is really neither UCS-2 not UTF-16 */
  67         {"UCS-2LE",  iconv_copy, iconv_copy},
  68         {"UTF-16LE",  iconv_copy, iconv_copy},
  69         {"UCS-2BE",  iconv_swab, iconv_swab},
  70         {"UTF-16BE",  iconv_swab, iconv_swab},
  71 
  72         /* we include the UTF-8 alias to cope with differing locale settings */
  73         {"UTF8",   utf8_pull,  utf8_push},
  74         {"UTF-8",   utf8_pull,  utf8_push},
  75         {"ASCII", ascii_pull, ascii_push},
  76         {"646", ascii_pull, ascii_push},
  77         {"ISO-8859-1", ascii_pull, latin1_push},
  78         {"UCS2-HEX", ucs2hex_pull, ucs2hex_push},
  79         {NULL, NULL, NULL}
  80 };
  81 
  82 static struct charset_functions *charsets = NULL;
  83 
  84 static struct charset_functions *find_charset_functions(const char *name) 
     /* [<][>][^][v][top][bottom][index][help] */
  85 {
  86         struct charset_functions *c = charsets;
  87 
  88         while(c) {
  89                 if (strcasecmp(name, c->name) == 0) {
  90                         return c;
  91                 }
  92                 c = c->next;
  93         }
  94 
  95         return NULL;
  96 }
  97 
  98 NTSTATUS smb_register_charset(struct charset_functions *funcs) 
     /* [<][>][^][v][top][bottom][index][help] */
  99 {
 100         if (!funcs) {
 101                 return NT_STATUS_INVALID_PARAMETER;
 102         }
 103 
 104         DEBUG(5, ("Attempting to register new charset %s\n", funcs->name));
 105         /* Check whether we already have this charset... */
 106         if (find_charset_functions(funcs->name)) {
 107                 DEBUG(0, ("Duplicate charset %s, not registering\n", funcs->name));
 108                 return NT_STATUS_OBJECT_NAME_COLLISION;
 109         }
 110 
 111         funcs->next = funcs->prev = NULL;
 112         DEBUG(5, ("Registered charset %s\n", funcs->name));
 113         DLIST_ADD(charsets, funcs);
 114         return NT_STATUS_OK;
 115 }
 116 
 117 static void lazy_initialize_iconv(void)
     /* [<][>][^][v][top][bottom][index][help] */
 118 {
 119         static bool initialized;
 120         int i;
 121 
 122         if (!initialized) {
 123                 initialized = True;
 124                 for(i = 0; builtin_functions[i].name; i++) 
 125                         smb_register_charset(&builtin_functions[i]);
 126                 static_init_charset;
 127         }
 128 }
 129 
 130 #ifdef HAVE_NATIVE_ICONV
 131 /* if there was an error then reset the internal state,
 132    this ensures that we don't have a shift state remaining for
 133    character sets like SJIS */
 134 static size_t sys_iconv(void *cd, 
     /* [<][>][^][v][top][bottom][index][help] */
 135                         const char **inbuf, size_t *inbytesleft,
 136                         char **outbuf, size_t *outbytesleft)
 137 {
 138         size_t ret = iconv((iconv_t)cd, 
 139                            (void *)inbuf, inbytesleft,
 140                            outbuf, outbytesleft);
 141         if (ret == (size_t)-1) {
 142                 int saved_errno = errno;
 143                 iconv(cd, NULL, NULL, NULL, NULL);
 144                 errno = saved_errno;
 145         }
 146         return ret;
 147 }
 148 #endif
 149 
 150 /**
 151  * This is a simple portable iconv() implementaion.
 152  *
 153  * It only knows about a very small number of character sets - just
 154  * enough that Samba works on systems that don't have iconv.
 155  **/
 156 size_t smb_iconv(smb_iconv_t cd, 
     /* [<][>][^][v][top][bottom][index][help] */
 157                  const char **inbuf, size_t *inbytesleft,
 158                  char **outbuf, size_t *outbytesleft)
 159 {
 160         char cvtbuf[2048];
 161         char *bufp = cvtbuf;
 162         size_t bufsize;
 163 
 164         /* in many cases we can go direct */
 165         if (cd->direct) {
 166                 return cd->direct(cd->cd_direct, 
 167                                   inbuf, inbytesleft, outbuf, outbytesleft);
 168         }
 169 
 170 
 171         /* otherwise we have to do it chunks at a time */
 172         while (*inbytesleft > 0) {
 173                 bufp = cvtbuf;
 174                 bufsize = sizeof(cvtbuf);
 175                 
 176                 if (cd->pull(cd->cd_pull, 
 177                              inbuf, inbytesleft, &bufp, &bufsize) == -1
 178                     && errno != E2BIG) return -1;
 179 
 180                 bufp = cvtbuf;
 181                 bufsize = sizeof(cvtbuf) - bufsize;
 182 
 183                 if (cd->push(cd->cd_push, 
 184                              (const char **)&bufp, &bufsize, 
 185                              outbuf, outbytesleft) == -1) return -1;
 186         }
 187 
 188         return 0;
 189 }
 190 
 191 
 192 static bool is_utf16(const char *name)
     /* [<][>][^][v][top][bottom][index][help] */
 193 {
 194         return strcasecmp(name, "UCS-2LE") == 0 ||
 195                 strcasecmp(name, "UTF-16LE") == 0;
 196 }
 197 
 198 /*
 199   simple iconv_open() wrapper
 200  */
 201 smb_iconv_t smb_iconv_open(const char *tocode, const char *fromcode)
     /* [<][>][^][v][top][bottom][index][help] */
 202 {
 203         smb_iconv_t ret;
 204         struct charset_functions *from, *to;
 205         
 206         lazy_initialize_iconv();
 207         from = charsets;
 208         to = charsets;
 209 
 210         ret = SMB_MALLOC_P(struct smb_iconv_s);
 211         if (!ret) {
 212                 errno = ENOMEM;
 213                 return (smb_iconv_t)-1;
 214         }
 215         memset(ret, 0, sizeof(struct smb_iconv_s));
 216 
 217         ret->from_name = SMB_STRDUP(fromcode);
 218         ret->to_name = SMB_STRDUP(tocode);
 219 
 220         /* check for the simplest null conversion */
 221         if (strcasecmp(fromcode, tocode) == 0) {
 222                 ret->direct = iconv_copy;
 223                 return ret;
 224         }
 225 
 226         /* check if we have a builtin function for this conversion */
 227         from = find_charset_functions(fromcode);
 228         if(from)ret->pull = from->pull;
 229         
 230         to = find_charset_functions(tocode);
 231         if(to)ret->push = to->push;
 232 
 233         /* check if we can use iconv for this conversion */
 234 #ifdef HAVE_NATIVE_ICONV
 235         if (!ret->pull) {
 236                 ret->cd_pull = iconv_open("UTF-16LE", fromcode);
 237                 if (ret->cd_pull == (iconv_t)-1)
 238                         ret->cd_pull = iconv_open("UCS-2LE", fromcode);
 239                 if (ret->cd_pull != (iconv_t)-1)
 240                         ret->pull = sys_iconv;
 241         }
 242 
 243         if (!ret->push) {
 244                 ret->cd_push = iconv_open(tocode, "UTF-16LE");
 245                 if (ret->cd_push == (iconv_t)-1)
 246                         ret->cd_push = iconv_open(tocode, "UCS-2LE");
 247                 if (ret->cd_push != (iconv_t)-1)
 248                         ret->push = sys_iconv;
 249         }
 250 #endif
 251         
 252         /* check if there is a module available that can do this conversion */
 253         if (!ret->pull && NT_STATUS_IS_OK(smb_probe_module("charset", fromcode))) {
 254                 if(!(from = find_charset_functions(fromcode)))
 255                         DEBUG(0, ("Module %s doesn't provide charset %s!\n", fromcode, fromcode));
 256                 else 
 257                         ret->pull = from->pull;
 258         }
 259 
 260         if (!ret->push && NT_STATUS_IS_OK(smb_probe_module("charset", tocode))) {
 261                 if(!(to = find_charset_functions(tocode)))
 262                         DEBUG(0, ("Module %s doesn't provide charset %s!\n", tocode, tocode));
 263                 else 
 264                         ret->push = to->push;
 265         }
 266 
 267         if (!ret->push || !ret->pull) {
 268                 SAFE_FREE(ret->from_name);
 269                 SAFE_FREE(ret->to_name);
 270                 SAFE_FREE(ret);
 271                 errno = EINVAL;
 272                 return (smb_iconv_t)-1;
 273         }
 274 
 275         /* check for conversion to/from ucs2 */
 276         if (is_utf16(fromcode) && to) {
 277                 ret->direct = to->push;
 278                 ret->push = ret->pull = NULL;
 279                 return ret;
 280         }
 281 
 282         if (is_utf16(tocode) && from) {
 283                 ret->direct = from->pull;
 284                 ret->push = ret->pull = NULL;
 285                 return ret;
 286         }
 287 
 288         /* Check if we can do the conversion direct */
 289 #ifdef HAVE_NATIVE_ICONV
 290         if (is_utf16(fromcode)) {
 291                 ret->direct = sys_iconv;
 292                 ret->cd_direct = ret->cd_push;
 293                 ret->cd_push = NULL;
 294                 return ret;
 295         }
 296         if (is_utf16(tocode)) {
 297                 ret->direct = sys_iconv;
 298                 ret->cd_direct = ret->cd_pull;
 299                 ret->cd_pull = NULL;
 300                 return ret;
 301         }
 302 #endif
 303 
 304         return ret;
 305 }
 306 
 307 /*
 308   simple iconv_close() wrapper
 309 */
 310 int smb_iconv_close (smb_iconv_t cd)
     /* [<][>][^][v][top][bottom][index][help] */
 311 {
 312 #ifdef HAVE_NATIVE_ICONV
 313         if (cd->cd_direct) iconv_close((iconv_t)cd->cd_direct);
 314         if (cd->cd_pull) iconv_close((iconv_t)cd->cd_pull);
 315         if (cd->cd_push) iconv_close((iconv_t)cd->cd_push);
 316 #endif
 317 
 318         SAFE_FREE(cd->from_name);
 319         SAFE_FREE(cd->to_name);
 320 
 321         memset(cd, 0, sizeof(*cd));
 322         SAFE_FREE(cd);
 323         return 0;
 324 }
 325 
 326 
 327 /**********************************************************************
 328  the following functions implement the builtin character sets in Samba
 329  and also the "test" character sets that are designed to test
 330  multi-byte character set support for English users
 331 ***********************************************************************/
 332 
 333 static size_t ascii_pull(void *cd, const char **inbuf, size_t *inbytesleft,
     /* [<][>][^][v][top][bottom][index][help] */
 334                          char **outbuf, size_t *outbytesleft)
 335 {
 336         while (*inbytesleft >= 1 && *outbytesleft >= 2) {
 337                 (*outbuf)[0] = (*inbuf)[0];
 338                 (*outbuf)[1] = 0;
 339                 (*inbytesleft)  -= 1;
 340                 (*outbytesleft) -= 2;
 341                 (*inbuf)  += 1;
 342                 (*outbuf) += 2;
 343         }
 344 
 345         if (*inbytesleft > 0) {
 346                 errno = E2BIG;
 347                 return -1;
 348         }
 349         
 350         return 0;
 351 }
 352 
 353 static size_t ascii_push(void *cd, const char **inbuf, size_t *inbytesleft,
     /* [<][>][^][v][top][bottom][index][help] */
 354                          char **outbuf, size_t *outbytesleft)
 355 {
 356         int ir_count=0;
 357 
 358         while (*inbytesleft >= 2 && *outbytesleft >= 1) {
 359                 (*outbuf)[0] = (*inbuf)[0] & 0x7F;
 360                 if ((*inbuf)[1]) ir_count++;
 361                 (*inbytesleft)  -= 2;
 362                 (*outbytesleft) -= 1;
 363                 (*inbuf)  += 2;
 364                 (*outbuf) += 1;
 365         }
 366 
 367         if (*inbytesleft == 1) {
 368                 errno = EINVAL;
 369                 return -1;
 370         }
 371 
 372         if (*inbytesleft > 1) {
 373                 errno = E2BIG;
 374                 return -1;
 375         }
 376         
 377         return ir_count;
 378 }
 379 
 380 static size_t latin1_push(void *cd, const char **inbuf, size_t *inbytesleft,
     /* [<][>][^][v][top][bottom][index][help] */
 381                          char **outbuf, size_t *outbytesleft)
 382 {
 383         int ir_count=0;
 384 
 385         while (*inbytesleft >= 2 && *outbytesleft >= 1) {
 386                 (*outbuf)[0] = (*inbuf)[0];
 387                 if ((*inbuf)[1]) ir_count++;
 388                 (*inbytesleft)  -= 2;
 389                 (*outbytesleft) -= 1;
 390                 (*inbuf)  += 2;
 391                 (*outbuf) += 1;
 392         }
 393 
 394         if (*inbytesleft == 1) {
 395                 errno = EINVAL;
 396                 return -1;
 397         }
 398 
 399         if (*inbytesleft > 1) {
 400                 errno = E2BIG;
 401                 return -1;
 402         }
 403         
 404         return ir_count;
 405 }
 406 
 407 static size_t ucs2hex_pull(void *cd, const char **inbuf, size_t *inbytesleft,
     /* [<][>][^][v][top][bottom][index][help] */
 408                          char **outbuf, size_t *outbytesleft)
 409 {
 410         while (*inbytesleft >= 1 && *outbytesleft >= 2) {
 411                 unsigned v;
 412 
 413                 if ((*inbuf)[0] != '@') {
 414                         /* seven bit ascii case */
 415                         (*outbuf)[0] = (*inbuf)[0];
 416                         (*outbuf)[1] = 0;
 417                         (*inbytesleft)  -= 1;
 418                         (*outbytesleft) -= 2;
 419                         (*inbuf)  += 1;
 420                         (*outbuf) += 2;
 421                         continue;
 422                 }
 423                 /* it's a hex character */
 424                 if (*inbytesleft < 5) {
 425                         errno = EINVAL;
 426                         return -1;
 427                 }
 428                 
 429                 if (sscanf(&(*inbuf)[1], "%04x", &v) != 1) {
 430                         errno = EILSEQ;
 431                         return -1;
 432                 }
 433 
 434                 (*outbuf)[0] = v&0xff;
 435                 (*outbuf)[1] = v>>8;
 436                 (*inbytesleft)  -= 5;
 437                 (*outbytesleft) -= 2;
 438                 (*inbuf)  += 5;
 439                 (*outbuf) += 2;
 440         }
 441 
 442         if (*inbytesleft > 0) {
 443                 errno = E2BIG;
 444                 return -1;
 445         }
 446         
 447         return 0;
 448 }
 449 
 450 static size_t ucs2hex_push(void *cd, const char **inbuf, size_t *inbytesleft,
     /* [<][>][^][v][top][bottom][index][help] */
 451                            char **outbuf, size_t *outbytesleft)
 452 {
 453         while (*inbytesleft >= 2 && *outbytesleft >= 1) {
 454                 char buf[6];
 455 
 456                 if ((*inbuf)[1] == 0 && 
 457                     ((*inbuf)[0] & 0x80) == 0 &&
 458                     (*inbuf)[0] != '@') {
 459                         (*outbuf)[0] = (*inbuf)[0];
 460                         (*inbytesleft)  -= 2;
 461                         (*outbytesleft) -= 1;
 462                         (*inbuf)  += 2;
 463                         (*outbuf) += 1;
 464                         continue;
 465                 }
 466                 if (*outbytesleft < 5) {
 467                         errno = E2BIG;
 468                         return -1;
 469                 }
 470                 snprintf(buf, 6, "@%04x", SVAL(*inbuf, 0));
 471                 memcpy(*outbuf, buf, 5);
 472                 (*inbytesleft)  -= 2;
 473                 (*outbytesleft) -= 5;
 474                 (*inbuf)  += 2;
 475                 (*outbuf) += 5;
 476         }
 477 
 478         if (*inbytesleft == 1) {
 479                 errno = EINVAL;
 480                 return -1;
 481         }
 482 
 483         if (*inbytesleft > 1) {
 484                 errno = E2BIG;
 485                 return -1;
 486         }
 487         
 488         return 0;
 489 }
 490 
 491 static size_t iconv_swab(void *cd, const char **inbuf, size_t *inbytesleft,
     /* [<][>][^][v][top][bottom][index][help] */
 492                          char **outbuf, size_t *outbytesleft)
 493 {
 494         int n;
 495 
 496         n = MIN(*inbytesleft, *outbytesleft);
 497 
 498         swab(*inbuf, *outbuf, (n&~1));
 499         if (n&1) {
 500                 (*outbuf)[n-1] = 0;
 501         }
 502 
 503         (*inbytesleft) -= n;
 504         (*outbytesleft) -= n;
 505         (*inbuf) += n;
 506         (*outbuf) += n;
 507 
 508         if (*inbytesleft > 0) {
 509                 errno = E2BIG;
 510                 return -1;
 511         }
 512 
 513         return 0;
 514 }
 515 
 516 static size_t iconv_copy(void *cd, const char **inbuf, size_t *inbytesleft,
     /* [<][>][^][v][top][bottom][index][help] */
 517                          char **outbuf, size_t *outbytesleft)
 518 {
 519         int n;
 520 
 521         n = MIN(*inbytesleft, *outbytesleft);
 522 
 523         memmove(*outbuf, *inbuf, n);
 524 
 525         (*inbytesleft) -= n;
 526         (*outbytesleft) -= n;
 527         (*inbuf) += n;
 528         (*outbuf) += n;
 529 
 530         if (*inbytesleft > 0) {
 531                 errno = E2BIG;
 532                 return -1;
 533         }
 534 
 535         return 0;
 536 }
 537 
 538 static size_t utf8_pull(void *cd, const char **inbuf, size_t *inbytesleft,
     /* [<][>][^][v][top][bottom][index][help] */
 539                          char **outbuf, size_t *outbytesleft)
 540 {
 541         size_t in_left=*inbytesleft, out_left=*outbytesleft;
 542         const uint8 *c = (const uint8 *)*inbuf;
 543         uint8 *uc = (uint8 *)*outbuf;
 544 
 545         while (in_left >= 1 && out_left >= 2) {
 546                 unsigned int codepoint;
 547 
 548                 if ((c[0] & 0x80) == 0) {
 549                         uc[0] = c[0];
 550                         uc[1] = 0;
 551                         c  += 1;
 552                         in_left  -= 1;
 553                         out_left -= 2;
 554                         uc += 2;
 555                         continue;
 556                 }
 557 
 558                 if ((c[0] & 0xe0) == 0xc0) {
 559                         if (in_left < 2 ||
 560                             (c[1] & 0xc0) != 0x80) {
 561                                 errno = EILSEQ;
 562                                 goto error;
 563                         }
 564                         codepoint = (c[1]&0x3f) | ((c[0]&0x1f)<<6);
 565                         if (codepoint < 0x80) {
 566                                 /* don't accept UTF-8 characters that are not minimally packed */
 567                                 errno = EILSEQ;
 568                                 goto error;
 569                         }
 570                         uc[1] = codepoint >> 8;
 571                         uc[0] = codepoint & 0xff;
 572                         c  += 2;
 573                         in_left  -= 2;
 574                         out_left -= 2;
 575                         uc += 2;
 576                         continue;
 577                 }
 578 
 579                 if ((c[0] & 0xf0) == 0xe0) {
 580                         if (in_left < 3 ||
 581                             (c[1] & 0xc0) != 0x80 || 
 582                             (c[2] & 0xc0) != 0x80) {
 583                                 errno = EILSEQ;
 584                                 goto error;
 585                         }
 586                         codepoint = (c[2]&0x3f) | ((c[1]&0x3f)<<6) | ((c[0]&0xf)<<12);
 587                         if (codepoint < 0x800) {
 588                                 /* don't accept UTF-8 characters that are not minimally packed */
 589                                 errno = EILSEQ;
 590                                 goto error;
 591                         }
 592                         uc[1] = codepoint >> 8;
 593                         uc[0] = codepoint & 0xff;
 594                         c  += 3;
 595                         in_left  -= 3;
 596                         out_left -= 2;
 597                         uc += 2;
 598                         continue;
 599                 }
 600 
 601                 if ((c[0] & 0xf8) == 0xf0) {
 602                         if (in_left < 4 ||
 603                             (c[1] & 0xc0) != 0x80 || 
 604                             (c[2] & 0xc0) != 0x80 ||
 605                             (c[3] & 0xc0) != 0x80) {
 606                                 errno = EILSEQ;
 607                                 goto error;
 608                         }
 609                         codepoint = 
 610                                 (c[3]&0x3f) | 
 611                                 ((c[2]&0x3f)<<6) | 
 612                                 ((c[1]&0x3f)<<12) |
 613                                 ((c[0]&0x7)<<18);
 614                         if (codepoint < 0x10000 || codepoint > 0x10ffff) {
 615                                 /* don't accept UTF-8 characters that are not minimally packed */
 616                                 errno = EILSEQ;
 617                                 goto error;
 618                         }
 619 
 620                         codepoint -= 0x10000;
 621 
 622                         if (out_left < 4) {
 623                                 errno = E2BIG;
 624                                 goto error;
 625                         }
 626 
 627                         uc[0] = (codepoint>>10) & 0xFF;
 628                         uc[1] = (codepoint>>18) | 0xd8;
 629                         uc[2] = codepoint & 0xFF;
 630                         uc[3] = ((codepoint>>8) & 0x3) | 0xdc;
 631                         c  += 4;
 632                         in_left  -= 4;
 633                         out_left -= 4;
 634                         uc += 4;
 635                         continue;
 636                 }
 637 
 638                 /* we don't handle 5 byte sequences */
 639                 errno = EINVAL;
 640                 goto error;
 641         }
 642 
 643         if (in_left > 0) {
 644                 errno = E2BIG;
 645                 goto error;
 646         }
 647 
 648         *inbytesleft = in_left;
 649         *outbytesleft = out_left;
 650         *inbuf = (char *)c;
 651         *outbuf = (char *)uc;   
 652         return 0;
 653 
 654 error:
 655         *inbytesleft = in_left;
 656         *outbytesleft = out_left;
 657         *inbuf = (char *)c;
 658         *outbuf = (char *)uc;
 659         return -1;
 660 }
 661 
 662 static size_t utf8_push(void *cd, const char **inbuf, size_t *inbytesleft,
     /* [<][>][^][v][top][bottom][index][help] */
 663                         char **outbuf, size_t *outbytesleft)
 664 {
 665         size_t in_left=*inbytesleft, out_left=*outbytesleft;
 666         uint8 *c = (uint8 *)*outbuf;
 667         const uint8 *uc = (const uint8 *)*inbuf;
 668 
 669         while (in_left >= 2 && out_left >= 1) {
 670                 unsigned int codepoint;
 671 
 672                 if (uc[1] == 0 && !(uc[0] & 0x80)) {
 673                         /* simplest case */
 674                         c[0] = uc[0];
 675                         in_left  -= 2;
 676                         out_left -= 1;
 677                         uc += 2;
 678                         c  += 1;
 679                         continue;
 680                 }
 681 
 682                 if ((uc[1]&0xf8) == 0) {
 683                         /* next simplest case */
 684                         if (out_left < 2) {
 685                                 errno = E2BIG;
 686                                 goto error;
 687                         }
 688                         c[0] = 0xc0 | (uc[0]>>6) | (uc[1]<<2);
 689                         c[1] = 0x80 | (uc[0] & 0x3f);
 690                         in_left  -= 2;
 691                         out_left -= 2;
 692                         uc += 2;
 693                         c  += 2;
 694                         continue;
 695                 }
 696 
 697                 if ((uc[1] & 0xfc) == 0xdc) {
 698                         /* its the second part of a 4 byte sequence. Illegal */
 699                         if (in_left < 4) {
 700                                 errno = EINVAL;
 701                         } else {
 702                                 errno = EILSEQ;
 703                         }
 704                         goto error;
 705                 }
 706 
 707                 if ((uc[1] & 0xfc) != 0xd8) {
 708                         codepoint = uc[0] | (uc[1]<<8);
 709                         if (out_left < 3) {
 710                                 errno = E2BIG;
 711                                 goto error;
 712                         }
 713                         c[0] = 0xe0 | (codepoint >> 12);
 714                         c[1] = 0x80 | ((codepoint >> 6) & 0x3f);
 715                         c[2] = 0x80 | (codepoint & 0x3f);
 716                         
 717                         in_left  -= 2;
 718                         out_left -= 3;
 719                         uc  += 2;
 720                         c   += 3;
 721                         continue;
 722                 }
 723 
 724                 /* its the first part of a 4 byte sequence */
 725                 if (in_left < 4) {
 726                         errno = EINVAL;
 727                         goto error;
 728                 }
 729                 if ((uc[3] & 0xfc) != 0xdc) {
 730                         errno = EILSEQ;
 731                         goto error;
 732                 }
 733                 codepoint = 0x10000 + (uc[2] | ((uc[3] & 0x3)<<8) | 
 734                                        (uc[0]<<10) | ((uc[1] & 0x3)<<18));
 735                 
 736                 if (out_left < 4) {
 737                         errno = E2BIG;
 738                         goto error;
 739                 }
 740                 c[0] = 0xf0 | (codepoint >> 18);
 741                 c[1] = 0x80 | ((codepoint >> 12) & 0x3f);
 742                 c[2] = 0x80 | ((codepoint >> 6) & 0x3f);
 743                 c[3] = 0x80 | (codepoint & 0x3f);
 744                 
 745                 in_left  -= 4;
 746                 out_left -= 4;
 747                 uc       += 4;
 748                 c        += 4;
 749         }
 750 
 751         if (in_left == 1) {
 752                 errno = EINVAL;
 753                 goto error;
 754         }
 755 
 756         if (in_left > 1) {
 757                 errno = E2BIG;
 758                 goto error;
 759         }
 760 
 761         *inbytesleft = in_left;
 762         *outbytesleft = out_left;
 763         *inbuf  = (char *)uc;
 764         *outbuf = (char *)c;
 765         
 766         return 0;
 767 
 768 error:
 769         *inbytesleft = in_left;
 770         *outbytesleft = out_left;
 771         *inbuf  = (char *)uc;
 772         *outbuf = (char *)c;
 773         return -1;
 774 }
 775 

/* [<][>][^][v][top][bottom][index][help] */