root/source4/heimdal/lib/wind/utf8.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. utf8toutf32
  2. wind_utf8ucs4
  3. wind_utf8ucs4_length
  4. wind_ucs4utf8
  5. wind_ucs4utf8_length
  6. wind_ucs2read
  7. wind_ucs2write
  8. wind_utf8ucs2
  9. wind_utf8ucs2_length
  10. wind_ucs2utf8
  11. wind_ucs2utf8_length

   1 /*
   2  * Copyright (c) 2004, 2006, 2007, 2008 Kungliga Tekniska Högskolan
   3  * (Royal Institute of Technology, Stockholm, Sweden).
   4  * All rights reserved.
   5  *
   6  * Redistribution and use in source and binary forms, with or without
   7  * modification, are permitted provided that the following conditions
   8  * are met:
   9  *
  10  * 1. Redistributions of source code must retain the above copyright
  11  *    notice, this list of conditions and the following disclaimer.
  12  *
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in the
  15  *    documentation and/or other materials provided with the distribution.
  16  *
  17  * 3. Neither the name of the Institute nor the names of its contributors
  18  *    may be used to endorse or promote products derived from this software
  19  *    without specific prior written permission.
  20  *
  21  * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND
  22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE
  25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  31  * SUCH DAMAGE.
  32  */
  33 
  34 #ifdef HAVE_CONFIG_H
  35 #include <config.h>
  36 #endif
  37 #include "windlocl.h"
  38 
  39 RCSID("$Id$");
  40 
  41 static int
  42 utf8toutf32(const unsigned char **pp, uint32_t *out)
     /* [<][>][^][v][top][bottom][index][help] */
  43 {
  44     const unsigned char *p = *pp;
  45     unsigned c = *p;
  46 
  47     if (c & 0x80) {
  48         if ((c & 0xE0) == 0xC0) {
  49             const unsigned c2 = *++p;
  50             if ((c2 & 0xC0) == 0x80) {
  51                 *out =  ((c  & 0x1F) << 6)
  52                     | (c2 & 0x3F);
  53             } else {
  54                 return WIND_ERR_INVALID_UTF8;
  55             }
  56         } else if ((c & 0xF0) == 0xE0) {
  57             const unsigned c2 = *++p;
  58             if ((c2 & 0xC0) == 0x80) {
  59                 const unsigned c3 = *++p;
  60                 if ((c3 & 0xC0) == 0x80) {
  61                     *out =   ((c  & 0x0F) << 12)
  62                         | ((c2 & 0x3F) << 6)
  63                         |  (c3 & 0x3F);
  64                 } else {
  65                     return WIND_ERR_INVALID_UTF8;
  66                 }
  67             } else {
  68                 return WIND_ERR_INVALID_UTF8;
  69             }
  70         } else if ((c & 0xF8) == 0xF0) {
  71             const unsigned c2 = *++p;
  72             if ((c2 & 0xC0) == 0x80) {
  73                 const unsigned c3 = *++p;
  74                 if ((c3 & 0xC0) == 0x80) {
  75                     const unsigned c4 = *++p;
  76                     if ((c4 & 0xC0) == 0x80) {
  77                         *out =   ((c  & 0x07) << 18)
  78                             | ((c2 & 0x3F) << 12)
  79                             | ((c3 & 0x3F) <<  6)
  80                             |  (c4 & 0x3F);
  81                     } else {
  82                         return WIND_ERR_INVALID_UTF8;
  83                     }
  84                 } else {
  85                     return WIND_ERR_INVALID_UTF8;
  86                 }
  87             } else {
  88                 return WIND_ERR_INVALID_UTF8;
  89             }
  90         } else {
  91             return WIND_ERR_INVALID_UTF8;
  92         }
  93     } else {
  94         *out = c;
  95     }
  96 
  97     *pp = p;
  98 
  99     return 0;
 100 }
 101 
 102 /**
 103  * Convert an UTF-8 string to an UCS4 string.
 104  *
 105  * @param in an UTF-8 string to convert.
 106  * @param out the resulting UCS4 strint, must be at least
 107  * wind_utf8ucs4_length() long.  If out is NULL, the function will
 108  * calculate the needed space for the out variable (just like
 109  * wind_utf8ucs4_length()).
 110  * @param out_len before processing out_len should be the length of
 111  * the out variable, after processing it will be the length of the out
 112  * string.
 113  *
 114  * @return returns 0 on success, an wind error code otherwise
 115  * @ingroup wind
 116  */
 117 
 118 int
 119 wind_utf8ucs4(const char *in, uint32_t *out, size_t *out_len)
     /* [<][>][^][v][top][bottom][index][help] */
 120 {
 121     const unsigned char *p;
 122     size_t o = 0;
 123     int ret;
 124 
 125     for (p = (const unsigned char *)in; *p != '\0'; ++p) {
 126         uint32_t u;
 127 
 128         ret = utf8toutf32(&p, &u);
 129         if (ret)
 130             return ret;
 131 
 132         if (out) {
 133             if (o >= *out_len)
 134                 return WIND_ERR_OVERRUN;
 135             out[o] = u;
 136         }
 137         o++;
 138     }
 139     *out_len = o;
 140     return 0;
 141 }
 142 
 143 /**
 144  * Calculate the length of from converting a UTF-8 string to a UCS4
 145  * string.
 146  *
 147  * @param in an UTF-8 string to convert.
 148  * @param out_len the length of the resulting UCS4 string.
 149  *
 150  * @return returns 0 on success, an wind error code otherwise
 151  * @ingroup wind
 152  */
 153 
 154 int
 155 wind_utf8ucs4_length(const char *in, size_t *out_len)
     /* [<][>][^][v][top][bottom][index][help] */
 156 {
 157     return wind_utf8ucs4(in, NULL, out_len);
 158 }
 159 
 160 static const char first_char[4] =
 161     { 0x00, 0xC0, 0xE0, 0xF0 };
 162 
 163 /**
 164  * Convert an UCS4 string to a UTF-8 string.
 165  *
 166  * @param in an UCS4 string to convert.
 167  * @param in_len the length input array.
 168 
 169  * @param out the resulting UTF-8 strint, must be at least
 170  * wind_ucs4utf8_length() + 1 long (the extra char for the NUL).  If
 171  * out is NULL, the function will calculate the needed space for the
 172  * out variable (just like wind_ucs4utf8_length()).
 173 
 174  * @param out_len before processing out_len should be the length of
 175  * the out variable, after processing it will be the length of the out
 176  * string.
 177  *
 178  * @return returns 0 on success, an wind error code otherwise
 179  * @ingroup wind
 180  */
 181 
 182 int
 183 wind_ucs4utf8(const uint32_t *in, size_t in_len, char *out, size_t *out_len)
     /* [<][>][^][v][top][bottom][index][help] */
 184 {
 185     uint32_t ch;
 186     size_t i, len, o;
 187 
 188     for (o = 0, i = 0; i < in_len; i++) {
 189         ch = in[i];
 190         
 191         if (ch < 0x80) {
 192             len = 1;
 193         } else if (ch < 0x800) {
 194             len = 2;
 195         } else if (ch < 0x10000) {
 196             len = 3;
 197         } else if (ch <= 0x10FFFF) {
 198             len = 4;
 199         } else
 200             return WIND_ERR_INVALID_UTF32;
 201         
 202         o += len;
 203 
 204         if (out) {
 205             if (o >= *out_len)
 206                 return WIND_ERR_OVERRUN;
 207 
 208             switch(len) {
 209             case 4:
 210                 out[3] = (ch | 0x80) & 0xbf;
 211                 ch = ch << 6;
 212             case 3:
 213                 out[2] = (ch | 0x80) & 0xbf;
 214                 ch = ch << 6;
 215             case 2:
 216                 out[1] = (ch | 0x80) & 0xbf;
 217                 ch = ch << 6;
 218             case 1:
 219                 out[0] = ch | first_char[len - 1];
 220             }
 221         }
 222         out += len;
 223     }
 224     if (out) {
 225         if (o + 1 >= *out_len)
 226             return WIND_ERR_OVERRUN;
 227         *out = '\0';
 228     }
 229     *out_len = o;
 230     return 0;
 231 }
 232 
 233 /**
 234  * Calculate the length of from converting a UCS4 string to an UTF-8 string.
 235  *
 236  * @param in an UCS4 string to convert.
 237  * @param in_len the length of UCS4 string to convert.
 238  * @param out_len the length of the resulting UTF-8 string.
 239  *
 240  * @return returns 0 on success, an wind error code otherwise
 241  * @ingroup wind
 242  */
 243 
 244 int
 245 wind_ucs4utf8_length(const uint32_t *in, size_t in_len, size_t *out_len)
     /* [<][>][^][v][top][bottom][index][help] */
 246 {
 247     return wind_ucs4utf8(in, in_len, NULL, out_len);
 248 }
 249 
 250 /**
 251  * Read in an UCS2 from a buffer.
 252  *
 253  * @param ptr The input buffer to read from.
 254  * @param len the length of the input buffer.
 255  * @param flags Flags to control the behavior of the function.
 256  * @param out the output UCS2, the array must be at least out/2 long.
 257  * @param out_len the output length
 258  *
 259  * @return returns 0 on success, an wind error code otherwise.
 260  * @ingroup wind
 261  */
 262 
 263 int
 264 wind_ucs2read(const void *ptr, size_t len, unsigned int *flags,
     /* [<][>][^][v][top][bottom][index][help] */
 265               uint16_t *out, size_t *out_len)
 266 {
 267     const unsigned char *p = ptr;
 268     int little = ((*flags) & WIND_RW_LE);
 269     size_t olen = *out_len;
 270 
 271     /** if len is zero, flags are unchanged */
 272     if (len == 0) {
 273         *out_len = 0;
 274         return 0;
 275     }
 276 
 277     /** if len is odd, WIND_ERR_LENGTH_NOT_MOD2 is returned */
 278     if (len & 1)
 279         return WIND_ERR_LENGTH_NOT_MOD2;
 280 
 281     /**
 282      * If the flags WIND_RW_BOM is set, check for BOM. If not BOM is
 283      * found, check is LE/BE flag is already and use that otherwise
 284      * fail with WIND_ERR_NO_BOM. When done, clear WIND_RW_BOM and
 285      * the LE/BE flag and set the resulting LE/BE flag.
 286      */
 287     if ((*flags) & WIND_RW_BOM) {
 288         uint16_t bom = (p[0] << 8) + p[1];
 289         if (bom == 0xfffe || bom == 0xfeff) {
 290             little = (bom == 0xfffe);
 291             p += 2;
 292             len -= 2;
 293         } else if (((*flags) & (WIND_RW_LE|WIND_RW_BE)) != 0) {
 294             /* little already set */
 295         } else
 296             return WIND_ERR_NO_BOM;
 297         *flags = ((*flags) & ~(WIND_RW_BOM|WIND_RW_LE|WIND_RW_BE));
 298         *flags |= little ? WIND_RW_LE : WIND_RW_BE;
 299     }
 300 
 301     while (len) {
 302         if (olen < 1)
 303             return WIND_ERR_OVERRUN;
 304         if (little)
 305             *out = (p[1] << 8) + p[0];
 306         else
 307             *out = (p[0] << 8) + p[1];
 308         out++; p += 2; len -= 2; olen--;
 309     }
 310     *out_len -= olen;
 311     return 0;
 312 }
 313 
 314 /**
 315  * Write an UCS2 string to a buffer.
 316  *
 317  * @param in The input UCS2 string.
 318  * @param in_len the length of the input buffer.
 319  * @param flags Flags to control the behavior of the function.
 320  * @param ptr The input buffer to write to, the array must be at least
 321  * (in + 1) * 2 bytes long.
 322  * @param out_len the output length
 323  *
 324  * @return returns 0 on success, an wind error code otherwise.
 325  * @ingroup wind
 326  */
 327 
 328 int
 329 wind_ucs2write(const uint16_t *in, size_t in_len, unsigned int *flags,
     /* [<][>][^][v][top][bottom][index][help] */
 330                void *ptr, size_t *out_len)
 331 {
 332     unsigned char *p = ptr;
 333     size_t len = *out_len;
 334 
 335     /** If in buffer is not of length be mod 2, WIND_ERR_LENGTH_NOT_MOD2 is returned*/
 336     if (len & 1)
 337         return WIND_ERR_LENGTH_NOT_MOD2;
 338 
 339     /** On zero input length, flags are preserved */
 340     if (in_len == 0) {
 341         *out_len = 0;
 342         return 0;
 343     }
 344     /** If flags have WIND_RW_BOM set, the byte order mark is written
 345      * first to the output data */
 346     if ((*flags) & WIND_RW_BOM) {
 347         uint16_t bom = 0xfffe;
 348         
 349         if (len < 2)
 350             return WIND_ERR_OVERRUN;
 351 
 352         if ((*flags) & WIND_RW_LE) {
 353             p[0] = (bom >> 8) & 0xff;
 354             p[1] = (bom     ) & 0xff;
 355         } else {
 356             p[1] = (bom     ) & 0xff;
 357             p[0] = (bom >> 8) & 0xff;
 358         }
 359         len -= 2;
 360     }
 361 
 362     while (in_len) {
 363         /** If the output wont fit into out_len, WIND_ERR_OVERRUN is returned */
 364         if (len < 2)
 365             return WIND_ERR_OVERRUN;
 366         if ((*flags) & WIND_RW_LE) {
 367             p[0] = (in[0] >> 8) & 0xff;
 368             p[1] = (in[0]     ) & 0xff;
 369         } else {
 370             p[1] = (in[0]     ) & 0xff;
 371             p[0] = (in[0] >> 8) & 0xff;
 372         }
 373         len -= 2;
 374         in_len--;
 375         p += 2;
 376         in++;
 377     }
 378     *out_len -= len;
 379     return 0;
 380 }
 381 
 382 
 383 /**
 384  * Convert an UTF-8 string to an UCS2 string.
 385  *
 386  * @param in an UTF-8 string to convert.
 387  * @param out the resulting UCS2 strint, must be at least
 388  * wind_utf8ucs2_length() long.  If out is NULL, the function will
 389  * calculate the needed space for the out variable (just like
 390  * wind_utf8ucs2_length()).
 391  * @param out_len before processing out_len should be the length of
 392  * the out variable, after processing it will be the length of the out
 393  * string.
 394  *
 395  * @return returns 0 on success, an wind error code otherwise
 396  * @ingroup wind
 397  */
 398 
 399 int
 400 wind_utf8ucs2(const char *in, uint16_t *out, size_t *out_len)
     /* [<][>][^][v][top][bottom][index][help] */
 401 {
 402     const unsigned char *p;
 403     size_t o = 0;
 404     int ret;
 405 
 406     for (p = (const unsigned char *)in; *p != '\0'; ++p) {
 407         uint32_t u;
 408 
 409         ret = utf8toutf32(&p, &u);
 410         if (ret)
 411             return ret;
 412 
 413         if (u & 0xffff0000)
 414             return WIND_ERR_NOT_UTF16;
 415 
 416         if (out) {
 417             if (o >= *out_len)
 418                 return WIND_ERR_OVERRUN;
 419             out[o] = u;
 420         }
 421         o++;
 422     }
 423     *out_len = o;
 424     return 0;
 425 }
 426 
 427 /**
 428  * Calculate the length of from converting a UTF-8 string to a UCS2
 429  * string.
 430  *
 431  * @param in an UTF-8 string to convert.
 432  * @param out_len the length of the resulting UCS4 string.
 433  *
 434  * @return returns 0 on success, an wind error code otherwise
 435  * @ingroup wind
 436  */
 437 
 438 int
 439 wind_utf8ucs2_length(const char *in, size_t *out_len)
     /* [<][>][^][v][top][bottom][index][help] */
 440 {
 441     return wind_utf8ucs2(in, NULL, out_len);
 442 }
 443 
 444 /**
 445  * Convert an UCS2 string to a UTF-8 string.
 446  *
 447  * @param in an UCS2 string to convert.
 448  * @param in_len the length of the in UCS2 string.
 449  * @param out the resulting UTF-8 strint, must be at least
 450  * wind_ucs2utf8_length() long.  If out is NULL, the function will
 451  * calculate the needed space for the out variable (just like
 452  * wind_ucs2utf8_length()).
 453  * @param out_len before processing out_len should be the length of
 454  * the out variable, after processing it will be the length of the out
 455  * string.
 456  *
 457  * @return returns 0 on success, an wind error code otherwise
 458  * @ingroup wind
 459  */
 460 
 461 int
 462 wind_ucs2utf8(const uint16_t *in, size_t in_len, char *out, size_t *out_len)
     /* [<][>][^][v][top][bottom][index][help] */
 463 {
 464     uint16_t ch;
 465     size_t i, len, o;
 466 
 467     for (o = 0, i = 0; i < in_len; i++) {
 468         ch = in[i];
 469         
 470         if (ch < 0x80) {
 471             len = 1;
 472         } else if (ch < 0x800) {
 473             len = 2;
 474         } else
 475             len = 3;
 476         
 477         o += len;
 478 
 479         if (out) {
 480             if (o >= *out_len)
 481                 return WIND_ERR_OVERRUN;
 482 
 483             switch(len) {
 484             case 3:
 485                 out[2] = (ch | 0x80) & 0xbf;
 486                 ch = ch << 6;
 487             case 2:
 488                 out[1] = (ch | 0x80) & 0xbf;
 489                 ch = ch << 6;
 490             case 1:
 491                 out[0] = ch | first_char[len - 1];
 492             }
 493             out += len;
 494         }
 495     }
 496     if (out) {
 497         if (o >= *out_len)
 498             return WIND_ERR_OVERRUN;
 499         *out = '\0';
 500     }
 501     *out_len = o;
 502     return 0;
 503 }
 504 
 505 /**
 506  * Calculate the length of from converting a UCS2 string to an UTF-8 string.
 507  *
 508  * @param in an UCS2 string to convert.
 509  * @param in_len an UCS2 string length to convert.
 510  * @param out_len the length of the resulting UTF-8 string.
 511  *
 512  * @return returns 0 on success, an wind error code otherwise
 513  * @ingroup wind
 514  */
 515 
 516 int
 517 wind_ucs2utf8_length(const uint16_t *in, size_t in_len, size_t *out_len)
     /* [<][>][^][v][top][bottom][index][help] */
 518 {
 519     return wind_ucs2utf8(in, in_len, NULL, out_len);
 520 }

/* [<][>][^][v][top][bottom][index][help] */