root/source4/heimdal/lib/wind/normalize.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes following definitions.
  1. translation_cmp
  2. hangul_decomp
  3. hangul_composition
  4. compat_decomp
  5. cc_cmp
  6. canonical_reorder
  7. find_composition
  8. combine
  9. _wind_stringprep_normalize

   1 /*
   2  * Copyright (c) 2004 Kungliga Tekniska Högskolan
   3  * (Royal Institute of Technology, Stockholm, Sweden).
   4  * All rights reserved.
   5  *
   6  * Redistribution and use in source and binary forms, with or without
   7  * modification, are permitted provided that the following conditions
   8  * are met:
   9  *
  10  * 1. Redistributions of source code must retain the above copyright
  11  *    notice, this list of conditions and the following disclaimer.
  12  *
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in the
  15  *    documentation and/or other materials provided with the distribution.
  16  *
  17  * 3. Neither the name of the Institute nor the names of its contributors
  18  *    may be used to endorse or promote products derived from this software
  19  *    without specific prior written permission.
  20  *
  21  * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND
  22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE
  25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  31  * SUCH DAMAGE.
  32  */
  33 
  34 #ifdef HAVE_CONFIG_H
  35 #include <config.h>
  36 #endif
  37 #include "windlocl.h"
  38 
  39 #include <assert.h>
  40 #include <stdlib.h>
  41 #include <errno.h>
  42 
  43 #include "normalize_table.h"
  44 
  45 RCSID("$Id$");
  46 
  47 static int
  48 translation_cmp(const void *key, const void *data)
     /* [<][>][^][v][top][bottom][index][help] */
  49 {
  50     const struct translation *t1 = (const struct translation *)key;
  51     const struct translation *t2 = (const struct translation *)data;
  52 
  53     return t1->key - t2->key;
  54 }
  55 
  56 enum { s_base  = 0xAC00};
  57 enum { s_count = 11172};
  58 enum { l_base  = 0x1100};
  59 enum { l_count = 19};
  60 enum { v_base  = 0x1161};
  61 enum { v_count = 21};
  62 enum { t_base  = 0x11A7};
  63 enum { t_count = 28};
  64 enum { n_count = v_count * t_count};
  65 
  66 static int
  67 hangul_decomp(const uint32_t *in, size_t in_len,
     /* [<][>][^][v][top][bottom][index][help] */
  68               uint32_t *out, size_t *out_len)
  69 {
  70     uint32_t u = *in;
  71     unsigned s_index;
  72     unsigned l, v, t;
  73     unsigned o;
  74 
  75     if (u < s_base || u >= s_base + s_count)
  76         return 0;
  77     s_index = u - s_base;
  78     l = l_base + s_index / n_count;
  79     v = v_base + (s_index % n_count) / t_count;
  80     t = t_base + s_index % t_count;
  81     o = 2;
  82     if (t != t_base)
  83         ++o;
  84     if (*out_len < o)
  85         return WIND_ERR_OVERRUN;
  86     out[0] = l;
  87     out[1] = v;
  88     if (t != t_base)
  89         out[2] = t;
  90     *out_len = o;
  91     return 1;
  92 }
  93 
  94 static uint32_t
  95 hangul_composition(const uint32_t *in, size_t in_len)
     /* [<][>][^][v][top][bottom][index][help] */
  96 {
  97     if (in_len < 2)
  98         return 0;
  99     if (in[0] >= l_base && in[0] < l_base + l_count) {
 100         unsigned l_index = in[0] - l_base;
 101         unsigned v_index;
 102 
 103         if (in[1] < v_base || in[1] >= v_base + v_count)
 104             return 0;
 105         v_index = in[1] - v_base;
 106         return (l_index * v_count + v_index) * t_count + s_base;
 107     } else if (in[0] >= s_base && in[0] < s_base + s_count) {
 108         unsigned s_index = in[0] - s_base;
 109         unsigned t_index;
 110 
 111         if (s_index % t_count != 0)
 112             return 0;
 113         if (in[1] < t_base || in[1] >= t_base + t_count)
 114             return 0;
 115         t_index = in[1] - t_base;
 116         return in[0] + t_index;
 117     }
 118     return 0;
 119 }
 120 
 121 static int
 122 compat_decomp(const uint32_t *in, size_t in_len,
     /* [<][>][^][v][top][bottom][index][help] */
 123               uint32_t *out, size_t *out_len)
 124 {
 125     unsigned i;
 126     unsigned o = 0;
 127 
 128     for (i = 0; i < in_len; ++i) {
 129         struct translation ts = {in[i]};
 130         size_t sub_len = *out_len - o;
 131         int ret;
 132         
 133         ret = hangul_decomp(in + i, in_len - i,
 134                             out + o, &sub_len);
 135         if (ret) {
 136             if (ret == WIND_ERR_OVERRUN)
 137                 return ret;
 138             o += sub_len;
 139         } else {
 140             void *s = bsearch(&ts,
 141                               _wind_normalize_table,
 142                               _wind_normalize_table_size,
 143                               sizeof(_wind_normalize_table[0]),
 144                               translation_cmp);
 145             if (s != NULL) {
 146                 const struct translation *t = (const struct translation *)s;
 147 
 148                 ret = compat_decomp(_wind_normalize_val_table + t->val_offset,
 149                                     t->val_len,
 150                                     out + o, &sub_len);
 151                 if (ret)
 152                     return ret;
 153                 o += sub_len;
 154             } else {
 155                 if (o >= *out_len)
 156                     return WIND_ERR_OVERRUN;
 157                 out[o++] = in[i];
 158 
 159             }
 160         }
 161     }
 162     *out_len = o;
 163     return 0;
 164 }
 165 
 166 static int
 167 cc_cmp(const void *a, const void *b)
     /* [<][>][^][v][top][bottom][index][help] */
 168 {
 169     const uint32_t *ua = (const uint32_t *)a;
 170     const uint32_t *ub = (const uint32_t *)b;
 171 
 172     return _wind_combining_class(*ua) - _wind_combining_class(*ub);
 173 }
 174 
 175 static void
 176 canonical_reorder(uint32_t *tmp, size_t tmp_len)
     /* [<][>][^][v][top][bottom][index][help] */
 177 {
 178     unsigned i;
 179 
 180     for (i = 0; i < tmp_len; ++i) {
 181         int cc = _wind_combining_class(tmp[i]);
 182         if (cc) {
 183             size_t j;
 184             for (j = i + 1;
 185                  j < tmp_len && _wind_combining_class(tmp[j]);
 186                  ++j)
 187                 ;
 188             qsort(&tmp[i], j - i, sizeof(unsigned),
 189                   cc_cmp);
 190             i = j;
 191         }
 192     }
 193 }
 194 
 195 static uint32_t
 196 find_composition(const uint32_t *in, unsigned in_len)
     /* [<][>][^][v][top][bottom][index][help] */
 197 {
 198     unsigned short canon_index = 0;
 199     uint32_t cur;
 200     unsigned n = 0;
 201 
 202     cur = hangul_composition(in, in_len);
 203     if (cur)
 204         return cur;
 205 
 206     do {
 207         const struct canon_node *c = &_wind_canon_table[canon_index];
 208         unsigned i;
 209 
 210         if (n % 5 == 0) {
 211             cur = *in++;
 212             if (in_len-- == 0)
 213                 return c->val;
 214         }
 215 
 216         i = cur >> 16;
 217         if (i < c->next_start || i >= c->next_end)
 218             canon_index = 0;
 219         else
 220             canon_index =
 221                 _wind_canon_next_table[c->next_offset + i - c->next_start];
 222         if (canon_index != 0) {
 223             cur = (cur << 4) & 0xFFFFF;
 224             ++n;
 225         }
 226     } while (canon_index != 0);
 227     return 0;
 228 }
 229 
 230 static int
 231 combine(const uint32_t *in, size_t in_len,
     /* [<][>][^][v][top][bottom][index][help] */
 232         uint32_t *out, size_t *out_len)
 233 {
 234     unsigned i;
 235     int ostarter;
 236     unsigned o = 0;
 237     int old_cc;
 238     int cc;
 239 
 240     for (i = 0; i < in_len;) {
 241         while (i < in_len && (cc = _wind_combining_class(in[i])) != 0) {
 242             out[o++] = in[i++];
 243         }
 244         if (i < in_len) {
 245             if (o >= *out_len)
 246                 return WIND_ERR_OVERRUN;
 247             ostarter = o;
 248             out[o++] = in[i++];
 249             old_cc   = -1;
 250 
 251             while (i < in_len) {
 252                 uint32_t comb;
 253                 uint32_t v[2];
 254 
 255                 v[0] = out[ostarter];
 256                 v[1] = in[i];
 257 
 258                 cc = _wind_combining_class(in[i]);
 259                 if (old_cc != cc && (comb = find_composition(v, 2))) {
 260                     out[ostarter] = comb;
 261                 } else if (cc == 0) {
 262                     break;
 263                 } else {
 264                     if (o >= *out_len)
 265                         return WIND_ERR_OVERRUN;
 266                     out[o++] = in[i];
 267                     old_cc   = cc;
 268                 }
 269                 ++i;
 270             }
 271         }
 272     }
 273     *out_len = o;
 274     return 0;
 275 }
 276 
 277 int
 278 _wind_stringprep_normalize(const uint32_t *in, size_t in_len,
     /* [<][>][^][v][top][bottom][index][help] */
 279                            uint32_t *out, size_t *out_len)
 280 {
 281     size_t tmp_len;
 282     uint32_t *tmp;
 283     int ret;
 284 
 285     tmp_len = in_len * 4;
 286     if (tmp_len < MAX_LENGTH_CANON)
 287         tmp_len = MAX_LENGTH_CANON;
 288     tmp = malloc(tmp_len * sizeof(uint32_t));
 289     if (tmp == NULL)
 290         return ENOMEM;
 291 
 292     ret = compat_decomp(in, in_len, tmp, &tmp_len);
 293     if (ret) {
 294         free(tmp);
 295         return ret;
 296     }
 297     canonical_reorder(tmp, tmp_len);
 298     ret = combine(tmp, tmp_len, out, out_len);
 299     free(tmp);
 300     return ret;
 301 }

/* [<][>][^][v][top][bottom][index][help] */