/* Libparserutils */
00001 /* 00002 * This file is part of LibParserUtils. 00003 * Licensed under the MIT License, 00004 * http://www.opensource.org/licenses/mit-license.php 00005 * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org> 00006 */ 00007 00008 #include <assert.h> 00009 #include <stdlib.h> 00010 #include <string.h> 00011 00012 #include <parserutils/charset/mibenum.h> 00013 #include <parserutils/charset/utf16.h> 00014 00015 #include "charset/codecs/codec_impl.h" 00016 #include "utils/endian.h" 00017 #include "utils/utils.h" 00018 00022 typedef struct charset_utf16_codec { 00023 parserutils_charset_codec base; 00025 #define INVAL_BUFSIZE (32) 00026 uint8_t inval_buf[INVAL_BUFSIZE]; 00029 size_t inval_len; /*< Byte length of inval_buf **/ 00030 00031 #define READ_BUFSIZE (8) 00032 uint32_t read_buf[READ_BUFSIZE]; 00035 size_t read_len; 00037 #define WRITE_BUFSIZE (8) 00038 uint32_t write_buf[WRITE_BUFSIZE]; 00041 size_t write_len; 00043 } charset_utf16_codec; 00044 00045 static bool charset_utf16_codec_handles_charset(const char *charset); 00046 static parserutils_error charset_utf16_codec_create(const char *charset, 00047 parserutils_charset_codec **codec); 00048 static parserutils_error charset_utf16_codec_destroy( 00049 parserutils_charset_codec *codec); 00050 static parserutils_error charset_utf16_codec_encode( 00051 parserutils_charset_codec *codec, 00052 const uint8_t **source, size_t *sourcelen, 00053 uint8_t **dest, size_t *destlen); 00054 static parserutils_error charset_utf16_codec_decode( 00055 parserutils_charset_codec *codec, 00056 const uint8_t **source, size_t *sourcelen, 00057 uint8_t **dest, size_t *destlen); 00058 static parserutils_error charset_utf16_codec_reset( 00059 parserutils_charset_codec *codec); 00060 static inline parserutils_error charset_utf16_codec_read_char( 00061 charset_utf16_codec *c, 00062 const uint8_t **source, size_t *sourcelen, 00063 uint8_t **dest, size_t *destlen); 00064 static inline parserutils_error 
charset_utf16_codec_output_decoded_char( 00065 charset_utf16_codec *c, 00066 uint32_t ucs4, uint8_t **dest, size_t *destlen); 00067 00074 bool charset_utf16_codec_handles_charset(const char *charset) 00075 { 00076 return parserutils_charset_mibenum_from_name(charset, strlen(charset)) 00077 == 00078 parserutils_charset_mibenum_from_name("UTF-16", SLEN("UTF-16")); 00079 } 00080 00090 parserutils_error charset_utf16_codec_create(const char *charset, 00091 parserutils_charset_codec **codec) 00092 { 00093 charset_utf16_codec *c; 00094 00095 UNUSED(charset); 00096 00097 c = malloc(sizeof(charset_utf16_codec)); 00098 if (c == NULL) 00099 return PARSERUTILS_NOMEM; 00100 00101 c->inval_buf[0] = '\0'; 00102 c->inval_len = 0; 00103 00104 c->read_buf[0] = 0; 00105 c->read_len = 0; 00106 00107 c->write_buf[0] = 0; 00108 c->write_len = 0; 00109 00110 /* Finally, populate vtable */ 00111 c->base.handler.destroy = charset_utf16_codec_destroy; 00112 c->base.handler.encode = charset_utf16_codec_encode; 00113 c->base.handler.decode = charset_utf16_codec_decode; 00114 c->base.handler.reset = charset_utf16_codec_reset; 00115 00116 *codec = (parserutils_charset_codec *) c; 00117 00118 return PARSERUTILS_OK; 00119 } 00120 00127 parserutils_error charset_utf16_codec_destroy (parserutils_charset_codec *codec) 00128 { 00129 UNUSED(codec); 00130 00131 return PARSERUTILS_OK; 00132 } 00133 00161 parserutils_error charset_utf16_codec_encode(parserutils_charset_codec *codec, 00162 const uint8_t **source, size_t *sourcelen, 00163 uint8_t **dest, size_t *destlen) 00164 { 00165 charset_utf16_codec *c = (charset_utf16_codec *) codec; 00166 uint32_t ucs4; 00167 uint32_t *towrite; 00168 size_t towritelen; 00169 parserutils_error error; 00170 00171 /* Process any outstanding characters from the previous call */ 00172 if (c->write_len > 0) { 00173 uint32_t *pwrite = c->write_buf; 00174 uint8_t buf[4]; 00175 size_t len; 00176 00177 while (c->write_len > 0) { 00178 error = 
parserutils_charset_utf16_from_ucs4( 00179 pwrite[0], buf, &len); 00180 assert(error == PARSERUTILS_OK); 00181 00182 if (*destlen < len) { 00183 /* Insufficient output buffer space */ 00184 for (len = 0; len < c->write_len; len++) 00185 c->write_buf[len] = pwrite[len]; 00186 00187 return PARSERUTILS_NOMEM; 00188 } 00189 00190 memcpy(*dest, buf, len); 00191 00192 *dest += len; 00193 *destlen -= len; 00194 00195 pwrite++; 00196 c->write_len--; 00197 } 00198 } 00199 00200 /* Now process the characters for this call */ 00201 while (*sourcelen > 0) { 00202 ucs4 = endian_big_to_host(*((uint32_t *) (void *) *source)); 00203 towrite = &ucs4; 00204 towritelen = 1; 00205 00206 /* Output current characters */ 00207 while (towritelen > 0) { 00208 uint8_t buf[4]; 00209 size_t len; 00210 00211 error = parserutils_charset_utf16_from_ucs4( 00212 towrite[0], buf, &len); 00213 assert(error == PARSERUTILS_OK); 00214 00215 if (*destlen < len) { 00216 /* Insufficient output space */ 00217 assert(towritelen < WRITE_BUFSIZE); 00218 00219 c->write_len = towritelen; 00220 00221 /* Copy pending chars to save area, for 00222 * processing next call. 
*/ 00223 for (len = 0; len < towritelen; len++) 00224 c->write_buf[len] = towrite[len]; 00225 00226 /* Claim character we've just buffered, 00227 * so it's not reprocessed */ 00228 *source += 4; 00229 *sourcelen -= 4; 00230 00231 return PARSERUTILS_NOMEM; 00232 } 00233 00234 memcpy(*dest, buf, len); 00235 00236 *dest += len; 00237 *destlen -= len; 00238 00239 towrite++; 00240 towritelen--; 00241 } 00242 00243 *source += 4; 00244 *sourcelen -= 4; 00245 } 00246 00247 (void) error; 00248 00249 return PARSERUTILS_OK; 00250 } 00251 00293 parserutils_error charset_utf16_codec_decode(parserutils_charset_codec *codec, 00294 const uint8_t **source, size_t *sourcelen, 00295 uint8_t **dest, size_t *destlen) 00296 { 00297 charset_utf16_codec *c = (charset_utf16_codec *) codec; 00298 parserutils_error error; 00299 00300 if (c->read_len > 0) { 00301 /* Output left over from last decode */ 00302 uint32_t *pread = c->read_buf; 00303 00304 while (c->read_len > 0 && *destlen >= c->read_len * 4) { 00305 *((uint32_t *) (void *) *dest) = 00306 endian_host_to_big(pread[0]); 00307 00308 *dest += 4; 00309 *destlen -= 4; 00310 00311 pread++; 00312 c->read_len--; 00313 } 00314 00315 if (*destlen < c->read_len * 4) { 00316 /* Ran out of output buffer */ 00317 size_t i; 00318 00319 /* Shuffle remaining output down */ 00320 for (i = 0; i < c->read_len; i++) 00321 c->read_buf[i] = pread[i]; 00322 00323 return PARSERUTILS_NOMEM; 00324 } 00325 } 00326 00327 if (c->inval_len > 0) { 00328 /* The last decode ended in an incomplete sequence. 00329 * Fill up inval_buf with data from the start of the 00330 * new chunk and process it. 
*/ 00331 uint8_t *in = c->inval_buf; 00332 size_t ol = c->inval_len; 00333 size_t l = min(INVAL_BUFSIZE - ol - 1, *sourcelen); 00334 size_t orig_l = l; 00335 00336 memcpy(c->inval_buf + ol, *source, l); 00337 00338 l += c->inval_len; 00339 00340 error = charset_utf16_codec_read_char(c, 00341 (const uint8_t **) &in, &l, dest, destlen); 00342 if (error != PARSERUTILS_OK && error != PARSERUTILS_NOMEM) { 00343 return error; 00344 } 00345 00346 /* And now, fix up source pointers */ 00347 *source += max((signed) (orig_l - l), 0); 00348 *sourcelen -= max((signed) (orig_l - l), 0); 00349 00350 /* Failed to resolve an incomplete character and 00351 * ran out of buffer space. No recovery strategy 00352 * possible, so explode everywhere. */ 00353 assert((orig_l + ol) - l != 0); 00354 00355 /* Report memory exhaustion case from above */ 00356 if (error != PARSERUTILS_OK) 00357 return error; 00358 } 00359 00360 /* Finally, the "normal" case; process all outstanding characters */ 00361 while (*sourcelen > 0) { 00362 error = charset_utf16_codec_read_char(c, 00363 source, sourcelen, dest, destlen); 00364 if (error != PARSERUTILS_OK) { 00365 return error; 00366 } 00367 } 00368 00369 return PARSERUTILS_OK; 00370 } 00371 00378 parserutils_error charset_utf16_codec_reset(parserutils_charset_codec *codec) 00379 { 00380 charset_utf16_codec *c = (charset_utf16_codec *) codec; 00381 00382 c->inval_buf[0] = '\0'; 00383 c->inval_len = 0; 00384 00385 c->read_buf[0] = 0; 00386 c->read_len = 0; 00387 00388 c->write_buf[0] = 0; 00389 c->write_len = 0; 00390 00391 return PARSERUTILS_OK; 00392 } 00393 00394 00423 parserutils_error charset_utf16_codec_read_char(charset_utf16_codec *c, 00424 const uint8_t **source, size_t *sourcelen, 00425 uint8_t **dest, size_t *destlen) 00426 { 00427 uint32_t ucs4; 00428 size_t sucs4; 00429 parserutils_error error; 00430 00431 /* Convert a single character */ 00432 error = parserutils_charset_utf16_to_ucs4(*source, *sourcelen, 00433 &ucs4, &sucs4); 00434 if 
(error == PARSERUTILS_OK) { 00435 /* Read a character */ 00436 error = charset_utf16_codec_output_decoded_char(c, 00437 ucs4, dest, destlen); 00438 if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) { 00439 /* output succeeded; update source pointers */ 00440 *source += sucs4; 00441 *sourcelen -= sucs4; 00442 } 00443 00444 /* Clear inval buffer */ 00445 c->inval_buf[0] = '\0'; 00446 c->inval_len = 0; 00447 00448 return error; 00449 } else if (error == PARSERUTILS_NEEDDATA) { 00450 /* Incomplete input sequence */ 00451 assert(*sourcelen < INVAL_BUFSIZE); 00452 00453 memmove(c->inval_buf, *source, *sourcelen); 00454 c->inval_buf[*sourcelen] = '\0'; 00455 c->inval_len = *sourcelen; 00456 00457 *source += *sourcelen; 00458 *sourcelen = 0; 00459 00460 return PARSERUTILS_OK; 00461 } else if (error == PARSERUTILS_INVALID) { 00462 /* Illegal input sequence */ 00463 uint32_t nextchar; 00464 00465 /* Clear inval buffer */ 00466 c->inval_buf[0] = '\0'; 00467 c->inval_len = 0; 00468 00469 /* Strict errormode; simply flag invalid character */ 00470 if (c->base.errormode == 00471 PARSERUTILS_CHARSET_CODEC_ERROR_STRICT) { 00472 return PARSERUTILS_INVALID; 00473 } 00474 00475 /* Find next valid UTF-16 sequence. 00476 * We're processing client-provided data, so let's 00477 * be paranoid about its validity. */ 00478 error = parserutils_charset_utf16_next_paranoid( 00479 *source, *sourcelen, 0, &nextchar); 00480 if (error != PARSERUTILS_OK) { 00481 if (error == PARSERUTILS_NEEDDATA) { 00482 /* Need more data to be sure */ 00483 assert(*sourcelen < INVAL_BUFSIZE); 00484 00485 memmove(c->inval_buf, *source, *sourcelen); 00486 c->inval_buf[*sourcelen] = '\0'; 00487 c->inval_len = *sourcelen; 00488 00489 *source += *sourcelen; 00490 *sourcelen = 0; 00491 00492 nextchar = 0; 00493 } else { 00494 return error; 00495 } 00496 } 00497 00498 /* output U+FFFD and continue processing. 
*/ 00499 error = charset_utf16_codec_output_decoded_char(c, 00500 0xFFFD, dest, destlen); 00501 if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) { 00502 /* output succeeded; update source pointers */ 00503 *source += nextchar; 00504 *sourcelen -= nextchar; 00505 } 00506 00507 return error; 00508 } 00509 00510 return PARSERUTILS_OK; 00511 } 00512 00523 parserutils_error charset_utf16_codec_output_decoded_char(charset_utf16_codec *c, 00524 uint32_t ucs4, uint8_t **dest, size_t *destlen) 00525 { 00526 if (*destlen < 4) { 00527 /* Run out of output buffer */ 00528 c->read_len = 1; 00529 c->read_buf[0] = ucs4; 00530 00531 return PARSERUTILS_NOMEM; 00532 } 00533 00534 *((uint32_t *) (void *) *dest) = endian_host_to_big(ucs4); 00535 *dest += 4; 00536 *destlen -= 4; 00537 00538 return PARSERUTILS_OK; 00539 } 00540 00541 00542 const parserutils_charset_handler charset_utf16_codec_handler = { 00543 charset_utf16_codec_handles_charset, 00544 charset_utf16_codec_create 00545 };
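/* Listing generator version: 1.7.3 (presumably the Doxygen release that produced this page -- confirm) */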