|
Libparserutils
|
00001 /* 00002 * This file is part of LibParserUtils. 00003 * Licensed under the MIT License, 00004 * http://www.opensource.org/licenses/mit-license.php 00005 * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org> 00006 */ 00007 00008 #include <assert.h> 00009 #include <stdlib.h> 00010 #include <string.h> 00011 00012 #include <parserutils/charset/mibenum.h> 00013 00014 #include "charset/codecs/codec_impl.h" 00015 #include "charset/encodings/utf8impl.h" 00016 #include "utils/endian.h" 00017 #include "utils/utils.h" 00018 00022 typedef struct charset_utf8_codec { 00023 parserutils_charset_codec base; 00025 #define INVAL_BUFSIZE (32) 00026 uint8_t inval_buf[INVAL_BUFSIZE]; 00029 size_t inval_len; /*< Byte length of inval_buf **/ 00030 00031 #define READ_BUFSIZE (8) 00032 uint32_t read_buf[READ_BUFSIZE]; 00035 size_t read_len; 00037 #define WRITE_BUFSIZE (8) 00038 uint32_t write_buf[WRITE_BUFSIZE]; 00041 size_t write_len; 00043 } charset_utf8_codec; 00044 00045 static bool charset_utf8_codec_handles_charset(const char *charset); 00046 static parserutils_error charset_utf8_codec_create(const char *charset, 00047 parserutils_charset_codec **codec); 00048 static parserutils_error charset_utf8_codec_destroy( 00049 parserutils_charset_codec *codec); 00050 static parserutils_error charset_utf8_codec_encode( 00051 parserutils_charset_codec *codec, 00052 const uint8_t **source, size_t *sourcelen, 00053 uint8_t **dest, size_t *destlen); 00054 static parserutils_error charset_utf8_codec_decode( 00055 parserutils_charset_codec *codec, 00056 const uint8_t **source, size_t *sourcelen, 00057 uint8_t **dest, size_t *destlen); 00058 static parserutils_error charset_utf8_codec_reset( 00059 parserutils_charset_codec *codec); 00060 static inline parserutils_error charset_utf8_codec_read_char( 00061 charset_utf8_codec *c, 00062 const uint8_t **source, size_t *sourcelen, 00063 uint8_t **dest, size_t *destlen); 00064 static inline parserutils_error charset_utf8_codec_output_decoded_char( 00065 charset_utf8_codec *c, 00066 uint32_t ucs4, uint8_t **dest, size_t *destlen); 00067 00074 bool charset_utf8_codec_handles_charset(const char *charset) 00075 { 00076 return parserutils_charset_mibenum_from_name(charset, 00077 strlen(charset)) == 00078 parserutils_charset_mibenum_from_name("UTF-8", 00079 SLEN("UTF-8")); 00080 } 00081 00091 parserutils_error charset_utf8_codec_create(const char *charset, 00092 parserutils_charset_codec **codec) 00093 { 00094 charset_utf8_codec *c; 00095 00096 UNUSED(charset); 00097 00098 c = malloc(sizeof(charset_utf8_codec)); 00099 if (c == NULL) 00100 return PARSERUTILS_NOMEM; 00101 00102 c->inval_buf[0] = '\0'; 00103 c->inval_len = 0; 00104 00105 c->read_buf[0] = 0; 00106 c->read_len = 0; 00107 00108 c->write_buf[0] = 0; 00109 c->write_len = 0; 00110 00111 /* Finally, populate vtable */ 00112 c->base.handler.destroy = charset_utf8_codec_destroy; 00113 c->base.handler.encode = charset_utf8_codec_encode; 00114 c->base.handler.decode = charset_utf8_codec_decode; 00115 c->base.handler.reset = charset_utf8_codec_reset; 00116 00117 *codec = (parserutils_charset_codec *) c; 00118 00119 return PARSERUTILS_OK; 00120 } 00121 00128 parserutils_error charset_utf8_codec_destroy (parserutils_charset_codec *codec) 00129 { 00130 UNUSED(codec); 00131 00132 return PARSERUTILS_OK; 00133 } 00134 00162 parserutils_error charset_utf8_codec_encode(parserutils_charset_codec *codec, 00163 const uint8_t **source, size_t *sourcelen, 00164 uint8_t **dest, size_t *destlen) 00165 { 00166 charset_utf8_codec *c = (charset_utf8_codec *) codec; 00167 uint32_t ucs4; 00168 uint32_t *towrite; 00169 size_t towritelen; 00170 parserutils_error error; 00171 00172 /* Process any outstanding characters from the previous call */ 00173 if (c->write_len > 0) { 00174 uint32_t *pwrite = c->write_buf; 00175 00176 while (c->write_len > 0) { 00177 UTF8_FROM_UCS4(pwrite[0], dest, destlen, error); 00178 if (error != PARSERUTILS_OK) { 00179 uint32_t len; 00180 assert(error == PARSERUTILS_NOMEM); 00181 00182 /* Insufficient output buffer space */ 00183 for (len = 0; len < c->write_len; len++) { 00184 c->write_buf[len] = pwrite[len]; 00185 } 00186 00187 return PARSERUTILS_NOMEM; 00188 } 00189 00190 pwrite++; 00191 c->write_len--; 00192 } 00193 } 00194 00195 /* Now process the characters for this call */ 00196 while (*sourcelen > 0) { 00197 ucs4 = endian_big_to_host(*((uint32_t *) (void *) *source)); 00198 towrite = &ucs4; 00199 towritelen = 1; 00200 00201 /* Output current characters */ 00202 while (towritelen > 0) { 00203 UTF8_FROM_UCS4(towrite[0], dest, destlen, error); 00204 if (error != PARSERUTILS_OK) { 00205 uint32_t len; 00206 assert(error == PARSERUTILS_NOMEM); 00207 00208 /* Insufficient output space */ 00209 assert(towritelen < WRITE_BUFSIZE); 00210 00211 c->write_len = towritelen; 00212 00213 /* Copy pending chars to save area, for 00214 * processing next call. */ 00215 for (len = 0; len < towritelen; len++) 00216 c->write_buf[len] = towrite[len]; 00217 00218 /* Claim character we've just buffered, 00219 * so it's not reprocessed */ 00220 *source += 4; 00221 *sourcelen -= 4; 00222 00223 return PARSERUTILS_NOMEM; 00224 } 00225 00226 towrite++; 00227 towritelen--; 00228 } 00229 00230 *source += 4; 00231 *sourcelen -= 4; 00232 } 00233 00234 return PARSERUTILS_OK; 00235 } 00236 00278 parserutils_error charset_utf8_codec_decode(parserutils_charset_codec *codec, 00279 const uint8_t **source, size_t *sourcelen, 00280 uint8_t **dest, size_t *destlen) 00281 { 00282 charset_utf8_codec *c = (charset_utf8_codec *) codec; 00283 parserutils_error error; 00284 00285 if (c->read_len > 0) { 00286 /* Output left over from last decode */ 00287 uint32_t *pread = c->read_buf; 00288 00289 while (c->read_len > 0 && *destlen >= c->read_len * 4) { 00290 *((uint32_t *) (void *) *dest) = 00291 endian_host_to_big(pread[0]); 00292 00293 *dest += 4; 00294 *destlen -= 4; 00295 00296 pread++; 00297 c->read_len--; 00298 } 00299 00300 if (*destlen < c->read_len * 4) { 00301 /* Ran out of output buffer */ 00302 size_t i; 00303 00304 /* Shuffle remaining output down */ 00305 for (i = 0; i < c->read_len; i++) 00306 c->read_buf[i] = pread[i]; 00307 00308 return PARSERUTILS_NOMEM; 00309 } 00310 } 00311 00312 if (c->inval_len > 0) { 00313 /* The last decode ended in an incomplete sequence. 00314 * Fill up inval_buf with data from the start of the 00315 * new chunk and process it. */ 00316 uint8_t *in = c->inval_buf; 00317 size_t ol = c->inval_len; 00318 size_t l = min(INVAL_BUFSIZE - ol - 1, *sourcelen); 00319 size_t orig_l = l; 00320 00321 memcpy(c->inval_buf + ol, *source, l); 00322 00323 l += c->inval_len; 00324 00325 error = charset_utf8_codec_read_char(c, 00326 (const uint8_t **) &in, &l, dest, destlen); 00327 if (error != PARSERUTILS_OK && error != PARSERUTILS_NOMEM) { 00328 return error; 00329 } 00330 00331 /* And now, fix up source pointers */ 00332 *source += max((signed) (orig_l - l), 0); 00333 *sourcelen -= max((signed) (orig_l - l), 0); 00334 00335 /* Failed to resolve an incomplete character and 00336 * ran out of buffer space. No recovery strategy 00337 * possible, so explode everywhere. */ 00338 assert((orig_l + ol) - l != 0); 00339 00340 /* Report memory exhaustion case from above */ 00341 if (error != PARSERUTILS_OK) 00342 return error; 00343 } 00344 00345 /* Finally, the "normal" case; process all outstanding characters */ 00346 while (*sourcelen > 0) { 00347 error = charset_utf8_codec_read_char(c, 00348 source, sourcelen, dest, destlen); 00349 if (error != PARSERUTILS_OK) { 00350 return error; 00351 } 00352 } 00353 00354 return PARSERUTILS_OK; 00355 } 00356 00363 parserutils_error charset_utf8_codec_reset(parserutils_charset_codec *codec) 00364 { 00365 charset_utf8_codec *c = (charset_utf8_codec *) codec; 00366 00367 c->inval_buf[0] = '\0'; 00368 c->inval_len = 0; 00369 00370 c->read_buf[0] = 0; 00371 c->read_len = 0; 00372 00373 c->write_buf[0] = 0; 00374 c->write_len = 0; 00375 00376 return PARSERUTILS_OK; 00377 } 00378 00379 00408 parserutils_error charset_utf8_codec_read_char(charset_utf8_codec *c, 00409 const uint8_t **source, size_t *sourcelen, 00410 uint8_t **dest, size_t *destlen) 00411 { 00412 uint32_t ucs4; 00413 size_t sucs4; 00414 parserutils_error error; 00415 00416 /* Convert a single character */ 00417 { 00418 const uint8_t *src = *source; 00419 size_t srclen = *sourcelen; 00420 uint32_t *uptr = &ucs4; 00421 size_t *usptr = &sucs4; 00422 UTF8_TO_UCS4(src, srclen, uptr, usptr, error); 00423 } 00424 if (error == PARSERUTILS_OK) { 00425 /* Read a character */ 00426 error = charset_utf8_codec_output_decoded_char(c, 00427 ucs4, dest, destlen); 00428 if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) { 00429 /* output succeeded; update source pointers */ 00430 *source += sucs4; 00431 *sourcelen -= sucs4; 00432 } 00433 00434 /* Clear inval buffer */ 00435 c->inval_buf[0] = '\0'; 00436 c->inval_len = 0; 00437 00438 return error; 00439 } else if (error == PARSERUTILS_NEEDDATA) { 00440 /* Incomplete input sequence */ 00441 assert(*sourcelen < INVAL_BUFSIZE); 00442 00443 memmove(c->inval_buf, *source, *sourcelen); 00444 c->inval_buf[*sourcelen] = '\0'; 00445 c->inval_len = *sourcelen; 00446 00447 *source += *sourcelen; 00448 *sourcelen = 0; 00449 00450 return PARSERUTILS_OK; 00451 } else if (error == PARSERUTILS_INVALID) { 00452 /* Illegal input sequence */ 00453 uint32_t nextchar; 00454 00455 /* Strict errormode; simply flag invalid character */ 00456 if (c->base.errormode == 00457 PARSERUTILS_CHARSET_CODEC_ERROR_STRICT) { 00458 /* Clear inval buffer */ 00459 c->inval_buf[0] = '\0'; 00460 c->inval_len = 0; 00461 00462 return PARSERUTILS_INVALID; 00463 } 00464 00465 /* Find next valid UTF-8 sequence. 00466 * We're processing client-provided data, so let's 00467 * be paranoid about its validity. */ 00468 { 00469 const uint8_t *src = *source; 00470 size_t srclen = *sourcelen; 00471 uint32_t off = 0; 00472 uint32_t *ncptr = &nextchar; 00473 00474 UTF8_NEXT_PARANOID(src, srclen, off, ncptr, error); 00475 } 00476 if (error != PARSERUTILS_OK) { 00477 if (error == PARSERUTILS_NEEDDATA) { 00478 /* Need more data to be sure */ 00479 assert(*sourcelen < INVAL_BUFSIZE); 00480 00481 memmove(c->inval_buf, *source, *sourcelen); 00482 c->inval_buf[*sourcelen] = '\0'; 00483 c->inval_len = *sourcelen; 00484 00485 *source += *sourcelen; 00486 *sourcelen = 0; 00487 00488 nextchar = 0; 00489 } else { 00490 return error; 00491 } 00492 } 00493 00494 /* Clear inval buffer */ 00495 c->inval_buf[0] = '\0'; 00496 c->inval_len = 0; 00497 00498 /* output U+FFFD and continue processing. */ 00499 error = charset_utf8_codec_output_decoded_char(c, 00500 0xFFFD, dest, destlen); 00501 if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) { 00502 /* output succeeded; update source pointers */ 00503 *source += nextchar; 00504 *sourcelen -= nextchar; 00505 } 00506 00507 return error; 00508 } 00509 00510 return PARSERUTILS_OK; 00511 } 00512 00523 parserutils_error charset_utf8_codec_output_decoded_char(charset_utf8_codec *c, 00524 uint32_t ucs4, uint8_t **dest, size_t *destlen) 00525 { 00526 if (*destlen < 4) { 00527 /* Run out of output buffer */ 00528 c->read_len = 1; 00529 c->read_buf[0] = ucs4; 00530 00531 return PARSERUTILS_NOMEM; 00532 } 00533 00534 *((uint32_t *) (void *) *dest) = endian_host_to_big(ucs4); 00535 *dest += 4; 00536 *destlen -= 4; 00537 00538 return PARSERUTILS_OK; 00539 } 00540 00541 00542 const parserutils_charset_handler charset_utf8_codec_handler = { 00543 charset_utf8_codec_handles_charset, 00544 charset_utf8_codec_create 00545 }; 00546
1.7.3