|
Libparserutils
|
00001 /* 00002 * This file is part of LibParserUtils. 00003 * Licensed under the MIT License, 00004 * http://www.opensource.org/licenses/mit-license.php 00005 * Copyright 2008 John-Mark Bell <jmb@netsurf-browser.org> 00006 */ 00007 00008 #include <assert.h> 00009 #include <stdlib.h> 00010 #include <string.h> 00011 00012 #include <parserutils/charset/mibenum.h> 00013 00014 #include "charset/codecs/codec_impl.h" 00015 #include "utils/endian.h" 00016 #include "utils/utils.h" 00017 00021 typedef struct charset_ascii_codec { 00022 parserutils_charset_codec base; 00024 #define READ_BUFSIZE (8) 00025 uint32_t read_buf[READ_BUFSIZE]; 00028 size_t read_len; 00030 #define WRITE_BUFSIZE (8) 00031 uint32_t write_buf[WRITE_BUFSIZE]; 00034 size_t write_len; 00036 } charset_ascii_codec; 00037 00038 static bool charset_ascii_codec_handles_charset(const char *charset); 00039 static parserutils_error charset_ascii_codec_create( 00040 const char *charset, parserutils_charset_codec **codec); 00041 static parserutils_error charset_ascii_codec_destroy( 00042 parserutils_charset_codec *codec); 00043 static parserutils_error charset_ascii_codec_encode( 00044 parserutils_charset_codec *codec, 00045 const uint8_t **source, size_t *sourcelen, 00046 uint8_t **dest, size_t *destlen); 00047 static parserutils_error charset_ascii_codec_decode( 00048 parserutils_charset_codec *codec, 00049 const uint8_t **source, size_t *sourcelen, 00050 uint8_t **dest, size_t *destlen); 00051 static parserutils_error charset_ascii_codec_reset( 00052 parserutils_charset_codec *codec); 00053 static inline parserutils_error charset_ascii_codec_read_char( 00054 charset_ascii_codec *c, 00055 const uint8_t **source, size_t *sourcelen, 00056 uint8_t **dest, size_t *destlen); 00057 static inline parserutils_error charset_ascii_codec_output_decoded_char( 00058 charset_ascii_codec *c, 00059 uint32_t ucs4, uint8_t **dest, size_t *destlen); 00060 static inline parserutils_error charset_ascii_from_ucs4(charset_ascii_codec *c, 00061 uint32_t ucs4, uint8_t **s, size_t *len); 00062 static inline parserutils_error charset_ascii_to_ucs4(charset_ascii_codec *c, 00063 const uint8_t *s, size_t len, uint32_t *ucs4); 00064 00071 bool charset_ascii_codec_handles_charset(const char *charset) 00072 { 00073 static uint16_t ascii; 00074 uint16_t match = parserutils_charset_mibenum_from_name(charset, 00075 strlen(charset)); 00076 00077 if (ascii == 0) { 00078 ascii = parserutils_charset_mibenum_from_name( 00079 "US-ASCII", SLEN("US-ASCII")); 00080 } 00081 00082 if (ascii != 0 && ascii == match) 00083 return true; 00084 00085 return false; 00086 } 00087 00097 parserutils_error charset_ascii_codec_create(const char *charset, 00098 parserutils_charset_codec **codec) 00099 { 00100 charset_ascii_codec *c; 00101 00102 UNUSED(charset); 00103 00104 c = malloc(sizeof(charset_ascii_codec)); 00105 if (c == NULL) 00106 return PARSERUTILS_NOMEM; 00107 00108 c->read_buf[0] = 0; 00109 c->read_len = 0; 00110 00111 c->write_buf[0] = 0; 00112 c->write_len = 0; 00113 00114 /* Finally, populate vtable */ 00115 c->base.handler.destroy = charset_ascii_codec_destroy; 00116 c->base.handler.encode = charset_ascii_codec_encode; 00117 c->base.handler.decode = charset_ascii_codec_decode; 00118 c->base.handler.reset = charset_ascii_codec_reset; 00119 00120 *codec = (parserutils_charset_codec *) c; 00121 00122 return PARSERUTILS_OK; 00123 } 00124 00131 parserutils_error charset_ascii_codec_destroy (parserutils_charset_codec *codec) 00132 { 00133 UNUSED(codec); 00134 00135 return PARSERUTILS_OK; 00136 } 00137 00165 parserutils_error charset_ascii_codec_encode(parserutils_charset_codec *codec, 00166 const uint8_t **source, size_t *sourcelen, 00167 uint8_t **dest, size_t *destlen) 00168 { 00169 charset_ascii_codec *c = (charset_ascii_codec *) codec; 00170 uint32_t ucs4; 00171 uint32_t *towrite; 00172 size_t towritelen; 00173 parserutils_error error; 00174 00175 /* Process any outstanding characters from the previous call */ 00176 if (c->write_len > 0) { 00177 uint32_t *pwrite = c->write_buf; 00178 00179 while (c->write_len > 0) { 00180 error = charset_ascii_from_ucs4(c, pwrite[0], 00181 dest, destlen); 00182 if (error != PARSERUTILS_OK) { 00183 uint32_t len; 00184 assert(error == PARSERUTILS_NOMEM); 00185 00186 for (len = 0; len < c->write_len; len++) { 00187 c->write_buf[len] = pwrite[len]; 00188 } 00189 00190 return error; 00191 } 00192 00193 pwrite++; 00194 c->write_len--; 00195 } 00196 } 00197 00198 /* Now process the characters for this call */ 00199 while (*sourcelen > 0) { 00200 ucs4 = endian_big_to_host(*((uint32_t *) (void *) *source)); 00201 towrite = &ucs4; 00202 towritelen = 1; 00203 00204 /* Output current characters */ 00205 while (towritelen > 0) { 00206 error = charset_ascii_from_ucs4(c, towrite[0], dest, 00207 destlen); 00208 if (error != PARSERUTILS_OK) { 00209 uint32_t len; 00210 if (error != PARSERUTILS_NOMEM) { 00211 return error; 00212 } 00213 00214 /* Insufficient output space */ 00215 assert(towritelen < WRITE_BUFSIZE); 00216 00217 c->write_len = towritelen; 00218 00219 /* Copy pending chars to save area, for 00220 * processing next call. */ 00221 for (len = 0; len < towritelen; len++) 00222 c->write_buf[len] = towrite[len]; 00223 00224 /* Claim character we've just buffered, 00225 * so it's not reprocessed */ 00226 *source += 4; 00227 *sourcelen -= 4; 00228 00229 return PARSERUTILS_NOMEM; 00230 } 00231 00232 towrite++; 00233 towritelen--; 00234 } 00235 00236 *source += 4; 00237 *sourcelen -= 4; 00238 } 00239 00240 return PARSERUTILS_OK; 00241 } 00242 00284 parserutils_error charset_ascii_codec_decode(parserutils_charset_codec *codec, 00285 const uint8_t **source, size_t *sourcelen, 00286 uint8_t **dest, size_t *destlen) 00287 { 00288 charset_ascii_codec *c = (charset_ascii_codec *) codec; 00289 parserutils_error error; 00290 00291 if (c->read_len > 0) { 00292 /* Output left over from last decode */ 00293 uint32_t *pread = c->read_buf; 00294 00295 while (c->read_len > 0 && *destlen >= c->read_len * 4) { 00296 *((uint32_t *) (void *) *dest) = 00297 endian_host_to_big(pread[0]); 00298 00299 *dest += 4; 00300 *destlen -= 4; 00301 00302 pread++; 00303 c->read_len--; 00304 } 00305 00306 if (*destlen < c->read_len * 4) { 00307 /* Ran out of output buffer */ 00308 size_t i; 00309 00310 /* Shuffle remaining output down */ 00311 for (i = 0; i < c->read_len; i++) 00312 c->read_buf[i] = pread[i]; 00313 00314 return PARSERUTILS_NOMEM; 00315 } 00316 } 00317 00318 /* Finally, the "normal" case; process all outstanding characters */ 00319 while (*sourcelen > 0) { 00320 error = charset_ascii_codec_read_char(c, 00321 source, sourcelen, dest, destlen); 00322 if (error != PARSERUTILS_OK) { 00323 return error; 00324 } 00325 } 00326 00327 return PARSERUTILS_OK; 00328 } 00329 00336 parserutils_error charset_ascii_codec_reset(parserutils_charset_codec *codec) 00337 { 00338 charset_ascii_codec *c = (charset_ascii_codec *) codec; 00339 00340 c->read_buf[0] = 0; 00341 c->read_len = 0; 00342 00343 c->write_buf[0] = 0; 00344 c->write_len = 0; 00345 00346 return PARSERUTILS_OK; 00347 } 00348 00349 00378 parserutils_error charset_ascii_codec_read_char(charset_ascii_codec *c, 00379 const uint8_t **source, size_t *sourcelen, 00380 uint8_t **dest, size_t *destlen) 00381 { 00382 uint32_t ucs4; 00383 parserutils_error error; 00384 00385 /* Convert a single character */ 00386 error = charset_ascii_to_ucs4(c, *source, *sourcelen, &ucs4); 00387 if (error == PARSERUTILS_OK) { 00388 /* Read a character */ 00389 error = charset_ascii_codec_output_decoded_char(c, 00390 ucs4, dest, destlen); 00391 if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) { 00392 /* output succeeded; update source pointers */ 00393 *source += 1; 00394 *sourcelen -= 1; 00395 } 00396 00397 return error; 00398 } else if (error == PARSERUTILS_NEEDDATA) { 00399 /* Can only happen if sourcelen == 0 */ 00400 return error; 00401 } else if (error == PARSERUTILS_INVALID) { 00402 /* Illegal input sequence */ 00403 00404 /* Strict errormode; simply flag invalid character */ 00405 if (c->base.errormode == 00406 PARSERUTILS_CHARSET_CODEC_ERROR_STRICT) { 00407 return PARSERUTILS_INVALID; 00408 } 00409 00410 /* output U+FFFD and continue processing. */ 00411 error = charset_ascii_codec_output_decoded_char(c, 00412 0xFFFD, dest, destlen); 00413 if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) { 00414 /* output succeeded; update source pointers */ 00415 *source += 1; 00416 *sourcelen -= 1; 00417 } 00418 00419 return error; 00420 } 00421 00422 return PARSERUTILS_OK; 00423 } 00424 00435 parserutils_error charset_ascii_codec_output_decoded_char( 00436 charset_ascii_codec *c, 00437 uint32_t ucs4, uint8_t **dest, size_t *destlen) 00438 { 00439 if (*destlen < 4) { 00440 /* Run out of output buffer */ 00441 c->read_len = 1; 00442 c->read_buf[0] = ucs4; 00443 00444 return PARSERUTILS_NOMEM; 00445 } 00446 00447 *((uint32_t *) (void *) *dest) = endian_host_to_big(ucs4); 00448 *dest += 4; 00449 *destlen -= 4; 00450 00451 return PARSERUTILS_OK; 00452 } 00453 00470 parserutils_error charset_ascii_from_ucs4(charset_ascii_codec *c, 00471 uint32_t ucs4, uint8_t **s, size_t *len) 00472 { 00473 uint8_t out = 0; 00474 00475 if (*len < 1) 00476 return PARSERUTILS_NOMEM; 00477 00478 if (ucs4 < 0x80) { 00479 /* ASCII */ 00480 out = ucs4; 00481 } else { 00482 if (c->base.errormode == PARSERUTILS_CHARSET_CODEC_ERROR_STRICT) 00483 return PARSERUTILS_INVALID; 00484 else 00485 out = '?'; 00486 } 00487 00488 *(*s) = out; 00489 (*s)++; 00490 (*len)--; 00491 00492 return PARSERUTILS_OK; 00493 } 00494 00506 parserutils_error charset_ascii_to_ucs4(charset_ascii_codec *c, 00507 const uint8_t *s, size_t len, uint32_t *ucs4) 00508 { 00509 uint32_t out; 00510 00511 UNUSED(c); 00512 00513 if (len < 1) 00514 return PARSERUTILS_NEEDDATA; 00515 00516 if (*s < 0x80) { 00517 out = *s; 00518 } else { 00519 return PARSERUTILS_INVALID; 00520 } 00521 00522 *ucs4 = out; 00523 00524 return PARSERUTILS_OK; 00525 } 00526 00527 const parserutils_charset_handler charset_ascii_codec_handler = { 00528 charset_ascii_codec_handles_charset, 00529 charset_ascii_codec_create 00530 }; 00531
1.7.3