|
Libparserutils
|
00001 /* 00002 * This file is part of LibParserUtils. 00003 * Licensed under the MIT License, 00004 * http://www.opensource.org/licenses/mit-license.php 00005 * Copyright 2008 John-Mark Bell <jmb@netsurf-browser.org> 00006 */ 00007 00008 #include <assert.h> 00009 #include <stdlib.h> 00010 #include <string.h> 00011 00012 #include <parserutils/charset/mibenum.h> 00013 00014 #include "charset/codecs/codec_impl.h" 00015 #include "utils/endian.h" 00016 #include "utils/utils.h" 00017 00018 #include "charset/codecs/ext8_tables.h" 00019 00020 static struct { 00021 uint16_t mib; 00022 const char *name; 00023 size_t len; 00024 uint32_t *table; 00025 } known_charsets[] = { 00026 { 0, "Windows-1250", SLEN("Windows-1250"), w1250 }, 00027 { 0, "Windows-1251", SLEN("Windows-1251"), w1251 }, 00028 { 0, "Windows-1252", SLEN("Windows-1252"), w1252 }, 00029 { 0, "Windows-1253", SLEN("Windows-1253"), w1253 }, 00030 { 0, "Windows-1254", SLEN("Windows-1254"), w1254 }, 00031 { 0, "Windows-1255", SLEN("Windows-1255"), w1255 }, 00032 { 0, "Windows-1256", SLEN("Windows-1256"), w1256 }, 00033 { 0, "Windows-1257", SLEN("Windows-1257"), w1257 }, 00034 { 0, "Windows-1258", SLEN("Windows-1258"), w1258 }, 00035 }; 00036 00040 typedef struct charset_ext8_codec { 00041 parserutils_charset_codec base; 00043 uint32_t *table; 00045 #define READ_BUFSIZE (8) 00046 uint32_t read_buf[READ_BUFSIZE]; 00049 size_t read_len; 00051 #define WRITE_BUFSIZE (8) 00052 uint32_t write_buf[WRITE_BUFSIZE]; 00055 size_t write_len; 00057 } charset_ext8_codec; 00058 00059 static bool charset_ext8_codec_handles_charset(const char *charset); 00060 static parserutils_error charset_ext8_codec_create(const char *charset, 00061 parserutils_charset_codec **codec); 00062 static parserutils_error charset_ext8_codec_destroy( 00063 parserutils_charset_codec *codec); 00064 static parserutils_error charset_ext8_codec_encode( 00065 parserutils_charset_codec *codec, 00066 const uint8_t **source, size_t *sourcelen, 00067 uint8_t **dest, size_t *destlen); 00068 static parserutils_error charset_ext8_codec_decode( 00069 parserutils_charset_codec *codec, 00070 const uint8_t **source, size_t *sourcelen, 00071 uint8_t **dest, size_t *destlen); 00072 static parserutils_error charset_ext8_codec_reset( 00073 parserutils_charset_codec *codec); 00074 static inline parserutils_error charset_ext8_codec_read_char( 00075 charset_ext8_codec *c, 00076 const uint8_t **source, size_t *sourcelen, 00077 uint8_t **dest, size_t *destlen); 00078 static inline parserutils_error charset_ext8_codec_output_decoded_char( 00079 charset_ext8_codec *c, 00080 uint32_t ucs4, uint8_t **dest, size_t *destlen); 00081 static inline parserutils_error charset_ext8_from_ucs4(charset_ext8_codec *c, 00082 uint32_t ucs4, uint8_t **s, size_t *len); 00083 static inline parserutils_error charset_ext8_to_ucs4(charset_ext8_codec *c, 00084 const uint8_t *s, size_t len, uint32_t *ucs4); 00085 00092 bool charset_ext8_codec_handles_charset(const char *charset) 00093 { 00094 uint32_t i; 00095 uint16_t match = parserutils_charset_mibenum_from_name(charset, 00096 strlen(charset)); 00097 00098 if (known_charsets[0].mib == 0) { 00099 for (i = 0; i < N_ELEMENTS(known_charsets); i++) { 00100 known_charsets[i].mib = 00101 parserutils_charset_mibenum_from_name( 00102 known_charsets[i].name, 00103 known_charsets[i].len); 00104 } 00105 } 00106 00107 for (i = 0; i < N_ELEMENTS(known_charsets); i++) { 00108 if (known_charsets[i].mib == match) 00109 return true; 00110 } 00111 00112 return false; 00113 } 00114 00124 parserutils_error charset_ext8_codec_create(const char *charset, 00125 parserutils_charset_codec **codec) 00126 { 00127 uint32_t i; 00128 charset_ext8_codec *c; 00129 uint16_t match = parserutils_charset_mibenum_from_name( 00130 charset, strlen(charset)); 00131 uint32_t *table = NULL; 00132 00133 for (i = 0; i < N_ELEMENTS(known_charsets); i++) { 00134 if (known_charsets[i].mib == match) { 00135 table = known_charsets[i].table; 00136 break; 00137 } 00138 } 00139 00140 assert(table != NULL); 00141 00142 c = malloc(sizeof(charset_ext8_codec)); 00143 if (c == NULL) 00144 return PARSERUTILS_NOMEM; 00145 00146 c->table = table; 00147 00148 c->read_buf[0] = 0; 00149 c->read_len = 0; 00150 00151 c->write_buf[0] = 0; 00152 c->write_len = 0; 00153 00154 /* Finally, populate vtable */ 00155 c->base.handler.destroy = charset_ext8_codec_destroy; 00156 c->base.handler.encode = charset_ext8_codec_encode; 00157 c->base.handler.decode = charset_ext8_codec_decode; 00158 c->base.handler.reset = charset_ext8_codec_reset; 00159 00160 *codec = (parserutils_charset_codec *) c; 00161 00162 return PARSERUTILS_OK; 00163 } 00164 00171 parserutils_error charset_ext8_codec_destroy (parserutils_charset_codec *codec) 00172 { 00173 UNUSED(codec); 00174 00175 return PARSERUTILS_OK; 00176 } 00177 00205 parserutils_error charset_ext8_codec_encode(parserutils_charset_codec *codec, 00206 const uint8_t **source, size_t *sourcelen, 00207 uint8_t **dest, size_t *destlen) 00208 { 00209 charset_ext8_codec *c = (charset_ext8_codec *) codec; 00210 uint32_t ucs4; 00211 uint32_t *towrite; 00212 size_t towritelen; 00213 parserutils_error error; 00214 00215 /* Process any outstanding characters from the previous call */ 00216 if (c->write_len > 0) { 00217 uint32_t *pwrite = c->write_buf; 00218 00219 while (c->write_len > 0) { 00220 error = charset_ext8_from_ucs4(c, pwrite[0], 00221 dest, destlen); 00222 if (error != PARSERUTILS_OK) { 00223 uint32_t len; 00224 assert(error == PARSERUTILS_NOMEM); 00225 00226 for (len = 0; len < c->write_len; len++) { 00227 c->write_buf[len] = pwrite[len]; 00228 } 00229 00230 return error; 00231 } 00232 00233 pwrite++; 00234 c->write_len--; 00235 } 00236 } 00237 00238 /* Now process the characters for this call */ 00239 while (*sourcelen > 0) { 00240 ucs4 = endian_big_to_host(*((uint32_t *) (void *) *source)); 00241 towrite = &ucs4; 00242 towritelen = 1; 00243 00244 /* Output current characters */ 00245 while (towritelen > 0) { 00246 error = charset_ext8_from_ucs4(c, towrite[0], dest, 00247 destlen); 00248 if (error != PARSERUTILS_OK) { 00249 uint32_t len; 00250 if (error != PARSERUTILS_NOMEM) { 00251 return error; 00252 } 00253 00254 /* Insufficient output space */ 00255 assert(towritelen < WRITE_BUFSIZE); 00256 00257 c->write_len = towritelen; 00258 00259 /* Copy pending chars to save area, for 00260 * processing next call. */ 00261 for (len = 0; len < towritelen; len++) 00262 c->write_buf[len] = towrite[len]; 00263 00264 /* Claim character we've just buffered, 00265 * so it's not reprocessed */ 00266 *source += 4; 00267 *sourcelen -= 4; 00268 00269 return PARSERUTILS_NOMEM; 00270 } 00271 00272 towrite++; 00273 towritelen--; 00274 } 00275 00276 *source += 4; 00277 *sourcelen -= 4; 00278 } 00279 00280 return PARSERUTILS_OK; 00281 } 00282 00324 parserutils_error charset_ext8_codec_decode(parserutils_charset_codec *codec, 00325 const uint8_t **source, size_t *sourcelen, 00326 uint8_t **dest, size_t *destlen) 00327 { 00328 charset_ext8_codec *c = (charset_ext8_codec *) codec; 00329 parserutils_error error; 00330 00331 if (c->read_len > 0) { 00332 /* Output left over from last decode */ 00333 uint32_t *pread = c->read_buf; 00334 00335 while (c->read_len > 0 && *destlen >= c->read_len * 4) { 00336 *((uint32_t *) (void *) *dest) = 00337 endian_host_to_big(pread[0]); 00338 00339 *dest += 4; 00340 *destlen -= 4; 00341 00342 pread++; 00343 c->read_len--; 00344 } 00345 00346 if (*destlen < c->read_len * 4) { 00347 /* Ran out of output buffer */ 00348 size_t i; 00349 00350 /* Shuffle remaining output down */ 00351 for (i = 0; i < c->read_len; i++) 00352 c->read_buf[i] = pread[i]; 00353 00354 return PARSERUTILS_NOMEM; 00355 } 00356 } 00357 00358 /* Finally, the "normal" case; process all outstanding characters */ 00359 while (*sourcelen > 0) { 00360 error = charset_ext8_codec_read_char(c, 00361 source, sourcelen, dest, destlen); 00362 if (error != PARSERUTILS_OK) { 00363 return error; 00364 } 00365 } 00366 00367 return PARSERUTILS_OK; 00368 } 00369 00376 parserutils_error charset_ext8_codec_reset(parserutils_charset_codec *codec) 00377 { 00378 charset_ext8_codec *c = (charset_ext8_codec *) codec; 00379 00380 c->read_buf[0] = 0; 00381 c->read_len = 0; 00382 00383 c->write_buf[0] = 0; 00384 c->write_len = 0; 00385 00386 return PARSERUTILS_OK; 00387 } 00388 00389 00418 parserutils_error charset_ext8_codec_read_char(charset_ext8_codec *c, 00419 const uint8_t **source, size_t *sourcelen, 00420 uint8_t **dest, size_t *destlen) 00421 { 00422 uint32_t ucs4; 00423 parserutils_error error; 00424 00425 /* Convert a single character */ 00426 error = charset_ext8_to_ucs4(c, *source, *sourcelen, &ucs4); 00427 if (error == PARSERUTILS_OK) { 00428 /* Read a character */ 00429 error = charset_ext8_codec_output_decoded_char(c, 00430 ucs4, dest, destlen); 00431 if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) { 00432 /* output succeeded; update source pointers */ 00433 *source += 1; 00434 *sourcelen -= 1; 00435 } 00436 00437 return error; 00438 } else if (error == PARSERUTILS_NEEDDATA) { 00439 /* Can only happen if sourcelen == 0 */ 00440 return error; 00441 } else if (error == PARSERUTILS_INVALID) { 00442 /* Illegal input sequence */ 00443 00444 /* Strict errormode; simply flag invalid character */ 00445 if (c->base.errormode == 00446 PARSERUTILS_CHARSET_CODEC_ERROR_STRICT) { 00447 return PARSERUTILS_INVALID; 00448 } 00449 00450 /* output U+FFFD and continue processing. */ 00451 error = charset_ext8_codec_output_decoded_char(c, 00452 0xFFFD, dest, destlen); 00453 if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) { 00454 /* output succeeded; update source pointers */ 00455 *source += 1; 00456 *sourcelen -= 1; 00457 } 00458 00459 return error; 00460 } 00461 00462 return PARSERUTILS_OK; 00463 } 00464 00475 parserutils_error charset_ext8_codec_output_decoded_char(charset_ext8_codec *c, 00476 uint32_t ucs4, uint8_t **dest, size_t *destlen) 00477 { 00478 if (*destlen < 4) { 00479 /* Run out of output buffer */ 00480 c->read_len = 1; 00481 c->read_buf[0] = ucs4; 00482 00483 return PARSERUTILS_NOMEM; 00484 } 00485 00486 *((uint32_t *) (void *) *dest) = endian_host_to_big(ucs4); 00487 *dest += 4; 00488 *destlen -= 4; 00489 00490 return PARSERUTILS_OK; 00491 } 00492 00509 parserutils_error charset_ext8_from_ucs4(charset_ext8_codec *c, 00510 uint32_t ucs4, uint8_t **s, size_t *len) 00511 { 00512 uint8_t out = 0; 00513 00514 if (*len < 1) 00515 return PARSERUTILS_NOMEM; 00516 00517 if (ucs4 < 0x80) { 00518 /* ASCII */ 00519 out = ucs4; 00520 } else { 00521 uint32_t i; 00522 00523 for (i = 0; i < 128; i++) { 00524 if (ucs4 == c->table[i]) 00525 break; 00526 } 00527 00528 if (i == 128) { 00529 if (c->base.errormode == 00530 PARSERUTILS_CHARSET_CODEC_ERROR_STRICT) 00531 return PARSERUTILS_INVALID; 00532 else 00533 out = '?'; 00534 } else { 00535 out = 0x80 + i; 00536 } 00537 } 00538 00539 *(*s) = out; 00540 (*s)++; 00541 (*len)--; 00542 00543 return PARSERUTILS_OK; 00544 } 00545 00557 parserutils_error charset_ext8_to_ucs4(charset_ext8_codec *c, 00558 const uint8_t *s, size_t len, uint32_t *ucs4) 00559 { 00560 uint32_t out; 00561 00562 if (len < 1) 00563 return PARSERUTILS_NEEDDATA; 00564 00565 if (*s < 0x80) { 00566 out = *s; 00567 } else { 00568 if (c->table[*s - 0x80] == 0xFFFF) 00569 return PARSERUTILS_INVALID; 00570 00571 out = c->table[*s - 0x80]; 00572 } 00573 00574 *ucs4 = out; 00575 00576 return PARSERUTILS_OK; 00577 } 00578 00579 const parserutils_charset_handler charset_ext8_codec_handler = { 00580 charset_ext8_codec_handles_charset, 00581 charset_ext8_codec_create 00582 }; 00583
1.7.3