|
Libparserutils
|
00001 /* 00002 * This file is part of LibParserUtils. 00003 * Licensed under the MIT License, 00004 * http://www.opensource.org/licenses/mit-license.php 00005 * Copyright 2008 John-Mark Bell <jmb@netsurf-browser.org> 00006 */ 00007 00008 #include <assert.h> 00009 #include <stdlib.h> 00010 #include <string.h> 00011 00012 #include <parserutils/charset/mibenum.h> 00013 00014 #include "charset/codecs/codec_impl.h" 00015 #include "utils/endian.h" 00016 #include "utils/utils.h" 00017 00018 #include "charset/codecs/8859_tables.h" 00019 00020 static struct { 00021 uint16_t mib; 00022 const char *name; 00023 size_t len; 00024 uint32_t *table; 00025 } known_charsets[] = { 00026 { 0, "ISO-8859-1", SLEN("ISO-8859-1"), t1 }, 00027 { 0, "ISO-8859-2", SLEN("ISO-8859-2"), t2 }, 00028 { 0, "ISO-8859-3", SLEN("ISO-8859-3"), t3 }, 00029 { 0, "ISO-8859-4", SLEN("ISO-8859-4"), t4 }, 00030 { 0, "ISO-8859-5", SLEN("ISO-8859-5"), t5 }, 00031 { 0, "ISO-8859-6", SLEN("ISO-8859-6"), t6 }, 00032 { 0, "ISO-8859-7", SLEN("ISO-8859-7"), t7 }, 00033 { 0, "ISO-8859-8", SLEN("ISO-8859-8"), t8 }, 00034 { 0, "ISO-8859-9", SLEN("ISO-8859-9"), t9 }, 00035 { 0, "ISO-8859-10", SLEN("ISO-8859-10"), t10 }, 00036 { 0, "ISO-8859-11", SLEN("ISO-8859-11"), t11 }, 00037 { 0, "ISO-8859-13", SLEN("ISO-8859-13"), t13 }, 00038 { 0, "ISO-8859-14", SLEN("ISO-8859-14"), t14 }, 00039 { 0, "ISO-8859-15", SLEN("ISO-8859-15"), t15 }, 00040 { 0, "ISO-8859-16", SLEN("ISO-8859-16"), t16 } 00041 }; 00042 00046 typedef struct charset_8859_codec { 00047 parserutils_charset_codec base; 00049 uint32_t *table; 00051 #define READ_BUFSIZE (8) 00052 uint32_t read_buf[READ_BUFSIZE]; 00055 size_t read_len; 00057 #define WRITE_BUFSIZE (8) 00058 uint32_t write_buf[WRITE_BUFSIZE]; 00061 size_t write_len; 00063 } charset_8859_codec; 00064 00065 static bool charset_8859_codec_handles_charset(const char *charset); 00066 static parserutils_error charset_8859_codec_create(const char *charset, 00067 parserutils_charset_codec **codec); 00068 static parserutils_error charset_8859_codec_destroy( 00069 parserutils_charset_codec *codec); 00070 static parserutils_error charset_8859_codec_encode( 00071 parserutils_charset_codec *codec, 00072 const uint8_t **source, size_t *sourcelen, 00073 uint8_t **dest, size_t *destlen); 00074 static parserutils_error charset_8859_codec_decode( 00075 parserutils_charset_codec *codec, 00076 const uint8_t **source, size_t *sourcelen, 00077 uint8_t **dest, size_t *destlen); 00078 static parserutils_error charset_8859_codec_reset( 00079 parserutils_charset_codec *codec); 00080 static inline parserutils_error charset_8859_codec_read_char( 00081 charset_8859_codec *c, 00082 const uint8_t **source, size_t *sourcelen, 00083 uint8_t **dest, size_t *destlen); 00084 static inline parserutils_error charset_8859_codec_output_decoded_char( 00085 charset_8859_codec *c, 00086 uint32_t ucs4, uint8_t **dest, size_t *destlen); 00087 static inline parserutils_error charset_8859_from_ucs4(charset_8859_codec *c, 00088 uint32_t ucs4, uint8_t **s, size_t *len); 00089 static inline parserutils_error charset_8859_to_ucs4(charset_8859_codec *c, 00090 const uint8_t *s, size_t len, uint32_t *ucs4); 00091 00098 bool charset_8859_codec_handles_charset(const char *charset) 00099 { 00100 uint32_t i; 00101 uint16_t match = parserutils_charset_mibenum_from_name(charset, 00102 strlen(charset)); 00103 00104 if (known_charsets[0].mib == 0) { 00105 for (i = 0; i < N_ELEMENTS(known_charsets); i++) { 00106 known_charsets[i].mib = 00107 parserutils_charset_mibenum_from_name( 00108 known_charsets[i].name, 00109 known_charsets[i].len); 00110 } 00111 } 00112 00113 for (i = 0; i < N_ELEMENTS(known_charsets); i++) { 00114 if (known_charsets[i].mib == match) 00115 return true; 00116 } 00117 00118 return false; 00119 } 00120 00130 parserutils_error charset_8859_codec_create(const char *charset, 00131 parserutils_charset_codec **codec) 00132 { 00133 uint32_t i; 00134 charset_8859_codec *c; 00135 uint16_t match = parserutils_charset_mibenum_from_name( 00136 charset, strlen(charset)); 00137 uint32_t *table = NULL; 00138 00139 for (i = 0; i < N_ELEMENTS(known_charsets); i++) { 00140 if (known_charsets[i].mib == match) { 00141 table = known_charsets[i].table; 00142 break; 00143 } 00144 } 00145 00146 assert(table != NULL); 00147 00148 c = malloc(sizeof(charset_8859_codec)); 00149 if (c == NULL) 00150 return PARSERUTILS_NOMEM; 00151 00152 c->table = table; 00153 00154 c->read_buf[0] = 0; 00155 c->read_len = 0; 00156 00157 c->write_buf[0] = 0; 00158 c->write_len = 0; 00159 00160 /* Finally, populate vtable */ 00161 c->base.handler.destroy = charset_8859_codec_destroy; 00162 c->base.handler.encode = charset_8859_codec_encode; 00163 c->base.handler.decode = charset_8859_codec_decode; 00164 c->base.handler.reset = charset_8859_codec_reset; 00165 00166 *codec = (parserutils_charset_codec *) c; 00167 00168 return PARSERUTILS_OK; 00169 } 00170 00177 parserutils_error charset_8859_codec_destroy (parserutils_charset_codec *codec) 00178 { 00179 UNUSED(codec); 00180 00181 return PARSERUTILS_OK; 00182 } 00183 00211 parserutils_error charset_8859_codec_encode(parserutils_charset_codec *codec, 00212 const uint8_t **source, size_t *sourcelen, 00213 uint8_t **dest, size_t *destlen) 00214 { 00215 charset_8859_codec *c = (charset_8859_codec *) codec; 00216 uint32_t ucs4; 00217 uint32_t *towrite; 00218 size_t towritelen; 00219 parserutils_error error; 00220 00221 /* Process any outstanding characters from the previous call */ 00222 if (c->write_len > 0) { 00223 uint32_t *pwrite = c->write_buf; 00224 00225 while (c->write_len > 0) { 00226 error = charset_8859_from_ucs4(c, pwrite[0], 00227 dest, destlen); 00228 if (error != PARSERUTILS_OK) { 00229 uint32_t len; 00230 assert(error == PARSERUTILS_NOMEM); 00231 00232 for (len = 0; len < c->write_len; len++) { 00233 c->write_buf[len] = pwrite[len]; 00234 } 00235 00236 return error; 00237 } 00238 00239 pwrite++; 00240 c->write_len--; 00241 } 00242 } 00243 00244 /* Now process the characters for this call */ 00245 while (*sourcelen > 0) { 00246 ucs4 = endian_big_to_host(*((uint32_t *) (void *) *source)); 00247 towrite = &ucs4; 00248 towritelen = 1; 00249 00250 /* Output current characters */ 00251 while (towritelen > 0) { 00252 error = charset_8859_from_ucs4(c, towrite[0], dest, 00253 destlen); 00254 if (error != PARSERUTILS_OK) { 00255 uint32_t len; 00256 if (error != PARSERUTILS_NOMEM) { 00257 return error; 00258 } 00259 00260 /* Insufficient output space */ 00261 assert(towritelen < WRITE_BUFSIZE); 00262 00263 c->write_len = towritelen; 00264 00265 /* Copy pending chars to save area, for 00266 * processing next call. */ 00267 for (len = 0; len < towritelen; len++) 00268 c->write_buf[len] = towrite[len]; 00269 00270 /* Claim character we've just buffered, 00271 * so it's not reprocessed */ 00272 *source += 4; 00273 *sourcelen -= 4; 00274 00275 return PARSERUTILS_NOMEM; 00276 } 00277 00278 towrite++; 00279 towritelen--; 00280 } 00281 00282 *source += 4; 00283 *sourcelen -= 4; 00284 } 00285 00286 return PARSERUTILS_OK; 00287 } 00288 00330 parserutils_error charset_8859_codec_decode(parserutils_charset_codec *codec, 00331 const uint8_t **source, size_t *sourcelen, 00332 uint8_t **dest, size_t *destlen) 00333 { 00334 charset_8859_codec *c = (charset_8859_codec *) codec; 00335 parserutils_error error; 00336 00337 if (c->read_len > 0) { 00338 /* Output left over from last decode */ 00339 uint32_t *pread = c->read_buf; 00340 00341 while (c->read_len > 0 && *destlen >= c->read_len * 4) { 00342 *((uint32_t *) (void *) *dest) = 00343 endian_host_to_big(pread[0]); 00344 00345 *dest += 4; 00346 *destlen -= 4; 00347 00348 pread++; 00349 c->read_len--; 00350 } 00351 00352 if (*destlen < c->read_len * 4) { 00353 /* Ran out of output buffer */ 00354 size_t i; 00355 00356 /* Shuffle remaining output down */ 00357 for (i = 0; i < c->read_len; i++) 00358 c->read_buf[i] = pread[i]; 00359 00360 return PARSERUTILS_NOMEM; 00361 } 00362 } 00363 00364 /* Finally, the "normal" case; process all outstanding characters */ 00365 while (*sourcelen > 0) { 00366 error = charset_8859_codec_read_char(c, 00367 source, sourcelen, dest, destlen); 00368 if (error != PARSERUTILS_OK) { 00369 return error; 00370 } 00371 } 00372 00373 return PARSERUTILS_OK; 00374 } 00375 00382 parserutils_error charset_8859_codec_reset(parserutils_charset_codec *codec) 00383 { 00384 charset_8859_codec *c = (charset_8859_codec *) codec; 00385 00386 c->read_buf[0] = 0; 00387 c->read_len = 0; 00388 00389 c->write_buf[0] = 0; 00390 c->write_len = 0; 00391 00392 return PARSERUTILS_OK; 00393 } 00394 00395 00424 parserutils_error charset_8859_codec_read_char(charset_8859_codec *c, 00425 const uint8_t **source, size_t *sourcelen, 00426 uint8_t **dest, size_t *destlen) 00427 { 00428 uint32_t ucs4; 00429 parserutils_error error; 00430 00431 /* Convert a single character */ 00432 error = charset_8859_to_ucs4(c, *source, *sourcelen, &ucs4); 00433 if (error == PARSERUTILS_OK) { 00434 /* Read a character */ 00435 error = charset_8859_codec_output_decoded_char(c, 00436 ucs4, dest, destlen); 00437 if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) { 00438 /* output succeeded; update source pointers */ 00439 *source += 1; 00440 *sourcelen -= 1; 00441 } 00442 00443 return error; 00444 } else if (error == PARSERUTILS_NEEDDATA) { 00445 /* Can only happen if sourcelen == 0 */ 00446 return error; 00447 } else if (error == PARSERUTILS_INVALID) { 00448 /* Illegal input sequence */ 00449 00450 /* Strict errormode; simply flag invalid character */ 00451 if (c->base.errormode == 00452 PARSERUTILS_CHARSET_CODEC_ERROR_STRICT) { 00453 return PARSERUTILS_INVALID; 00454 } 00455 00456 /* output U+FFFD and continue processing. */ 00457 error = charset_8859_codec_output_decoded_char(c, 00458 0xFFFD, dest, destlen); 00459 if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) { 00460 /* output succeeded; update source pointers */ 00461 *source += 1; 00462 *sourcelen -= 1; 00463 } 00464 00465 return error; 00466 } 00467 00468 return PARSERUTILS_OK; 00469 } 00470 00481 parserutils_error charset_8859_codec_output_decoded_char(charset_8859_codec *c, 00482 uint32_t ucs4, uint8_t **dest, size_t *destlen) 00483 { 00484 if (*destlen < 4) { 00485 /* Run out of output buffer */ 00486 c->read_len = 1; 00487 c->read_buf[0] = ucs4; 00488 00489 return PARSERUTILS_NOMEM; 00490 } 00491 00492 *((uint32_t *) (void *) *dest) = endian_host_to_big(ucs4); 00493 *dest += 4; 00494 *destlen -= 4; 00495 00496 return PARSERUTILS_OK; 00497 } 00498 00515 parserutils_error charset_8859_from_ucs4(charset_8859_codec *c, 00516 uint32_t ucs4, uint8_t **s, size_t *len) 00517 { 00518 uint8_t out = 0; 00519 00520 if (*len < 1) 00521 return PARSERUTILS_NOMEM; 00522 00523 if (ucs4 < 0x80) { 00524 /* ASCII */ 00525 out = ucs4; 00526 } else { 00527 uint32_t i; 00528 00529 for (i = 0; i < 96; i++) { 00530 if (ucs4 == c->table[i]) 00531 break; 00532 } 00533 00534 if (i == 96) { 00535 if (c->base.errormode == 00536 PARSERUTILS_CHARSET_CODEC_ERROR_STRICT) 00537 return PARSERUTILS_INVALID; 00538 else 00539 out = '?'; 00540 } else { 00541 out = 0xA0 + i; 00542 } 00543 } 00544 00545 *(*s) = out; 00546 (*s)++; 00547 (*len)--; 00548 00549 return PARSERUTILS_OK; 00550 } 00551 00563 parserutils_error charset_8859_to_ucs4(charset_8859_codec *c, 00564 const uint8_t *s, size_t len, uint32_t *ucs4) 00565 { 00566 uint32_t out; 00567 00568 if (len < 1) 00569 return PARSERUTILS_NEEDDATA; 00570 00571 if (*s < 0x80) { 00572 out = *s; 00573 } else if (*s >= 0xA0) { 00574 if (c->table[*s - 0xA0] == 0xFFFF) 00575 return PARSERUTILS_INVALID; 00576 00577 out = c->table[*s - 0xA0]; 00578 } else { 00579 return PARSERUTILS_INVALID; 00580 } 00581 00582 *ucs4 = out; 00583 00584 return PARSERUTILS_OK; 00585 } 00586 00587 const parserutils_charset_handler charset_8859_codec_handler = { 00588 charset_8859_codec_handles_charset, 00589 charset_8859_codec_create 00590 }; 00591
1.7.3