|
Libparserutils
|
00001 /* 00002 * This file is part of LibParserUtils. 00003 * Licensed under the MIT License, 00004 * http://www.opensource.org/licenses/mit-license.php 00005 * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org> 00006 */ 00007 00008 #include <assert.h> 00009 #include <stdlib.h> 00010 #include <string.h> 00011 00012 #include <parserutils/charset/mibenum.h> 00013 #include <parserutils/charset/utf8.h> 00014 #include <parserutils/input/inputstream.h> 00015 00016 #include "input/filter.h" 00017 #include "utils/utils.h" 00018 00022 typedef struct parserutils_inputstream_private { 00023 parserutils_inputstream public; 00025 parserutils_buffer *raw; 00027 bool done_first_chunk; 00030 uint16_t mibenum; 00031 uint32_t encsrc; 00033 parserutils_filter *input; 00035 parserutils_charset_detect_func csdetect; 00036 } parserutils_inputstream_private; 00037 00038 static inline parserutils_error parserutils_inputstream_refill_buffer( 00039 parserutils_inputstream_private *stream); 00040 static inline parserutils_error parserutils_inputstream_strip_bom( 00041 uint16_t *mibenum, parserutils_buffer *buffer); 00042 00059 parserutils_error parserutils_inputstream_create(const char *enc, 00060 uint32_t encsrc, parserutils_charset_detect_func csdetect, 00061 parserutils_inputstream **stream) 00062 { 00063 parserutils_inputstream_private *s; 00064 parserutils_error error; 00065 00066 if (stream == NULL) 00067 return PARSERUTILS_BADPARM; 00068 00069 s = malloc(sizeof(parserutils_inputstream_private)); 00070 if (s == NULL) 00071 return PARSERUTILS_NOMEM; 00072 00073 error = parserutils_buffer_create(&s->raw); 00074 if (error != PARSERUTILS_OK) { 00075 free(s); 00076 return error; 00077 } 00078 00079 error = parserutils_buffer_create(&s->public.utf8); 00080 if (error != PARSERUTILS_OK) { 00081 parserutils_buffer_destroy(s->raw); 00082 free(s); 00083 return error; 00084 } 00085 00086 s->public.cursor = 0; 00087 s->public.had_eof = false; 00088 s->done_first_chunk = false; 00089 00090 error = parserutils__filter_create("UTF-8", &s->input); 00091 if (error != PARSERUTILS_OK) { 00092 parserutils_buffer_destroy(s->public.utf8); 00093 parserutils_buffer_destroy(s->raw); 00094 free(s); 00095 return error; 00096 } 00097 00098 if (enc != NULL) { 00099 parserutils_filter_optparams params; 00100 00101 s->mibenum = 00102 parserutils_charset_mibenum_from_name(enc, strlen(enc)); 00103 00104 if (s->mibenum == 0) { 00105 parserutils__filter_destroy(s->input); 00106 parserutils_buffer_destroy(s->public.utf8); 00107 parserutils_buffer_destroy(s->raw); 00108 free(s); 00109 return PARSERUTILS_BADENCODING; 00110 } 00111 00112 params.encoding.name = enc; 00113 00114 error = parserutils__filter_setopt(s->input, 00115 PARSERUTILS_FILTER_SET_ENCODING, 00116 ¶ms); 00117 if (error != PARSERUTILS_OK) { 00118 parserutils__filter_destroy(s->input); 00119 parserutils_buffer_destroy(s->public.utf8); 00120 parserutils_buffer_destroy(s->raw); 00121 free(s); 00122 return error; 00123 } 00124 00125 s->encsrc = encsrc; 00126 } else { 00127 s->mibenum = 0; 00128 s->encsrc = 0; 00129 } 00130 00131 s->csdetect = csdetect; 00132 00133 *stream = (parserutils_inputstream *) s; 00134 00135 return PARSERUTILS_OK; 00136 } 00137 00144 parserutils_error parserutils_inputstream_destroy( 00145 parserutils_inputstream *stream) 00146 { 00147 parserutils_inputstream_private *s = 00148 (parserutils_inputstream_private *) stream; 00149 00150 if (stream == NULL) 00151 return PARSERUTILS_BADPARM; 00152 00153 parserutils__filter_destroy(s->input); 00154 parserutils_buffer_destroy(s->public.utf8); 00155 parserutils_buffer_destroy(s->raw); 00156 free(s); 00157 00158 return PARSERUTILS_OK; 00159 } 00160 00169 parserutils_error parserutils_inputstream_append( 00170 parserutils_inputstream *stream, 00171 const uint8_t *data, size_t len) 00172 { 00173 parserutils_inputstream_private *s = 00174 (parserutils_inputstream_private *) stream; 00175 00176 if (stream == NULL) 00177 return PARSERUTILS_BADPARM; 00178 00179 if (data == NULL) { 00180 s->public.had_eof = true; 00181 return PARSERUTILS_OK; 00182 } 00183 00184 return parserutils_buffer_append(s->raw, data, len); 00185 } 00186 00195 parserutils_error parserutils_inputstream_insert( 00196 parserutils_inputstream *stream, 00197 const uint8_t *data, size_t len) 00198 { 00199 parserutils_inputstream_private *s = 00200 (parserutils_inputstream_private *) stream; 00201 00202 if (stream == NULL || data == NULL) 00203 return PARSERUTILS_BADPARM; 00204 00205 return parserutils_buffer_insert(s->public.utf8, s->public.cursor, 00206 data, len); 00207 } 00208 00209 #define IS_ASCII(x) (((x) & 0x80) == 0) 00210 00232 parserutils_error parserutils_inputstream_peek_slow( 00233 parserutils_inputstream *stream, 00234 size_t offset, const uint8_t **ptr, size_t *length) 00235 { 00236 parserutils_inputstream_private *s = 00237 (parserutils_inputstream_private *) stream; 00238 parserutils_error error = PARSERUTILS_OK; 00239 size_t len; 00240 00241 if (stream == NULL || ptr == NULL || length == NULL) 00242 return PARSERUTILS_BADPARM; 00243 00244 /* There's insufficient data in the buffer, so read some more */ 00245 if (s->raw->length == 0) { 00246 /* No more data to be had */ 00247 return s->public.had_eof ? PARSERUTILS_EOF 00248 : PARSERUTILS_NEEDDATA; 00249 } 00250 00251 /* Refill utf8 buffer from raw buffer */ 00252 error = parserutils_inputstream_refill_buffer(s); 00253 if (error != PARSERUTILS_OK) 00254 return error; 00255 00256 /* Refill may have succeeded, but not actually produced any new data */ 00257 if (s->public.cursor + offset == s->public.utf8->length) 00258 return PARSERUTILS_NEEDDATA; 00259 00260 /* Now try the read */ 00261 if (IS_ASCII(s->public.utf8->data[s->public.cursor + offset])) { 00262 len = 1; 00263 } else { 00264 error = parserutils_charset_utf8_char_byte_length( 00265 s->public.utf8->data + s->public.cursor + offset, 00266 &len); 00267 00268 if (error != PARSERUTILS_OK && error != PARSERUTILS_NEEDDATA) 00269 return error; 00270 00271 if (error == PARSERUTILS_NEEDDATA) { 00272 return s->public.had_eof ? PARSERUTILS_EOF 00273 : PARSERUTILS_NEEDDATA; 00274 } 00275 } 00276 00277 (*length) = len; 00278 (*ptr) = (s->public.utf8->data + s->public.cursor + offset); 00279 00280 return PARSERUTILS_OK; 00281 } 00282 00283 #undef IS_ASCII 00284 00292 const char *parserutils_inputstream_read_charset( 00293 parserutils_inputstream *stream, uint32_t *source) 00294 { 00295 parserutils_inputstream_private *s = 00296 (parserutils_inputstream_private *) stream; 00297 00298 if (stream == NULL || source == NULL) 00299 return NULL; 00300 00301 *source = s->encsrc; 00302 00303 if (s->encsrc == 0) 00304 return "UTF-8"; 00305 00306 return parserutils_charset_mibenum_to_name(s->mibenum); 00307 } 00308 00321 parserutils_error parserutils_inputstream_change_charset( 00322 parserutils_inputstream *stream, 00323 const char *enc, uint32_t source) 00324 { 00325 parserutils_inputstream_private *s = 00326 (parserutils_inputstream_private *) stream; 00327 parserutils_filter_optparams params; 00328 uint16_t temp; 00329 parserutils_error error; 00330 00331 if (stream == NULL || enc == NULL) 00332 return PARSERUTILS_BADPARM; 00333 00334 if (s->done_first_chunk) 00335 return PARSERUTILS_INVALID; 00336 00337 temp = parserutils_charset_mibenum_from_name(enc, strlen(enc)); 00338 if (temp == 0) 00339 return PARSERUTILS_BADENCODING; 00340 00341 /* Ensure filter is using the correct encoding */ 00342 params.encoding.name = enc; 00343 error = parserutils__filter_setopt(s->input, 00344 PARSERUTILS_FILTER_SET_ENCODING, 00345 ¶ms); 00346 if (error != PARSERUTILS_OK) 00347 return error; 00348 00349 /* Finally, replace the current settings */ 00350 s->mibenum = temp; 00351 s->encsrc = source; 00352 00353 return PARSERUTILS_OK; 00354 } 00355 00356 /****************************************************************************** 00357 ******************************************************************************/ 00358 00365 parserutils_error parserutils_inputstream_refill_buffer( 00366 parserutils_inputstream_private *stream) 00367 { 00368 const uint8_t *raw; 00369 uint8_t *utf8; 00370 size_t raw_length, utf8_space; 00371 parserutils_error error; 00372 00373 /* If this is the first chunk of data, we must detect the charset and 00374 * strip the BOM, if one exists */ 00375 if (stream->done_first_chunk == false) { 00376 parserutils_filter_optparams params; 00377 00378 /* If there is a charset detection routine, give it an 00379 * opportunity to override any charset specified when the 00380 * inputstream was created */ 00381 if (stream->csdetect != NULL) { 00382 error = stream->csdetect(stream->raw->data, 00383 stream->raw->length, 00384 &stream->mibenum, &stream->encsrc); 00385 if (error != PARSERUTILS_OK) { 00386 if (error != PARSERUTILS_NEEDDATA || 00387 stream->public.had_eof == false) 00388 return error; 00389 00390 /* We don't have enough data to detect the 00391 * input encoding, but we're not going to get 00392 * any more as we've been notified of EOF. 00393 * Therefore, leave the encoding alone 00394 * so that any charset specified when the 00395 * inputstream was created will be preserved. 00396 * If there was no charset specified, then 00397 * we'll default to UTF-8, below */ 00398 } 00399 } 00400 00401 /* Default to UTF-8 if there is still no encoding information 00402 * We'll do this if there was no encoding specified up-front 00403 * and: 00404 * 1) there was no charset detection routine 00405 * or 2) there was insufficient data for the charset 00406 * detection routine to detect an encoding 00407 */ 00408 if (stream->mibenum == 0) { 00409 stream->mibenum = 00410 parserutils_charset_mibenum_from_name("UTF-8", 00411 SLEN("UTF-8")); 00412 stream->encsrc = 0; 00413 } 00414 00415 assert(stream->mibenum != 0); 00416 00417 /* Strip any BOM, and update encoding as appropriate */ 00418 error = parserutils_inputstream_strip_bom(&stream->mibenum, 00419 stream->raw); 00420 if (error != PARSERUTILS_OK) 00421 return error; 00422 00423 /* Ensure filter is using the correct encoding */ 00424 params.encoding.name = 00425 parserutils_charset_mibenum_to_name(stream->mibenum); 00426 00427 error = parserutils__filter_setopt(stream->input, 00428 PARSERUTILS_FILTER_SET_ENCODING, 00429 ¶ms); 00430 if (error != PARSERUTILS_OK) 00431 return error; 00432 00433 stream->done_first_chunk = true; 00434 } 00435 00436 /* Work out how to perform the buffer fill */ 00437 if (stream->public.cursor == stream->public.utf8->length) { 00438 /* Cursor's at the end, so simply reuse the entire buffer */ 00439 utf8 = stream->public.utf8->data; 00440 utf8_space = stream->public.utf8->allocated; 00441 } else { 00442 /* Cursor's not at the end, so shift data after cursor to the 00443 * bottom of the buffer. If the buffer's still over half full, 00444 * extend it. */ 00445 memmove(stream->public.utf8->data, 00446 stream->public.utf8->data + stream->public.cursor, 00447 stream->public.utf8->length - stream->public.cursor); 00448 00449 stream->public.utf8->length -= stream->public.cursor; 00450 00451 if (stream->public.utf8->length > 00452 stream->public.utf8->allocated / 2) { 00453 error = parserutils_buffer_grow(stream->public.utf8); 00454 if (error != PARSERUTILS_OK) 00455 return error; 00456 } 00457 00458 utf8 = stream->public.utf8->data + stream->public.utf8->length; 00459 utf8_space = stream->public.utf8->allocated - 00460 stream->public.utf8->length; 00461 } 00462 00463 raw = stream->raw->data; 00464 raw_length = stream->raw->length; 00465 00466 /* Try to fill utf8 buffer from the raw data */ 00467 error = parserutils__filter_process_chunk(stream->input, 00468 &raw, &raw_length, &utf8, &utf8_space); 00469 /* _NOMEM implies that there's more input to read than available space 00470 * in the utf8 buffer. That's fine, so we'll ignore that error. */ 00471 if (error != PARSERUTILS_OK && error != PARSERUTILS_NOMEM) 00472 return error; 00473 00474 /* Remove the raw data we've processed from the raw buffer */ 00475 error = parserutils_buffer_discard(stream->raw, 0, 00476 stream->raw->length - raw_length); 00477 if (error != PARSERUTILS_OK) 00478 return error; 00479 00480 /* Fix up the utf8 buffer information */ 00481 stream->public.utf8->length = 00482 stream->public.utf8->allocated - utf8_space; 00483 00484 /* Finally, fix up the cursor */ 00485 stream->public.cursor = 0; 00486 00487 return PARSERUTILS_OK; 00488 } 00489 00496 parserutils_error parserutils_inputstream_strip_bom(uint16_t *mibenum, 00497 parserutils_buffer *buffer) 00498 { 00499 static uint16_t utf8; 00500 static uint16_t utf16; 00501 static uint16_t utf16be; 00502 static uint16_t utf16le; 00503 static uint16_t utf32; 00504 static uint16_t utf32be; 00505 static uint16_t utf32le; 00506 00507 if (utf8 == 0) { 00508 utf8 = parserutils_charset_mibenum_from_name("UTF-8", 00509 SLEN("UTF-8")); 00510 utf16 = parserutils_charset_mibenum_from_name("UTF-16", 00511 SLEN("UTF-16")); 00512 utf16be = parserutils_charset_mibenum_from_name("UTF-16BE", 00513 SLEN("UTF-16BE")); 00514 utf16le = parserutils_charset_mibenum_from_name("UTF-16LE", 00515 SLEN("UTF-16LE")); 00516 utf32 = parserutils_charset_mibenum_from_name("UTF-32", 00517 SLEN("UTF-32")); 00518 utf32be = parserutils_charset_mibenum_from_name("UTF-32BE", 00519 SLEN("UTF-32BE")); 00520 utf32le = parserutils_charset_mibenum_from_name("UTF-32LE", 00521 SLEN("UTF-32LE")); 00522 } 00523 00524 #define UTF32_BOM_LEN (4) 00525 #define UTF16_BOM_LEN (2) 00526 #define UTF8_BOM_LEN (3) 00527 00528 if (*mibenum == utf8) { 00529 if (buffer->length >= UTF8_BOM_LEN && 00530 buffer->data[0] == 0xEF && 00531 buffer->data[1] == 0xBB && 00532 buffer->data[2] == 0xBF) { 00533 return parserutils_buffer_discard( 00534 buffer, 0, UTF8_BOM_LEN); 00535 } 00536 } else if (*mibenum == utf16be) { 00537 if (buffer->length >= UTF16_BOM_LEN && 00538 buffer->data[0] == 0xFE && 00539 buffer->data[1] == 0xFF) { 00540 return parserutils_buffer_discard( 00541 buffer, 0, UTF16_BOM_LEN); 00542 } 00543 } else if (*mibenum == utf16le) { 00544 if (buffer->length >= UTF16_BOM_LEN && 00545 buffer->data[0] == 0xFF && 00546 buffer->data[1] == 0xFE) { 00547 return parserutils_buffer_discard( 00548 buffer, 0, UTF16_BOM_LEN); 00549 } 00550 } else if (*mibenum == utf16) { 00551 *mibenum = utf16be; 00552 00553 if (buffer->length >= UTF16_BOM_LEN) { 00554 if (buffer->data[0] == 0xFE && 00555 buffer->data[1] == 0xFF) { 00556 return parserutils_buffer_discard( 00557 buffer, 0, UTF16_BOM_LEN); 00558 } else if (buffer->data[0] == 0xFF && 00559 buffer->data[1] == 0xFE) { 00560 *mibenum = utf16le; 00561 return parserutils_buffer_discard( 00562 buffer, 0, UTF16_BOM_LEN); 00563 } 00564 } 00565 } else if (*mibenum == utf32be) { 00566 if (buffer->length >= UTF32_BOM_LEN && 00567 buffer->data[0] == 0x00 && 00568 buffer->data[1] == 0x00 && 00569 buffer->data[2] == 0xFE && 00570 buffer->data[3] == 0xFF) { 00571 return parserutils_buffer_discard( 00572 buffer, 0, UTF32_BOM_LEN); 00573 } 00574 } else if (*mibenum == utf32le) { 00575 if (buffer->length >= UTF32_BOM_LEN && 00576 buffer->data[0] == 0xFF && 00577 buffer->data[1] == 0xFE && 00578 buffer->data[2] == 0x00 && 00579 buffer->data[3] == 0x00) { 00580 return parserutils_buffer_discard( 00581 buffer, 0, UTF32_BOM_LEN); 00582 } 00583 } else if (*mibenum == utf32) { 00584 *mibenum = utf32be; 00585 00586 if (buffer->length >= UTF32_BOM_LEN) { 00587 if (buffer->data[0] == 0x00 && 00588 buffer->data[1] == 0x00 && 00589 buffer->data[2] == 0xFE && 00590 buffer->data[3] == 0xFF) { 00591 return parserutils_buffer_discard( 00592 buffer, 0, UTF32_BOM_LEN); 00593 } else if (buffer->data[0] == 0xFF && 00594 buffer->data[1] == 0xFE && 00595 buffer->data[2] == 0x00 && 00596 buffer->data[3] == 0x00) { 00597 *mibenum = utf32le; 00598 return parserutils_buffer_discard( 00599 buffer, 0, UTF32_BOM_LEN); 00600 } 00601 } 00602 } 00603 00604 #undef UTF8_BOM_LEN 00605 #undef UTF16_BOM_LEN 00606 #undef UTF32_BOM_LEN 00607 00608 return PARSERUTILS_OK; 00609 } 00610
1.7.3