|
Libparserutils
|
00001 /* 00002 * This file is part of LibParserUtils. 00003 * Licensed under the MIT License, 00004 * http://www.opensource.org/licenses/mit-license.php 00005 * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org> 00006 */ 00007 00012 #include <stdbool.h> 00013 #include <stdlib.h> 00014 #include <string.h> 00015 00016 #include <parserutils/charset/utf16.h> 00017 00027 parserutils_error parserutils_charset_utf16_to_ucs4(const uint8_t *s, 00028 size_t len, uint32_t *ucs4, size_t *clen) 00029 { 00030 const uint16_t *ss = (const uint16_t *) (const void *) s; 00031 00032 if (s == NULL || ucs4 == NULL || clen == NULL) 00033 return PARSERUTILS_BADPARM; 00034 00035 if (len < 2) 00036 return PARSERUTILS_NEEDDATA; 00037 00038 if (*ss < 0xD800 || *ss > 0xDFFF) { 00039 *ucs4 = *ss; 00040 *clen = 2; 00041 } else if (0xD800 <= *ss && *ss <= 0xDBFF) { 00042 /* High-surrogate code unit. */ 00043 if (len < 4) 00044 return PARSERUTILS_NEEDDATA; 00045 00046 if (0xDC00 <= ss[1] && ss[1] <= 0xDFFF) { 00047 /* We have a valid surrogate pair. */ 00048 *ucs4 = (((ss[0] & 0x3FF) << 10) | (ss[1] & 0x3FF)) 00049 + (1<<16); 00050 *clen = 4; 00051 } else { 00052 return PARSERUTILS_INVALID; 00053 } 00054 } else { 00055 /* Low-surrogate code unit. */ 00056 return PARSERUTILS_INVALID; 00057 } 00058 00059 return PARSERUTILS_OK; 00060 } 00061 00070 parserutils_error parserutils_charset_utf16_from_ucs4(uint32_t ucs4, uint8_t *s, 00071 size_t *len) 00072 { 00073 uint16_t *ss = (uint16_t *) (void *) s; 00074 uint32_t l = 0; 00075 00076 if (s == NULL || len == NULL) 00077 return PARSERUTILS_BADPARM; 00078 else if (ucs4 < 0x10000) { 00079 *ss = (uint16_t) ucs4; 00080 l = 2; 00081 } else if (ucs4 < 0x110000) { 00082 ss[0] = 0xD800 | (((ucs4 >> 16) & 0x1f) - 1) | (ucs4 >> 10); 00083 ss[1] = 0xDC00 | (ucs4 & 0x3ff); 00084 l = 4; 00085 } else { 00086 return PARSERUTILS_INVALID; 00087 } 00088 00089 *len = l; 00090 00091 return PARSERUTILS_OK; 00092 } 00093 00102 parserutils_error parserutils_charset_utf16_length(const uint8_t *s, size_t max, 00103 size_t *len) 00104 { 00105 const uint16_t *ss = (const uint16_t *) (const void *) s; 00106 const uint16_t *end = (const uint16_t *) (const void *) (s + max); 00107 int l = 0; 00108 00109 if (s == NULL || len == NULL) 00110 return PARSERUTILS_BADPARM; 00111 00112 while (ss < end) { 00113 if (*ss < 0xD800 || 0xDFFF < *ss) 00114 ss++; 00115 else 00116 ss += 2; 00117 00118 l++; 00119 } 00120 00121 *len = l; 00122 00123 return PARSERUTILS_OK; 00124 } 00125 00133 parserutils_error parserutils_charset_utf16_char_byte_length(const uint8_t *s, 00134 size_t *len) 00135 { 00136 const uint16_t *ss = (const uint16_t *) (const void *) s; 00137 00138 if (s == NULL || len == NULL) 00139 return PARSERUTILS_BADPARM; 00140 00141 if (*ss < 0xD800 || 0xDFFF < *ss) 00142 *len = 2; 00143 else 00144 *len = 4; 00145 00146 return PARSERUTILS_OK; 00147 } 00148 00158 parserutils_error parserutils_charset_utf16_prev(const uint8_t *s, uint32_t off, 00159 uint32_t *prevoff) 00160 { 00161 const uint16_t *ss = (const uint16_t *) (const void *) s; 00162 00163 if (s == NULL || prevoff == NULL) 00164 return PARSERUTILS_BADPARM; 00165 00166 if (off < 2) 00167 *prevoff = 0; 00168 else if (ss[-1] < 0xDC00 || ss[-1] > 0xDFFF) 00169 *prevoff = off - 2; 00170 else 00171 *prevoff = (off < 4) ? 0 : off - 4; 00172 00173 return PARSERUTILS_OK; 00174 } 00175 00186 parserutils_error parserutils_charset_utf16_next(const uint8_t *s, uint32_t len, 00187 uint32_t off, uint32_t *nextoff) 00188 { 00189 const uint16_t *ss = (const uint16_t *) (const void *) s; 00190 00191 if (s == NULL || off >= len || nextoff == NULL) 00192 return PARSERUTILS_BADPARM; 00193 00194 if (len - off < 4) 00195 *nextoff = len; 00196 else if (ss[1] < 0xD800 || ss[1] > 0xDBFF) 00197 *nextoff = off + 2; 00198 else 00199 *nextoff = (len - off < 6) ? len : off + 4; 00200 00201 return PARSERUTILS_OK; 00202 } 00203 00214 parserutils_error parserutils_charset_utf16_next_paranoid(const uint8_t *s, 00215 uint32_t len, uint32_t off, uint32_t *nextoff) 00216 { 00217 const uint16_t *ss = (const uint16_t *) (const void *) s; 00218 00219 if (s == NULL || off >= len || nextoff == NULL) 00220 return PARSERUTILS_BADPARM; 00221 00222 while (1) { 00223 if (len - off < 4) { 00224 return PARSERUTILS_NEEDDATA; 00225 } else if (ss[1] < 0xD800 || ss[1] > 0xDFFF) { 00226 *nextoff = off + 2; 00227 break; 00228 } else if (ss[1] >= 0xD800 && ss[1] <= 0xDBFF) { 00229 if (len - off < 6) 00230 return PARSERUTILS_NEEDDATA; 00231 00232 if (ss[2] >= 0xDC00 && ss[2] <= 0xDFFF) { 00233 *nextoff = off + 4; 00234 break; 00235 } else { 00236 ss++; 00237 off += 2; 00238 } 00239 } 00240 } 00241 00242 return PARSERUTILS_OK; 00243 } 00244
1.7.3