|
Libparserutils
|
/*
 * This file is part of LibParserUtils.
 * Licensed under the MIT License,
 *                http://www.opensource.org/licenses/mit-license.php
 * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
 */

#ifndef parserutils_charset_encodings_utf8impl_h_
#define parserutils_charset_encodings_utf8impl_h_

/** \file
 * UTF-8 manipulation macros.
 *
 * Every macro here expands to a single do { ... } while(0) statement in the
 * caller's scope and reports its outcome by assigning to the \a error
 * argument, which must therefore be a modifiable lvalue.
 *
 * The PARSERUTILS_* error codes are not defined by this header; they must be
 * visible wherever a macro is expanded.
 *
 * Several macros modify their arguments in place (noted per macro), so those
 * arguments must be modifiable lvalues without side effects.
 */

#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/**
 * Table indexed by a byte value; the macros below use it as the number of
 * continuation bytes that follow that byte when it starts a sequence.
 * Defined elsewhere.
 */
extern const uint8_t numContinuations[256];

/**
 * Decode one UTF-8 sequence into a UCS-4 value.
 *
 * Sequences of up to six bytes are accepted.
 *
 * \param s      Pointer to the first byte of the sequence
 * \param len    Number of bytes available at \a s
 * \param ucs4   Pointer to location to receive the decoded value
 * \param clen   Pointer to location to receive the sequence's byte length
 * \param error  Lvalue receiving the result:
 *               PARSERUTILS_OK       on success,
 *               PARSERUTILS_BADPARM  if s, ucs4 or clen is NULL,
 *               PARSERUTILS_NEEDDATA if len is 0 or shorter than the sequence,
 *               PARSERUTILS_INVALID  for a bad start or continuation byte, an
 *                                    overlong encoding, a surrogate
 *                                    (D800-DFFF), or FFFE/FFFF.
 *
 * \a *ucs4 and \a *clen are written only on success.
 */
#define UTF8_TO_UCS4(s, len, ucs4, clen, error) \
do { \
	uint32_t c, min; \
	uint8_t n; \
	uint8_t i; \
	\
	(error) = PARSERUTILS_OK; \
	\
	if ((s) == NULL || (ucs4) == NULL || (clen) == NULL) { \
		(error) = PARSERUTILS_BADPARM; \
		break; \
	} \
	\
	if ((len) == 0) { \
		(error) = PARSERUTILS_NEEDDATA; \
		break; \
	} \
	\
	c = (s)[0]; \
	\
	/* Classify the start byte: sequence length (n), payload bits, */ \
	/* and the smallest value (min) that needs this many bytes */ \
	if (c < 0x80) { \
		n = 1; \
		min = 0; \
	} else if ((c & 0xE0) == 0xC0) { \
		c &= 0x1F; \
		n = 2; \
		min = 0x80; \
	} else if ((c & 0xF0) == 0xE0) { \
		c &= 0x0F; \
		n = 3; \
		min = 0x800; \
	} else if ((c & 0xF8) == 0xF0) { \
		c &= 0x07; \
		n = 4; \
		min = 0x10000; \
	} else if ((c & 0xFC) == 0xF8) { \
		c &= 0x03; \
		n = 5; \
		min = 0x200000; \
	} else if ((c & 0xFE) == 0xFC) { \
		c &= 0x01; \
		n = 6; \
		min = 0x4000000; \
	} else { \
		(error) = PARSERUTILS_INVALID; \
		break; \
	} \
	\
	if ((len) < n) { \
		(error) = PARSERUTILS_NEEDDATA; \
		break; \
	} \
	\
	/* Fold in the continuation bytes. The break below only leaves */ \
	/* this for loop; the error is re-checked afterwards. */ \
	for (i = 1; i < n; i++) { \
		uint32_t t = (s)[i]; \
		\
		if ((t & 0xC0) != 0x80) { \
			(error) = PARSERUTILS_INVALID; \
			break; \
		} \
		\
		c <<= 6; \
		c |= t & 0x3F; \
	} \
	\
	if ((error) == PARSERUTILS_OK) { \
		/* Detect overlong sequences, surrogates and fffe/ffff */ \
		if (c < min || (c >= 0xD800 && c <= 0xDFFF) || \
				c == 0xFFFE || c == 0xFFFF) { \
			(error) = PARSERUTILS_INVALID; \
			break; \
		} \
		\
		*(ucs4) = c; \
		*(clen) = n; \
	} \
} while(0)

/**
 * Encode a UCS-4 value as UTF-8.
 *
 * \param ucs4   Value to encode. Shifted in place while encoding multi-byte
 *               sequences, so it must be a modifiable lvalue and its value is
 *               not preserved.
 * \param s      Pointer to pointer to the output buffer; advanced past the
 *               bytes written on success
 * \param len    Pointer to the space available in the output buffer, in
 *               bytes; reduced by the number of bytes written on success
 * \param error  Lvalue receiving the result:
 *               PARSERUTILS_OK      on success,
 *               PARSERUTILS_BADPARM if s, *s or len is NULL,
 *               PARSERUTILS_INVALID if ucs4 exceeds 0x7FFFFFFF,
 *               PARSERUTILS_NOMEM   if the encoding needs more than *len bytes.
 */
#define UTF8_FROM_UCS4(ucs4, s, len, error) \
do { \
	uint8_t *buf; \
	uint8_t l = 0; \
	\
	(error) = PARSERUTILS_OK; \
	\
	if ((s) == NULL || *(s) == NULL || (len) == NULL) { \
		(error) = PARSERUTILS_BADPARM; \
		break; \
	} \
	\
	if ((ucs4) < 0x80) { \
		l = 1; \
	} else if ((ucs4) < 0x800) { \
		l = 2; \
	} else if ((ucs4) < 0x10000) { \
		l = 3; \
	} else if ((ucs4) < 0x200000) { \
		l = 4; \
	} else if ((ucs4) < 0x4000000) { \
		l = 5; \
	} else if ((ucs4) <= 0x7FFFFFFF) { \
		l = 6; \
	} else { \
		(error) = PARSERUTILS_INVALID; \
		break; \
	} \
	\
	if (l > *(len)) { \
		(error) = PARSERUTILS_NOMEM; \
		break; \
	} \
	\
	buf = *(s); \
	\
	if (l == 1) { \
		buf[0] = (uint8_t) (ucs4); \
	} else { \
		uint8_t i; \
		/* Emit continuation bytes last-to-first, 6 bits at a time */ \
		for (i = l; i > 1; i--) { \
			buf[i - 1] = 0x80 | ((ucs4) & 0x3F); \
			(ucs4) >>= 6; \
		} \
		/* Start byte: top l bits set, remaining payload below */ \
		buf[0] = ~((1 << (8 - l)) - 1) | (ucs4); \
	} \
	\
	*(s) += l; \
	*(len) -= l; \
} while(0)

/**
 * Count the characters in a UTF-8 buffer.
 *
 * Only start bytes are inspected; continuation bytes are skipped without
 * being validated. A sequence cut short by the end of the buffer is still
 * counted as one character.
 *
 * \param s      Pointer to the buffer. Advanced in place as the buffer is
 *               scanned, so it must be a modifiable lvalue; it never moves
 *               beyond s + max.
 * \param max    Length of the buffer, in bytes
 * \param len    Pointer to location to receive the character count
 * \param error  Lvalue receiving the result:
 *               PARSERUTILS_OK      on success,
 *               PARSERUTILS_BADPARM if s or len is NULL,
 *               PARSERUTILS_INVALID if a byte that cannot start a sequence
 *                                   is found where a start byte is expected.
 *
 * \a *len is written only on success.
 */
#define UTF8_LENGTH(s, max, len, error) \
do { \
	const uint8_t *end; \
	int l = 0; \
	\
	(error) = PARSERUTILS_OK; \
	\
	if ((s) == NULL || (len) == NULL) { \
		(error) = PARSERUTILS_BADPARM; \
		break; \
	} \
	\
	/* Form the end pointer only once s is known to be non-NULL: */ \
	/* arithmetic on a null pointer is undefined behaviour */ \
	end = (s) + (max); \
	\
	while ((s) < end) { \
		uint32_t c = (s)[0]; \
		size_t step; \
		\
		if ((c & 0x80) == 0x00) \
			step = 1; \
		else if ((c & 0xE0) == 0xC0) \
			step = 2; \
		else if ((c & 0xF0) == 0xE0) \
			step = 3; \
		else if ((c & 0xF8) == 0xF0) \
			step = 4; \
		else if ((c & 0xFC) == 0xF8) \
			step = 5; \
		else if ((c & 0xFE) == 0xFC) \
			step = 6; \
		else { \
			(error) = PARSERUTILS_INVALID; \
			break; \
		} \
		\
		/* Never step beyond the end of the buffer when the final */ \
		/* sequence is truncated (end - s is positive here) */ \
		(s) += ((size_t) (end - (s)) < step) ? \
				(size_t) (end - (s)) : step; \
		\
		l++; \
	} \
	\
	if ((error) == PARSERUTILS_OK) \
		*(len) = l; \
} while(0)

/**
 * Compute the byte length of the sequence introduced by a start byte.
 *
 * The result is numContinuations[s[0]] + 1; only the first byte is read.
 *
 * \param s      Pointer to the start byte
 * \param len    Pointer to location to receive the length, in bytes
 * \param error  Lvalue receiving the result:
 *               PARSERUTILS_OK      on success,
 *               PARSERUTILS_BADPARM if s or len is NULL.
 */
#define UTF8_CHAR_BYTE_LENGTH(s, len, error) \
do { \
	if ((s) == NULL || (len) == NULL) { \
		(error) = PARSERUTILS_BADPARM; \
		break; \
	} \
	\
	*(len) = numContinuations[(s)[0]] + 1 /* Start byte */; \
	\
	(error) = PARSERUTILS_OK; \
} while(0)

/**
 * Step backwards to the start of the previous sequence.
 *
 * Moves back from \a off until a byte that is not a continuation byte
 * (10xxxxxx) is reached, or offset 0 is hit.
 *
 * \param s        Pointer to the buffer
 * \param off      Offset to step back from. Decremented in place, so it must
 *                 be a modifiable lvalue; on success it equals *prevoff.
 * \param prevoff  Pointer to location to receive the resulting offset
 * \param error    Lvalue receiving the result:
 *                 PARSERUTILS_OK      on success,
 *                 PARSERUTILS_BADPARM if s or prevoff is NULL.
 */
#define UTF8_PREV(s, off, prevoff, error) \
do { \
	if ((s) == NULL || (prevoff) == NULL) { \
		(error) = PARSERUTILS_BADPARM; \
		break; \
	} \
	\
	while ((off) != 0 && ((s)[--(off)] & 0xC0) == 0x80) \
		/* do nothing */; \
	\
	*(prevoff) = (off); \
	\
	(error) = PARSERUTILS_OK; \
} while(0)

/**
 * Step forwards to the start of the next sequence.
 *
 * Skips the byte at \a off if it is not a continuation byte, then skips any
 * continuation bytes (10xxxxxx) that follow, stopping at \a len. Sequence
 * lengths are not checked.
 *
 * \param s        Pointer to the buffer
 * \param len      Length of the buffer, in bytes
 * \param off      Offset to step forward from. Incremented in place, so it
 *                 must be a modifiable lvalue; on success it equals *nextoff.
 * \param nextoff  Pointer to location to receive the resulting offset
 * \param error    Lvalue receiving the result:
 *                 PARSERUTILS_OK      on success,
 *                 PARSERUTILS_BADPARM if s or nextoff is NULL, or off >= len.
 */
#define UTF8_NEXT(s, len, off, nextoff, error) \
do { \
	if ((s) == NULL || (off) >= (len) || (nextoff) == NULL) { \
		(error) = PARSERUTILS_BADPARM; \
		break; \
	} \
	\
	/* Skip current start byte (if present - may be mid-sequence) */ \
	if ((s)[(off)] < 0x80 || ((s)[(off)] & 0xC0) == 0xC0) \
		(off)++; \
	\
	while ((off) < (len) && ((s)[(off)] & 0xC0) == 0x80) \
		(off)++; \
	\
	*(nextoff) = (off); \
	\
	(error) = PARSERUTILS_OK; \
} while(0)

/**
 * Step forwards to the next sequence, checking continuation bytes.
 *
 * If the byte at \a off is a continuation byte, the offset advances by one.
 * Otherwise numContinuations gives the expected number of continuation
 * bytes; the offset advances past the start byte and past each following
 * byte that really is a continuation byte, stopping early at the first one
 * that is not.
 *
 * \param s        Pointer to the buffer
 * \param len      Length of the buffer, in bytes
 * \param off      Offset to step forward from. Incremented in place, so it
 *                 must be a modifiable lvalue; on success it equals *nextoff.
 * \param nextoff  Pointer to location to receive the resulting offset
 * \param error    Lvalue receiving the result:
 *                 PARSERUTILS_OK       on success,
 *                 PARSERUTILS_BADPARM  if s or nextoff is NULL, or off >= len,
 *                 PARSERUTILS_NEEDDATA if off + expected continuation bytes
 *                                      + 1 is not less than len.
 *
 * \a *nextoff is written only on success.
 */
#define UTF8_NEXT_PARANOID(s, len, off, nextoff, error) \
do { \
	uint8_t c; \
	\
	(error) = PARSERUTILS_OK; \
	\
	if ((s) == NULL || (off) >= (len) || (nextoff) == NULL) { \
		(error) = PARSERUTILS_BADPARM; \
		break; \
	} \
	\
	c = (s)[(off)]; \
	\
	/* If we're mid-sequence, simply advance to next byte */ \
	if (!(c < 0x80 || (c & 0xC0) == 0xC0)) { \
		(off)++; \
	} else { \
		uint32_t nCont = numContinuations[c]; \
		uint32_t nToSkip; \
		\
		/* NOTE(review): ">=" also reports NEEDDATA when the */ \
		/* sequence ends exactly at len - confirm this is intended */ \
		if ((off) + nCont + 1 >= (len)) { \
			(error) = PARSERUTILS_NEEDDATA; \
			break; \
		} \
		\
		/* Verify continuation bytes */ \
		for (nToSkip = 1; nToSkip <= nCont; nToSkip++) { \
			if (((s)[(off) + nToSkip] & 0xC0) != 0x80) \
				break; \
		} \
		\
		/* Skip over the valid bytes */ \
		(off) += nToSkip; \
	} \
	\
	*(nextoff) = (off); \
} while(0)

#endif
1.7.3