From 12c91df5774c5174d6b3191cfc0dbc9cc396755d Mon Sep 17 00:00:00 2001 From: Charlie Gordon Date: Tue, 20 Feb 2024 00:22:32 +0100 Subject: [PATCH] Improve surrogate handling readability - add inline function to test and convert surrogates is_surrogate(c), is_hi_surrogate(c), is_lo_surrogate(c), get_hi_surrogate(c), get_lo_surrogate(c), from_surrogate(hi, lo) - use names for BC header offsets and lengths in libregexp.c - remove strict aliasing violations in `lre_exec_backtrack()` - pass all context variables to XXX_CHAR macros in `lre_exec_backtrack()` --- cutils.h | 39 ++++++++++++- libregexp.c | 159 ++++++++++++++++++++++++++-------------------------- quickjs.c | 76 +++++++++++-------------- 3 files changed, 150 insertions(+), 124 deletions(-) diff --git a/cutils.h b/cutils.h index ea7a760..ff2d3fb 100644 --- a/cutils.h +++ b/cutils.h @@ -45,9 +45,10 @@ #ifndef countof #define countof(x) (sizeof(x) / sizeof((x)[0])) #endif - +#ifndef container_of /* return the pointer of type 'type *' containing 'ptr' as field 'member' */ #define container_of(ptr, type, member) ((type *)((uint8_t *)(ptr) - offsetof(type, member))) +#endif typedef int BOOL; @@ -207,17 +208,22 @@ static inline void put_u8(uint8_t *tab, uint8_t val) *tab = val; } +#ifndef bswap16 static inline uint16_t bswap16(uint16_t x) { return (x >> 8) | (x << 8); } +#endif +#ifndef bswap32 static inline uint32_t bswap32(uint32_t v) { return ((v & 0xff000000) >> 24) | ((v & 0x00ff0000) >> 8) | ((v & 0x0000ff00) << 8) | ((v & 0x000000ff) << 24); } +#endif +#ifndef bswap64 static inline uint64_t bswap64(uint64_t v) { return ((v & ((uint64_t)0xff << (7 * 8))) >> (7 * 8)) | @@ -229,6 +235,7 @@ static inline uint64_t bswap64(uint64_t v) ((v & ((uint64_t)0xff << (1 * 8))) << (5 * 8)) | ((v & ((uint64_t)0xff << (0 * 8))) << (7 * 8)); } +#endif /* XXX: should take an extra argument to pass slack information to the caller */ typedef void *DynBufReallocFunc(void *opaque, void *ptr, size_t size); @@ -278,6 +285,36 @@ static inline void dbuf_set_error(DynBuf *s) int unicode_to_utf8(uint8_t *buf, unsigned int c); int unicode_from_utf8(const uint8_t *p, int max_len, const uint8_t **pp); +static inline BOOL is_surrogate(uint32_t c) +{ + return (c >> 11) == (0xD800 >> 11); // 0xD800-0xDFFF +} + +static inline BOOL is_hi_surrogate(uint32_t c) +{ + return (c >> 10) == (0xD800 >> 10); // 0xD800-0xDBFF +} + +static inline BOOL is_lo_surrogate(uint32_t c) +{ + return (c >> 10) == (0xDC00 >> 10); // 0xDC00-0xDFFF +} + +static inline uint32_t get_hi_surrogate(uint32_t c) +{ + return (c >> 10) - (0x10000 >> 10) + 0xD800; +} + +static inline uint32_t get_lo_surrogate(uint32_t c) +{ + return (c & 0x3FF) | 0xDC00; +} + +static inline uint32_t from_surrogate(uint32_t hi, uint32_t lo) +{ + return 0x10000 + 0x400 * (hi - 0xD800) + (lo - 0xDC00); +} + static inline int from_hex(int c) { if (c >= '0' && c <= '9') diff --git a/libregexp.c b/libregexp.c index f91c6e6..d73a19f 100644 --- a/libregexp.c +++ b/libregexp.c @@ -100,6 +100,7 @@ static const REOpCode reopcode_info[REOP_COUNT] = { #define RE_HEADER_FLAGS 0 #define RE_HEADER_CAPTURE_COUNT 1 #define RE_HEADER_STACK_SIZE 2 +#define RE_HEADER_BYTECODE_LEN 3 #define RE_HEADER_LEN 7 @@ -224,16 +225,16 @@ static __maybe_unused void lre_dump_bytecode(const uint8_t *buf, assert(buf_len >= RE_HEADER_LEN); - re_flags = buf[0]; - bc_len = get_u32(buf + 3); + re_flags = lre_get_flags(buf); + bc_len = get_u32(buf + RE_HEADER_BYTECODE_LEN); assert(bc_len + RE_HEADER_LEN <= buf_len); printf("flags: 0x%x capture_count=%d stack_size=%d\n", - re_flags, buf[1], buf[2]); + re_flags, buf[RE_HEADER_CAPTURE_COUNT], buf[RE_HEADER_STACK_SIZE]); if (re_flags & LRE_FLAG_NAMED_GROUPS) { const char *p; p = (char *)buf + RE_HEADER_LEN + bc_len; printf("named groups: "); - for(i = 1; i < buf[1]; i++) { + for(i = 1; i < buf[RE_HEADER_CAPTURE_COUNT]; i++) { if (i != 1) printf(","); printf("<%s>", p); @@ -494,7 +495,7 @@ int lre_parse_escape(const uint8_t **pp, int allow_utf16) } c = (c << 4) | h; } - if (c >= 0xd800 && c < 0xdc00 && + if (is_hi_surrogate(c) && allow_utf16 == 2 && p[0] == '\\' && p[1] == 'u') { /* convert an escaped surrogate pair into a unicode char */ @@ -505,9 +506,9 @@ int lre_parse_escape(const uint8_t **pp, int allow_utf16) break; c1 = (c1 << 4) | h; } - if (i == 4 && c1 >= 0xdc00 && c1 < 0xe000) { + if (i == 4 && is_lo_surrogate(c1)) { p += 6; - c = (((c & 0x3ff) << 10) | (c1 & 0x3ff)) + 0x10000; + c = from_surrogate(c, c1); } } } @@ -936,7 +937,7 @@ static BOOL re_need_check_advance(const uint8_t *bc_buf, int bc_buf_len) case REOP_backward_back_reference: break; default: - /* safe behvior: we cannot predict the outcome */ + /* safe behavior: we cannot predict the outcome */ return TRUE; } pos += len; @@ -1005,10 +1006,10 @@ static int re_parse_group_name(char *buf, int buf_size, const uint8_t **pp) break; } else if (c >= 128) { c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p); - if (c >= 0xD800 && c <= 0xDBFF) { + if (is_hi_surrogate(c)) { d = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p1); - if (d >= 0xDC00 && d <= 0xDFFF) { - c = 0x10000 + 0x400 * (c - 0xD800) + (d - 0xDC00); + if (is_lo_surrogate(d)) { + c = from_surrogate(c, d); p = p1; } } @@ -1116,9 +1117,10 @@ static int find_group_name(REParseState *s, const char *name) size_t len, name_len; int capture_index; - name_len = strlen(name); p = (char *)s->group_names.buf; + if (!p) return -1; buf_end = (char *)s->group_names.buf + s->group_names.size; + name_len = strlen(name); capture_index = 1; while (p < buf_end) { len = strlen(p); @@ -1813,7 +1815,8 @@ uint8_t *lre_compile(int *plen, char *error_msg, int error_msg_size, s->byte_code.buf[RE_HEADER_CAPTURE_COUNT] = s->capture_count; s->byte_code.buf[RE_HEADER_STACK_SIZE] = stack_size; - put_u32(s->byte_code.buf + 3, s->byte_code.size - RE_HEADER_LEN); + put_u32(s->byte_code.buf + RE_HEADER_BYTECODE_LEN, + s->byte_code.size - RE_HEADER_LEN); /* add the named groups if needed */ if (s->group_names.size > (s->capture_count - 1)) { @@ -1844,93 +1847,86 @@ static BOOL is_word_char(uint32_t c) (c == '_')); } -#define GET_CHAR(c, cptr, cbuf_end) \ +#define GET_CHAR(c, cptr, cbuf_end, cbuf_type) \ do { \ if (cbuf_type == 0) { \ c = *cptr++; \ } else { \ - uint32_t __c1; \ - c = *(uint16_t *)cptr; \ - cptr += 2; \ - if (c >= 0xd800 && c < 0xdc00 && \ - cbuf_type == 2 && cptr < cbuf_end) { \ - __c1 = *(uint16_t *)cptr; \ - if (__c1 >= 0xdc00 && __c1 < 0xe000) { \ - c = (((c & 0x3ff) << 10) | (__c1 & 0x3ff)) + 0x10000; \ - cptr += 2; \ + const uint16_t *_p = (const uint16_t *)cptr; \ + const uint16_t *_end = (const uint16_t *)cbuf_end; \ + c = *_p++; \ + if (is_hi_surrogate(c) && cbuf_type == 2) { \ + if (_p < _end && is_lo_surrogate(*_p)) { \ + c = from_surrogate(c, *_p++); \ } \ } \ + cptr = (const void *)_p; \ } \ } while (0) -#define PEEK_CHAR(c, cptr, cbuf_end) \ +#define PEEK_CHAR(c, cptr, cbuf_end, cbuf_type) \ do { \ if (cbuf_type == 0) { \ c = cptr[0]; \ } else { \ - uint32_t __c1; \ - c = ((uint16_t *)cptr)[0]; \ - if (c >= 0xd800 && c < 0xdc00 && \ - cbuf_type == 2 && (cptr + 2) < cbuf_end) { \ - __c1 = ((uint16_t *)cptr)[1]; \ - if (__c1 >= 0xdc00 && __c1 < 0xe000) { \ - c = (((c & 0x3ff) << 10) | (__c1 & 0x3ff)) + 0x10000; \ + const uint16_t *_p = (const uint16_t *)cptr; \ + const uint16_t *_end = (const uint16_t *)cbuf_end; \ + c = *_p++; \ + if (is_hi_surrogate(c) && cbuf_type == 2) { \ + if (_p < _end && is_lo_surrogate(*_p)) { \ + c = from_surrogate(c, *_p); \ } \ } \ } \ } while (0) -#define PEEK_PREV_CHAR(c, cptr, cbuf_start) \ +#define PEEK_PREV_CHAR(c, cptr, cbuf_start, cbuf_type) \ do { \ if (cbuf_type == 0) { \ c = cptr[-1]; \ } else { \ - uint32_t __c1; \ - c = ((uint16_t *)cptr)[-1]; \ - if (c >= 0xdc00 && c < 0xe000 && \ - cbuf_type == 2 && (cptr - 4) >= cbuf_start) { \ - __c1 = ((uint16_t *)cptr)[-2]; \ - if (__c1 >= 0xd800 && __c1 < 0xdc00 ) { \ - c = (((__c1 & 0x3ff) << 10) | (c & 0x3ff)) + 0x10000; \ + const uint16_t *_p = (const uint16_t *)cptr - 1; \ + const uint16_t *_start = (const uint16_t *)cbuf_start; \ + c = *_p; \ + if (is_lo_surrogate(c) && cbuf_type == 2) { \ + if (_p > _start && is_hi_surrogate(_p[-1])) { \ + c = from_surrogate(*--_p, c); \ } \ } \ } \ } while (0) -#define GET_PREV_CHAR(c, cptr, cbuf_start) \ +#define GET_PREV_CHAR(c, cptr, cbuf_start, cbuf_type) \ do { \ if (cbuf_type == 0) { \ cptr--; \ c = cptr[0]; \ } else { \ - uint32_t __c1; \ - cptr -= 2; \ - c = ((uint16_t *)cptr)[0]; \ - if (c >= 0xdc00 && c < 0xe000 && \ - cbuf_type == 2 && cptr > cbuf_start) { \ - __c1 = ((uint16_t *)cptr)[-1]; \ - if (__c1 >= 0xd800 && __c1 < 0xdc00 ) { \ - cptr -= 2; \ - c = (((__c1 & 0x3ff) << 10) | (c & 0x3ff)) + 0x10000; \ + const uint16_t *_p = (const uint16_t *)cptr - 1; \ + const uint16_t *_start = (const uint16_t *)cbuf_start; \ + c = *_p; \ + if (is_lo_surrogate(c) && cbuf_type == 2) { \ + if (_p > _start && is_hi_surrogate(_p[-1])) { \ + c = from_surrogate(*--_p, c); \ } \ } \ + cptr = (const void *)_p; \ } \ } while (0) -#define PREV_CHAR(cptr, cbuf_start) \ +#define PREV_CHAR(cptr, cbuf_start, cbuf_type) \ do { \ if (cbuf_type == 0) { \ cptr--; \ } else { \ - cptr -= 2; \ - if (cbuf_type == 2) { \ - c = ((uint16_t *)cptr)[0]; \ - if (c >= 0xdc00 && c < 0xe000 && cptr > cbuf_start) { \ - c = ((uint16_t *)cptr)[-1]; \ - if (c >= 0xd800 && c < 0xdc00) \ - cptr -= 2; \ + const uint16_t *_p = (const uint16_t *)cptr - 1; \ + const uint16_t *_start = (const uint16_t *)cbuf_start; \ + if (is_lo_surrogate(*_p) && cbuf_type == 2) { \ + if (_p > _start && is_hi_surrogate(_p[-1])) { \ + --_p; \ } \ } \ + cptr = (const void *)_p; \ } \ } while (0) @@ -2070,7 +2066,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture, /* go backward */ char_count = get_u32(pc + 12); for(i = 0; i < char_count; i++) { - PREV_CHAR(cptr, s->cbuf); + PREV_CHAR(cptr, s->cbuf, cbuf_type); } pc = (pc + 16) + (int)get_u32(pc); rs->cptr = cptr; @@ -2105,7 +2101,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture, test_char: if (cptr >= cbuf_end) goto no_match; - GET_CHAR(c, cptr, cbuf_end); + GET_CHAR(c, cptr, cbuf_end, cbuf_type); if (s->ignore_case) { c = lre_canonicalize(c, s->is_unicode); } @@ -2152,7 +2148,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture, break; if (!s->multi_line) goto no_match; - PEEK_PREV_CHAR(c, cptr, s->cbuf); + PEEK_PREV_CHAR(c, cptr, s->cbuf, cbuf_type); if (!is_line_terminator(c)) goto no_match; break; @@ -2161,21 +2157,21 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture, break; if (!s->multi_line) goto no_match; - PEEK_CHAR(c, cptr, cbuf_end); + PEEK_CHAR(c, cptr, cbuf_end, cbuf_type); if (!is_line_terminator(c)) goto no_match; break; case REOP_dot: if (cptr == cbuf_end) goto no_match; - GET_CHAR(c, cptr, cbuf_end); + GET_CHAR(c, cptr, cbuf_end, cbuf_type); if (is_line_terminator(c)) goto no_match; break; case REOP_any: if (cptr == cbuf_end) goto no_match; - GET_CHAR(c, cptr, cbuf_end); + GET_CHAR(c, cptr, cbuf_end, cbuf_type); break; case REOP_save_start: case REOP_save_end: @@ -2227,14 +2223,14 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture, if (cptr == s->cbuf) { v1 = FALSE; } else { - PEEK_PREV_CHAR(c, cptr, s->cbuf); + PEEK_PREV_CHAR(c, cptr, s->cbuf, cbuf_type); v1 = is_word_char(c); } /* current char */ if (cptr >= cbuf_end) { v2 = FALSE; } else { - PEEK_CHAR(c, cptr, cbuf_end); + PEEK_CHAR(c, cptr, cbuf_end, cbuf_type); v2 = is_word_char(c); } if (v1 ^ v2 ^ (REOP_not_word_boundary - opcode)) @@ -2259,8 +2255,8 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture, while (cptr1 < cptr1_end) { if (cptr >= cbuf_end) goto no_match; - GET_CHAR(c1, cptr1, cptr1_end); - GET_CHAR(c2, cptr, cbuf_end); + GET_CHAR(c1, cptr1, cptr1_end, cbuf_type); + GET_CHAR(c2, cptr, cbuf_end, cbuf_type); if (s->ignore_case) { c1 = lre_canonicalize(c1, s->is_unicode); c2 = lre_canonicalize(c2, s->is_unicode); @@ -2273,8 +2269,8 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture, while (cptr1 > cptr1_start) { if (cptr == s->cbuf) goto no_match; - GET_PREV_CHAR(c1, cptr1, cptr1_start); - GET_PREV_CHAR(c2, cptr, s->cbuf); + GET_PREV_CHAR(c1, cptr1, cptr1_start, cbuf_type); + GET_PREV_CHAR(c2, cptr, s->cbuf, cbuf_type); if (s->ignore_case) { c1 = lre_canonicalize(c1, s->is_unicode); c2 = lre_canonicalize(c2, s->is_unicode); @@ -2294,7 +2290,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture, pc += 2; if (cptr >= cbuf_end) goto no_match; - GET_CHAR(c, cptr, cbuf_end); + GET_CHAR(c, cptr, cbuf_end, cbuf_type); if (s->ignore_case) { c = lre_canonicalize(c, s->is_unicode); } @@ -2334,7 +2330,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture, pc += 2; if (cptr >= cbuf_end) goto no_match; - GET_CHAR(c, cptr, cbuf_end); + GET_CHAR(c, cptr, cbuf_end, cbuf_type); if (s->ignore_case) { c = lre_canonicalize(c, s->is_unicode); } @@ -2366,7 +2362,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture, /* go to the previous char */ if (cptr == s->cbuf) goto no_match; - PREV_CHAR(cptr, s->cbuf); + PREV_CHAR(cptr, s->cbuf, cbuf_type); break; case REOP_simple_greedy_quant: { @@ -2425,7 +2421,7 @@ int lre_exec(uint8_t **capture, int re_flags, i, alloca_size, ret; StackInt *stack_buf; - re_flags = bc_buf[RE_HEADER_FLAGS]; + re_flags = lre_get_flags(bc_buf); s->multi_line = (re_flags & LRE_FLAG_MULTILINE) != 0; s->ignore_case = (re_flags & LRE_FLAG_IGNORECASE) != 0; s->is_unicode = (re_flags & LRE_FLAG_UNICODE) != 0; @@ -2472,8 +2468,8 @@ const char *lre_get_groupnames(const uint8_t *bc_buf) uint32_t re_bytecode_len; if ((lre_get_flags(bc_buf) & LRE_FLAG_NAMED_GROUPS) == 0) return NULL; - re_bytecode_len = get_u32(bc_buf + 3); - return (const char *)(bc_buf + 7 + re_bytecode_len); + re_bytecode_len = get_u32(bc_buf + RE_HEADER_BYTECODE_LEN); + return (const char *)(bc_buf + RE_HEADER_LEN + re_bytecode_len); } #ifdef TEST @@ -2490,25 +2486,26 @@ void *lre_realloc(void *opaque, void *ptr, size_t size) int main(int argc, char **argv) { - int len, ret, i; + int len, flags, ret, i; uint8_t *bc; char error_msg[64]; uint8_t *capture[CAPTURE_COUNT_MAX * 2]; const char *input; int input_len, capture_count; - if (argc < 3) { - printf("usage: %s regexp input\n", argv[0]); - exit(1); + if (argc < 4) { + printf("usage: %s regexp flags input\n", argv[0]); + return 1; } + flags = atoi(argv[2]); bc = lre_compile(&len, error_msg, sizeof(error_msg), argv[1], - strlen(argv[1]), 0, NULL); + strlen(argv[1]), flags, NULL); if (!bc) { fprintf(stderr, "error: %s\n", error_msg); exit(1); } - input = argv[2]; + input = argv[3]; input_len = strlen(input); ret = lre_exec(capture, bc, (uint8_t *)input, 0, input_len, 0, NULL); diff --git a/quickjs.c b/quickjs.c index c978e32..678df93 100644 --- a/quickjs.c +++ b/quickjs.c @@ -3685,10 +3685,9 @@ static int string_buffer_putc(StringBuffer *s, uint32_t c) { if (unlikely(c >= 0x10000)) { /* surrogate pair */ - c -= 0x10000; - if (string_buffer_putc16(s, (c >> 10) + 0xd800)) + if (string_buffer_putc16(s, get_hi_surrogate(c))) return -1; - c = (c & 0x3ff) + 0xdc00; + c = get_lo_surrogate(c); } return string_buffer_putc16(s, c); } @@ -3699,10 +3698,10 @@ static int string_getc(const JSString *p, int *pidx) idx = *pidx; if (p->is_wide_char) { c = p->u.str16[idx++]; - if (c >= 0xd800 && c < 0xdc00 && idx < p->len) { + if (is_hi_surrogate(c) && idx < p->len) { c1 = p->u.str16[idx]; - if (c1 >= 0xdc00 && c1 < 0xe000) { - c = (((c & 0x3ff) << 10) | (c1 & 0x3ff)) + 0x10000; + if (is_lo_surrogate(c1)) { + c = from_surrogate(c, c1); idx++; } } @@ -3900,9 +3899,8 @@ JSValue JS_NewStringLen(JSContext *ctx, const char *buf, size_t buf_len) } else if (c <= 0x10FFFF) { p = p_next; /* surrogate pair */ - c -= 0x10000; - string_buffer_putc16(b, (c >> 10) + 0xd800); - c = (c & 0x3ff) + 0xdc00; + string_buffer_putc16(b, get_hi_surrogate(c)); + c = get_lo_surrogate(c); } else { /* invalid char */ c = 0xfffd; @@ -4040,13 +4038,12 @@ const char *JS_ToCStringLen2(JSContext *ctx, size_t *plen, JSValueConst val1, BO if (c < 0x80) { *q++ = c; } else { - if (c >= 0xd800 && c < 0xdc00) { + if (is_hi_surrogate(c)) { if (pos < len && !cesu8) { c1 = src[pos]; - if (c1 >= 0xdc00 && c1 < 0xe000) { + if (is_lo_surrogate(c1)) { pos++; - /* surrogate pair */ - c = (((c & 0x3ff) << 10) | (c1 & 0x3ff)) + 0x10000; + c = from_surrogate(c, c1); } else { /* Keep unmatched surrogate code points */ /* c = 0xfffd; */ /* error */ @@ -11729,7 +11726,7 @@ static JSValue JS_ToQuotedString(JSContext *ctx, JSValueConst val1) goto fail; break; default: - if (c < 32 || (c >= 0xd800 && c < 0xe000)) { + if (c < 32 || is_surrogate(c)) { snprintf(buf, sizeof(buf), "\\u%04x", c); if (string_buffer_puts8(b, buf)) goto fail; @@ -41583,18 +41580,18 @@ static int64_t string_advance_index(JSString *p, int64_t index, BOOL unicode) -1 if none */ static int js_string_find_invalid_codepoint(JSString *p) { - int i, c; + int i; if (!p->is_wide_char) return -1; for(i = 0; i < p->len; i++) { - c = p->u.str16[i]; - if (c >= 0xD800 && c <= 0xDFFF) { - if (c >= 0xDC00 || (i + 1) >= p->len) + uint32_t c = p->u.str16[i]; + if (is_surrogate(c)) { + if (is_hi_surrogate(c) && (i + 1) < p->len + && is_lo_surrogate(p->u.str16[i + 1])) { + i++; + } else { return i; - c = p->u.str16[i + 1]; - if (c < 0xDC00 || c > 0xDFFF) - return i; - i++; + } } } return -1; @@ -41621,7 +41618,7 @@ static JSValue js_string_toWellFormed(JSContext *ctx, JSValueConst this_val, { JSValue str, ret; JSString *p; - int c, i; + int i; str = JS_ToStringCheckObject(ctx, this_val); if (JS_IsException(str)) @@ -41640,17 +41637,13 @@ static JSValue js_string_toWellFormed(JSContext *ctx, JSValueConst this_val, p = JS_VALUE_GET_STRING(ret); for (; i < p->len; i++) { - c = p->u.str16[i]; - if (c >= 0xD800 && c <= 0xDFFF) { - if (c >= 0xDC00 || (i + 1) >= p->len) { - p->u.str16[i] = 0xFFFD; + uint32_t c = p->u.str16[i]; + if (is_surrogate(c)) { + if (is_hi_surrogate(c) && (i + 1) < p->len + && is_lo_surrogate(p->u.str16[i + 1])) { + i++; } else { - c = p->u.str16[i + 1]; - if (c < 0xDC00 || c > 0xDFFF) { - p->u.str16[i] = 0xFFFD; - } else { - i++; - } + p->u.str16[i] = 0xFFFD; } } } @@ -42427,10 +42420,10 @@ static int string_prevc(JSString *p, int *pidx) idx--; if (p->is_wide_char) { c = p->u.str16[idx]; - if (c >= 0xdc00 && c < 0xe000 && idx > 0) { + if (is_lo_surrogate(c) && idx > 0) { c1 = p->u.str16[idx - 1]; - if (c1 >= 0xd800 && c1 <= 0xdc00) { - c = (((c1 & 0x3ff) << 10) | (c & 0x3ff)) + 0x10000; + if (is_hi_surrogate(c1)) { + c = from_surrogate(c1, c); idx--; } } @@ -49114,8 +49107,7 @@ static JSValue js_global_decodeURI(JSContext *ctx, JSValueConst this_val, } c = (c << 6) | (c1 & 0x3f); } - if (c < c_min || c > 0x10FFFF || - (c >= 0xd800 && c < 0xe000)) { + if (c < c_min || c > 0x10FFFF || is_surrogate(c)) { js_throw_URIError(ctx, "malformed UTF-8"); goto fail; } @@ -49190,21 +49182,21 @@ static JSValue js_global_encodeURI(JSContext *ctx, JSValueConst this_val, if (isURIUnescaped(c, isComponent)) { string_buffer_putc16(b, c); } else { - if (c >= 0xdc00 && c <= 0xdfff) { + if (is_lo_surrogate(c)) { js_throw_URIError(ctx, "invalid character"); goto fail; - } else if (c >= 0xd800 && c <= 0xdbff) { + } else if (is_hi_surrogate(c)) { if (k >= p->len) { js_throw_URIError(ctx, "expecting surrogate pair"); goto fail; } c1 = string_get(p, k); k++; - if (c1 < 0xdc00 || c1 > 0xdfff) { + if (!is_lo_surrogate(c1)) { js_throw_URIError(ctx, "expecting surrogate pair"); goto fail; } - c = (((c & 0x3ff) << 10) | (c1 & 0x3ff)) + 0x10000; + c = from_surrogate(c, c1); } if (c < 0x80) { encodeURI_hex(b, c);