From 8d932deb4947d74db9d349648c73c419087a8cea Mon Sep 17 00:00:00 2001
From: Charlie Gordon <github@chqrlie.org>
Date: Mon, 19 Feb 2024 16:30:08 +0100
Subject: [PATCH] Rename regex flag and field utf16 -> unicode

- rename is_utf16 structure member to is_unicode
- rename flag LRE_FLAG_UTF16 to LRE_FLAG_UNICODE
- tidy whitespace (macro line continuations, spacing, blank lines);
  no functional change
---
 libregexp.c | 118 ++++++++++++++++++++++++++--------------------------
 libregexp.h |   3 +-
 quickjs.c   |   8 ++--
 3 files changed, 65 insertions(+), 64 deletions(-)

diff --git a/libregexp.c b/libregexp.c
index 982d171..f91c6e6 100644
--- a/libregexp.c
+++ b/libregexp.c
@@ -66,7 +66,7 @@ typedef struct {
     const uint8_t *buf_end;
     const uint8_t *buf_start;
     int re_flags;
-    BOOL is_utf16;
+    BOOL is_unicode;
     BOOL ignore_case;
     BOOL dotall;
     int capture_count;
@@ -224,7 +224,7 @@ static __maybe_unused void lre_dump_bytecode(const uint8_t *buf,
 
     assert(buf_len >= RE_HEADER_LEN);
 
-    re_flags=  buf[0];
+    re_flags = buf[0];
     bc_len = get_u32(buf + 3);
     assert(bc_len + RE_HEADER_LEN <= buf_len);
     printf("flags: 0x%x capture_count=%d stack_size=%d\n",
@@ -696,10 +696,10 @@ static int get_class_atom(REParseState *s, CharRange *cr,
             if ((c >= 'a' && c <= 'z') ||
                 (c >= 'A' && c <= 'Z') ||
                 (((c >= '0' && c <= '9') || c == '_') &&
-                 inclass && !s->is_utf16)) {   /* Annex B.1.4 */
+                 inclass && !s->is_unicode)) {   /* Annex B.1.4 */
                 c &= 0x1f;
                 p++;
-            } else if (s->is_utf16) {
+            } else if (s->is_unicode) {
                 goto invalid_escape;
             } else {
                 /* otherwise return '\' and 'c' */
@@ -710,7 +710,7 @@ static int get_class_atom(REParseState *s, CharRange *cr,
 #ifdef CONFIG_ALL_UNICODE
         case 'p':
         case 'P':
-            if (s->is_utf16) {
+            if (s->is_unicode) {
                 if (parse_unicode_property(s, cr, &p, (c == 'P')))
                     return -1;
                 c = CLASS_RANGE_BASE;
@@ -720,14 +720,14 @@ static int get_class_atom(REParseState *s, CharRange *cr,
 #endif
         default:
             p--;
-            ret = lre_parse_escape(&p, s->is_utf16 * 2);
+            ret = lre_parse_escape(&p, s->is_unicode * 2);
             if (ret >= 0) {
                 c = ret;
             } else {
                 if (ret == -2 && *p != '\0' && strchr("^$\\.*+?()[]{}|/", *p)) {
                     /* always valid to escape these characters */
                     goto normal_char;
-                } else if (s->is_utf16) {
+                } else if (s->is_unicode) {
                 invalid_escape:
                     return re_parse_error(s, "invalid escape sequence in regular expression");
                 } else {
@@ -749,7 +749,7 @@ static int get_class_atom(REParseState *s, CharRange *cr,
         /* normal char */
         if (c >= 128) {
             c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
-            if ((unsigned)c > 0xffff && !s->is_utf16) {
+            if ((unsigned)c > 0xffff && !s->is_unicode) {
                 /* XXX: should handle non BMP-1 code points */
                 return re_parse_error(s, "malformed unicode char");
             }
@@ -811,11 +811,13 @@ static int re_parse_char_class(REParseState *s, const uint8_t **pp)
     cr_init(cr, s->opaque, lre_realloc);
     p = *pp;
     p++;    /* skip '[' */
+
     invert = FALSE;
     if (*p == '^') {
         p++;
         invert = TRUE;
     }
+
     for(;;) {
         if (*p == ']')
             break;
@@ -825,7 +827,7 @@ static int re_parse_char_class(REParseState *s, const uint8_t **pp)
         if (*p == '-' && p[1] != ']') {
             const uint8_t *p0 = p + 1;
             if (c1 >= CLASS_RANGE_BASE) {
-                if (s->is_utf16) {
+                if (s->is_unicode) {
                     cr_free(cr1);
                     goto invalid_class_range;
                 }
@@ -837,7 +839,7 @@ static int re_parse_char_class(REParseState *s, const uint8_t **pp)
                 goto fail;
             if (c2 >= CLASS_RANGE_BASE) {
                 cr_free(cr1);
-                if (s->is_utf16) {
+                if (s->is_unicode) {
                     goto invalid_class_range;
                 }
                 /* Annex B: match '-' character */
@@ -866,7 +868,7 @@ static int re_parse_char_class(REParseState *s, const uint8_t **pp)
         }
     }
     if (s->ignore_case) {
-        if (cr_regexp_canonicalize(cr, s->is_utf16))
+        if (cr_regexp_canonicalize(cr, s->is_unicode))
             goto memory_error;
     }
     if (invert) {
@@ -1161,7 +1163,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
             re_emit_op(s, REOP_prev);
         break;
     case '{':
-        if (s->is_utf16) {
+        if (s->is_unicode) {
             return re_parse_error(s, "syntax error");
         } else if (!is_digit(p[1])) {
             /* Annex B: we accept '{' not followed by digits as a
@@ -1213,7 +1215,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
             lookahead:
                 /* Annex B allows lookahead to be used as an atom for
                    the quantifiers */
-                if (!s->is_utf16 && !is_backward_lookahead)  {
+                if (!s->is_unicode && !is_backward_lookahead)  {
                     last_atom_start = s->byte_code.size;
                     last_capture_count = s->capture_count;
                 }
@@ -1289,7 +1291,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
                     /* annex B: we tolerate invalid group names in non
                        unicode mode if there is no named capture
                        definition */
-                    if (s->is_utf16 || re_has_named_captures(s))
+                    if (s->is_unicode || re_has_named_captures(s))
                         return re_parse_error(s, "expecting group name");
                     else
                         goto parse_class_atom;
@@ -1297,7 +1299,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
                 p1 += 3;
                 if (re_parse_group_name(s->u.tmp_buf, sizeof(s->u.tmp_buf),
                                         &p1)) {
-                    if (s->is_utf16 || re_has_named_captures(s))
+                    if (s->is_unicode || re_has_named_captures(s))
                         return re_parse_error(s, "invalid group name");
                     else
                         goto parse_class_atom;
@@ -1308,7 +1310,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
                        after (inefficient, but hopefully not common */
                     c = re_parse_captures(s, &dummy_res, s->u.tmp_buf);
                     if (c < 0) {
-                        if (s->is_utf16 || re_has_named_captures(s))
+                        if (s->is_unicode || re_has_named_captures(s))
                             return re_parse_error(s, "group name not defined");
                         else
                             goto parse_class_atom;
@@ -1320,7 +1322,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
         case '0':
             p += 2;
             c = 0;
-            if (s->is_utf16) {
+            if (s->is_unicode) {
                 if (is_digit(*p)) {
                     return re_parse_error(s, "invalid decimal escape in regular expression");
                 }
@@ -1342,7 +1344,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
 
                 c = parse_digits(&p, FALSE);
                 if (c < 0 || (c >= s->capture_count && c >= re_count_captures(s))) {
-                    if (!s->is_utf16) {
+                    if (!s->is_unicode) {
                         /* Annex B.1.4: accept legacy octal */
                         p = q;
                         if (*p <= '7') {
@@ -1384,7 +1386,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
         break;
     case ']':
     case '}':
-        if (s->is_utf16)
+        if (s->is_unicode)
             return re_parse_error(s, "syntax error");
         goto parse_class_atom;
     default:
@@ -1406,7 +1408,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
                 return -1;
         } else {
             if (s->ignore_case)
-                c = lre_canonicalize(c, s->is_utf16);
+                c = lre_canonicalize(c, s->is_unicode);
             if (c <= 0xffff)
                 re_emit_op_u16(s, REOP_char, c);
             else
@@ -1442,7 +1444,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
                 /* As an extension (see ES6 annex B), we accept '{' not
                    followed by digits as a normal atom */
                 if (!is_digit(p[1])) {
-                    if (s->is_utf16)
+                    if (s->is_unicode)
                         goto invalid_quant_count;
                     break;
                 }
@@ -1461,7 +1463,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
                         quant_max = INT32_MAX; /* infinity */
                     }
                 }
-                if (*p != '}' && !s->is_utf16) {
+                if (*p != '}' && !s->is_unicode) {
                     /* Annex B: normal atom if invalid '{' syntax */
                     p = p1;
                     break;
@@ -1753,7 +1755,7 @@ uint8_t *lre_compile(int *plen, char *error_msg, int error_msg_size,
     s->buf_end = s->buf_ptr + buf_len;
     s->buf_start = s->buf_ptr;
     s->re_flags = re_flags;
-    s->is_utf16 = ((re_flags & LRE_FLAG_UTF16) != 0);
+    s->is_unicode = ((re_flags & LRE_FLAG_UNICODE) != 0);
     is_sticky = ((re_flags & LRE_FLAG_STICKY) != 0);
     s->ignore_case = ((re_flags & LRE_FLAG_IGNORECASE) != 0);
     s->dotall = ((re_flags & LRE_FLAG_DOTALL) != 0);
@@ -1861,11 +1863,11 @@ static BOOL is_word_char(uint32_t c)
         }                                                               \
     } while (0)
 
-#define PEEK_CHAR(c, cptr, cbuf_end)             \
-    do {                                         \
-        if (cbuf_type == 0) {                    \
-            c = cptr[0];                         \
-        } else {                                 \
+#define PEEK_CHAR(c, cptr, cbuf_end)                                    \
+    do {                                                                \
+        if (cbuf_type == 0) {                                           \
+            c = cptr[0];                                                \
+        } else {                                                        \
             uint32_t __c1;                                              \
             c = ((uint16_t *)cptr)[0];                                  \
             if (c >= 0xd800 && c < 0xdc00 &&                            \
@@ -1875,18 +1877,18 @@ static BOOL is_word_char(uint32_t c)
                     c = (((c & 0x3ff) << 10) | (__c1 & 0x3ff)) + 0x10000; \
                 }                                                       \
             }                                                           \
-        }                                        \
+        }                                                               \
     } while (0)
 
-#define PEEK_PREV_CHAR(c, cptr, cbuf_start)                 \
-    do {                                         \
-        if (cbuf_type == 0) {                    \
-            c = cptr[-1];                        \
-        } else {                                 \
+#define PEEK_PREV_CHAR(c, cptr, cbuf_start)                             \
+    do {                                                                \
+        if (cbuf_type == 0) {                                           \
+            c = cptr[-1];                                               \
+        } else {                                                        \
             uint32_t __c1;                                              \
             c = ((uint16_t *)cptr)[-1];                                 \
             if (c >= 0xdc00 && c < 0xe000 &&                            \
-                cbuf_type == 2 && (cptr - 4) >= cbuf_start) {              \
+                cbuf_type == 2 && (cptr - 4) >= cbuf_start) {           \
                 __c1 = ((uint16_t *)cptr)[-2];                          \
                 if (__c1 >= 0xd800 && __c1 < 0xdc00 ) {                 \
                     c = (((__c1 & 0x3ff) << 10) | (c & 0x3ff)) + 0x10000; \
@@ -1895,15 +1897,15 @@ static BOOL is_word_char(uint32_t c)
         }                                                               \
     } while (0)
 
-#define GET_PREV_CHAR(c, cptr, cbuf_start)       \
-    do {                                         \
-        if (cbuf_type == 0) {                    \
-            cptr--;                              \
-            c = cptr[0];                         \
-        } else {                                 \
+#define GET_PREV_CHAR(c, cptr, cbuf_start)                              \
+    do {                                                                \
+        if (cbuf_type == 0) {                                           \
+            cptr--;                                                     \
+            c = cptr[0];                                                \
+        } else {                                                        \
             uint32_t __c1;                                              \
             cptr -= 2;                                                  \
-            c = ((uint16_t *)cptr)[0];                                 \
+            c = ((uint16_t *)cptr)[0];                                  \
             if (c >= 0xdc00 && c < 0xe000 &&                            \
                 cbuf_type == 2 && cptr > cbuf_start) {                  \
                 __c1 = ((uint16_t *)cptr)[-1];                          \
@@ -1915,12 +1917,12 @@ static BOOL is_word_char(uint32_t c)
         }                                                               \
     } while (0)
 
-#define PREV_CHAR(cptr, cbuf_start)       \
-    do {                                  \
-        if (cbuf_type == 0) {             \
-            cptr--;                       \
-        } else {                          \
-            cptr -= 2;                          \
+#define PREV_CHAR(cptr, cbuf_start)                                     \
+    do {                                                                \
+        if (cbuf_type == 0) {                                           \
+            cptr--;                                                     \
+        } else {                                                        \
+            cptr -= 2;                                                  \
             if (cbuf_type == 2) {                                       \
                 c = ((uint16_t *)cptr)[0];                              \
                 if (c >= 0xdc00 && c < 0xe000 && cptr > cbuf_start) {   \
@@ -1959,7 +1961,7 @@ typedef struct {
     int stack_size_max;
     BOOL multi_line;
     BOOL ignore_case;
-    BOOL is_utf16;
+    BOOL is_unicode;
     void *opaque; /* used for stack overflow check */
 
     size_t state_size;
@@ -2105,7 +2107,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
                 goto no_match;
             GET_CHAR(c, cptr, cbuf_end);
             if (s->ignore_case) {
-                c = lre_canonicalize(c, s->is_utf16);
+                c = lre_canonicalize(c, s->is_unicode);
             }
             if (val != c)
                 goto no_match;
@@ -2260,8 +2262,8 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
                         GET_CHAR(c1, cptr1, cptr1_end);
                         GET_CHAR(c2, cptr, cbuf_end);
                         if (s->ignore_case) {
-                            c1 = lre_canonicalize(c1, s->is_utf16);
-                            c2 = lre_canonicalize(c2, s->is_utf16);
+                            c1 = lre_canonicalize(c1, s->is_unicode);
+                            c2 = lre_canonicalize(c2, s->is_unicode);
                         }
                         if (c1 != c2)
                             goto no_match;
@@ -2274,8 +2276,8 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
                         GET_PREV_CHAR(c1, cptr1, cptr1_start);
                         GET_PREV_CHAR(c2, cptr, s->cbuf);
                         if (s->ignore_case) {
-                            c1 = lre_canonicalize(c1, s->is_utf16);
-                            c2 = lre_canonicalize(c2, s->is_utf16);
+                            c1 = lre_canonicalize(c1, s->is_unicode);
+                            c2 = lre_canonicalize(c2, s->is_unicode);
                         }
                         if (c1 != c2)
                             goto no_match;
@@ -2294,7 +2296,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
                     goto no_match;
                 GET_CHAR(c, cptr, cbuf_end);
                 if (s->ignore_case) {
-                    c = lre_canonicalize(c, s->is_utf16);
+                    c = lre_canonicalize(c, s->is_unicode);
                 }
                 idx_min = 0;
                 low = get_u16(pc + 0 * 4);
@@ -2334,7 +2336,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
                     goto no_match;
                 GET_CHAR(c, cptr, cbuf_end);
                 if (s->ignore_case) {
-                    c = lre_canonicalize(c, s->is_utf16);
+                    c = lre_canonicalize(c, s->is_unicode);
                 }
                 idx_min = 0;
                 low = get_u32(pc + 0 * 8);
@@ -2426,13 +2428,13 @@ int lre_exec(uint8_t **capture,
     re_flags = bc_buf[RE_HEADER_FLAGS];
     s->multi_line = (re_flags & LRE_FLAG_MULTILINE) != 0;
     s->ignore_case = (re_flags & LRE_FLAG_IGNORECASE) != 0;
-    s->is_utf16 = (re_flags & LRE_FLAG_UTF16) != 0;
+    s->is_unicode = (re_flags & LRE_FLAG_UNICODE) != 0;
     s->capture_count = bc_buf[RE_HEADER_CAPTURE_COUNT];
     s->stack_size_max = bc_buf[RE_HEADER_STACK_SIZE];
     s->cbuf = cbuf;
     s->cbuf_end = cbuf + (clen << cbuf_type);
     s->cbuf_type = cbuf_type;
-    if (s->cbuf_type == 1 && s->is_utf16)
+    if (s->cbuf_type == 1 && s->is_unicode)
         s->cbuf_type = 2;
     s->opaque = opaque;
 
diff --git a/libregexp.h b/libregexp.h
index 7c03b1a..757b277 100644
--- a/libregexp.h
+++ b/libregexp.h
@@ -34,10 +34,9 @@
 #define LRE_FLAG_IGNORECASE (1 << 1)
 #define LRE_FLAG_MULTILINE  (1 << 2)
 #define LRE_FLAG_DOTALL     (1 << 3)
-#define LRE_FLAG_UTF16      (1 << 4)
+#define LRE_FLAG_UNICODE    (1 << 4)
 #define LRE_FLAG_STICKY     (1 << 5)
 #define LRE_FLAG_INDICES    (1 << 6) /* Unused by libregexp, just recorded. */
-
 #define LRE_FLAG_NAMED_GROUPS (1 << 7) /* named groups are present in the regexp */
 
 uint8_t *lre_compile(int *plen, char *error_msg, int error_msg_size,
diff --git a/quickjs.c b/quickjs.c
index c820b0a..c978e32 100644
--- a/quickjs.c
+++ b/quickjs.c
@@ -43355,7 +43355,7 @@ static JSValue js_compile_regexp(JSContext *ctx, JSValueConst pattern,
                 mask = LRE_FLAG_DOTALL;
                 break;
             case 'u':
-                mask = LRE_FLAG_UTF16;
+                mask = LRE_FLAG_UNICODE;
                 break;
             case 'y':
                 mask = LRE_FLAG_STICKY;
@@ -43373,7 +43373,7 @@ static JSValue js_compile_regexp(JSContext *ctx, JSValueConst pattern,
         JS_FreeCString(ctx, str);
     }
 
-    str = JS_ToCStringLen2(ctx, &len, pattern, !(re_flags & LRE_FLAG_UTF16));
+    str = JS_ToCStringLen2(ctx, &len, pattern, !(re_flags & LRE_FLAG_UNICODE));
     if (!str)
         return JS_EXCEPTION;
     re_bytecode_buf = lre_compile(&re_bytecode_len, error_msg,
@@ -44040,7 +44040,7 @@ static JSValue JS_RegExpDelete(JSContext *ctx, JSValueConst this_val, JSValueCon
             break;
         }
         if (end == start) {
-            if (!(re_flags & LRE_FLAG_UTF16) || (unsigned)end >= str->len || !str->is_wide_char) {
+            if (!(re_flags & LRE_FLAG_UNICODE) || (unsigned)end >= str->len || !str->is_wide_char) {
                 end++;
             } else {
                 string_getc(str, &end);
@@ -44815,7 +44815,7 @@ static const JSCFunctionListEntry js_regexp_proto_funcs[] = {
     JS_CGETSET_MAGIC_DEF("ignoreCase", js_regexp_get_flag, NULL, LRE_FLAG_IGNORECASE ),
     JS_CGETSET_MAGIC_DEF("multiline", js_regexp_get_flag, NULL, LRE_FLAG_MULTILINE ),
     JS_CGETSET_MAGIC_DEF("dotAll", js_regexp_get_flag, NULL, LRE_FLAG_DOTALL ),
-    JS_CGETSET_MAGIC_DEF("unicode", js_regexp_get_flag, NULL, LRE_FLAG_UTF16 ),
+    JS_CGETSET_MAGIC_DEF("unicode", js_regexp_get_flag, NULL, LRE_FLAG_UNICODE ),
     JS_CGETSET_MAGIC_DEF("sticky", js_regexp_get_flag, NULL, LRE_FLAG_STICKY ),
     JS_CGETSET_MAGIC_DEF("hasIndices", js_regexp_get_flag, NULL, LRE_FLAG_INDICES ),
     JS_CFUNC_DEF("exec", 1, js_regexp_exec ),