mirror of
https://github.com/bellard/quickjs.git
synced 2025-05-29 01:49:18 +08:00
regexp: added v flag support - fixed corner cases of case insensitive matching
This commit is contained in:
parent
a8b2d7c2b2
commit
d7cdfdc8d7
830
libregexp.c
830
libregexp.c
File diff suppressed because it is too large
Load Diff
@ -35,6 +35,7 @@
|
||||
#define LRE_FLAG_STICKY (1 << 5)
|
||||
#define LRE_FLAG_INDICES (1 << 6) /* Unused by libregexp, just recorded. */
|
||||
#define LRE_FLAG_NAMED_GROUPS (1 << 7) /* named groups are present in the regexp */
|
||||
#define LRE_FLAG_UNICODE_SETS (1 << 8)
|
||||
|
||||
#define LRE_RET_MEMORY_ERROR (-1)
|
||||
#define LRE_RET_TIMEOUT (-2)
|
||||
|
@ -4054,6 +4054,89 @@ static const uint8_t unicode_prop_Changes_When_NFKC_Casefolded1_table[450] = {
|
||||
0x4f, 0xff,
|
||||
};
|
||||
|
||||
static const uint8_t unicode_prop_Basic_Emoji1_table[143] = {
|
||||
0x60, 0x23, 0x19, 0x81, 0x40, 0xcc, 0x1a, 0x01,
|
||||
0x80, 0x42, 0x08, 0x81, 0x94, 0x81, 0xb1, 0x8b,
|
||||
0xaa, 0x80, 0x92, 0x80, 0x8c, 0x07, 0x81, 0x90,
|
||||
0x0c, 0x0f, 0x04, 0x80, 0x94, 0x06, 0x08, 0x03,
|
||||
0x01, 0x06, 0x03, 0x81, 0x9b, 0x80, 0xa2, 0x00,
|
||||
0x03, 0x10, 0x80, 0xbc, 0x82, 0x97, 0x80, 0x8d,
|
||||
0x80, 0x43, 0x5a, 0x81, 0xb2, 0x03, 0x80, 0x61,
|
||||
0xc4, 0xad, 0x80, 0x40, 0xc9, 0x80, 0x40, 0xbd,
|
||||
0x01, 0x89, 0xe5, 0x80, 0x97, 0x80, 0x93, 0x01,
|
||||
0x20, 0x82, 0x94, 0x81, 0x40, 0xad, 0xa0, 0x8b,
|
||||
0x88, 0x80, 0xc5, 0x80, 0x95, 0x8b, 0xaa, 0x1c,
|
||||
0x8b, 0x90, 0x10, 0x82, 0xc6, 0x00, 0x80, 0x40,
|
||||
0xba, 0x81, 0xbe, 0x8c, 0x18, 0x97, 0x91, 0x80,
|
||||
0x99, 0x81, 0x8c, 0x80, 0xd5, 0xd4, 0xaf, 0xc5,
|
||||
0x28, 0x12, 0x0a, 0x1b, 0x8a, 0x0e, 0x88, 0x40,
|
||||
0xe2, 0x8b, 0x18, 0x41, 0x1a, 0xae, 0x80, 0x89,
|
||||
0x80, 0x40, 0xb8, 0xef, 0x8c, 0x82, 0x89, 0x84,
|
||||
0xb7, 0x86, 0x8e, 0x81, 0x8a, 0x85, 0x88,
|
||||
};
|
||||
|
||||
static const uint8_t unicode_prop_Basic_Emoji2_table[183] = {
|
||||
0x40, 0xa8, 0x03, 0x80, 0x5f, 0x8c, 0x80, 0x8b,
|
||||
0x80, 0x40, 0xd7, 0x80, 0x95, 0x80, 0xd9, 0x85,
|
||||
0x8e, 0x81, 0x41, 0x7c, 0x80, 0x40, 0xa5, 0x80,
|
||||
0x9c, 0x10, 0x0c, 0x82, 0x40, 0xc6, 0x80, 0x40,
|
||||
0xe6, 0x81, 0x89, 0x80, 0x88, 0x80, 0xb9, 0x0a,
|
||||
0x84, 0x88, 0x01, 0x05, 0x03, 0x01, 0x00, 0x09,
|
||||
0x02, 0x02, 0x0f, 0x14, 0x00, 0x80, 0x9b, 0x09,
|
||||
0x00, 0x08, 0x80, 0x91, 0x01, 0x80, 0x92, 0x00,
|
||||
0x18, 0x00, 0x0a, 0x05, 0x07, 0x81, 0x95, 0x05,
|
||||
0x00, 0x00, 0x80, 0x94, 0x05, 0x09, 0x01, 0x17,
|
||||
0x04, 0x09, 0x08, 0x01, 0x00, 0x00, 0x05, 0x02,
|
||||
0x80, 0x90, 0x81, 0x8e, 0x01, 0x80, 0x9a, 0x81,
|
||||
0xbb, 0x80, 0x41, 0x91, 0x81, 0x41, 0xce, 0x82,
|
||||
0x45, 0x27, 0x80, 0x8b, 0x80, 0x42, 0x58, 0x00,
|
||||
0x80, 0x61, 0xbe, 0xd5, 0x81, 0x8b, 0x81, 0x40,
|
||||
0x81, 0x80, 0xb3, 0x80, 0x40, 0xe8, 0x01, 0x88,
|
||||
0x88, 0x80, 0xc5, 0x80, 0x97, 0x08, 0x11, 0x81,
|
||||
0xaa, 0x1c, 0x8b, 0x92, 0x00, 0x00, 0x80, 0xc6,
|
||||
0x00, 0x80, 0x40, 0xba, 0x80, 0xca, 0x81, 0xa3,
|
||||
0x09, 0x86, 0x8c, 0x01, 0x19, 0x80, 0x93, 0x01,
|
||||
0x07, 0x81, 0x88, 0x04, 0x82, 0x8b, 0x17, 0x11,
|
||||
0x00, 0x03, 0x05, 0x02, 0x05, 0x80, 0x40, 0xcf,
|
||||
0x00, 0x82, 0x8f, 0x2a, 0x05, 0x01, 0x80,
|
||||
};
|
||||
|
||||
static const uint8_t unicode_prop_RGI_Emoji_Modifier_Sequence_table[73] = {
|
||||
0x60, 0x26, 0x1c, 0x80, 0x40, 0xda, 0x80, 0x8f,
|
||||
0x83, 0x61, 0xcc, 0x76, 0x80, 0xbb, 0x11, 0x01,
|
||||
0x82, 0xf4, 0x09, 0x8a, 0x94, 0x18, 0x18, 0x88,
|
||||
0x10, 0x1a, 0x02, 0x30, 0x00, 0x97, 0x80, 0x40,
|
||||
0xc8, 0x0b, 0x80, 0x94, 0x03, 0x81, 0x40, 0xad,
|
||||
0x12, 0x84, 0xd2, 0x80, 0x8f, 0x82, 0x88, 0x80,
|
||||
0x8a, 0x80, 0x42, 0x3e, 0x01, 0x07, 0x3d, 0x80,
|
||||
0x88, 0x89, 0x11, 0xb7, 0x80, 0xbc, 0x08, 0x08,
|
||||
0x80, 0x90, 0x10, 0x8c, 0x40, 0xe4, 0x82, 0xa9,
|
||||
0x88,
|
||||
};
|
||||
|
||||
static const uint8_t unicode_prop_RGI_Emoji_Flag_Sequence_table[128] = {
|
||||
0x0c, 0x00, 0x09, 0x00, 0x04, 0x01, 0x02, 0x06,
|
||||
0x03, 0x03, 0x01, 0x02, 0x01, 0x03, 0x07, 0x0d,
|
||||
0x18, 0x00, 0x09, 0x00, 0x00, 0x89, 0x08, 0x00,
|
||||
0x00, 0x81, 0x88, 0x83, 0x8c, 0x10, 0x00, 0x01,
|
||||
0x07, 0x08, 0x29, 0x10, 0x28, 0x00, 0x80, 0x8a,
|
||||
0x00, 0x0a, 0x00, 0x0e, 0x15, 0x18, 0x83, 0x89,
|
||||
0x06, 0x00, 0x81, 0x8d, 0x00, 0x12, 0x08, 0x00,
|
||||
0x03, 0x00, 0x24, 0x00, 0x05, 0x21, 0x00, 0x00,
|
||||
0x29, 0x90, 0x00, 0x02, 0x00, 0x08, 0x09, 0x00,
|
||||
0x08, 0x18, 0x8b, 0x80, 0x8c, 0x02, 0x19, 0x1a,
|
||||
0x11, 0x00, 0x00, 0x80, 0x9c, 0x80, 0x88, 0x02,
|
||||
0x00, 0x00, 0x02, 0x20, 0x88, 0x0a, 0x00, 0x03,
|
||||
0x01, 0x02, 0x05, 0x08, 0x00, 0x01, 0x09, 0x20,
|
||||
0x21, 0x18, 0x22, 0x00, 0x00, 0x00, 0x00, 0x18,
|
||||
0x28, 0x89, 0x80, 0x8b, 0x80, 0x90, 0x80, 0x92,
|
||||
0x80, 0x8d, 0x05, 0x80, 0x8a, 0x80, 0x88, 0x80,
|
||||
};
|
||||
|
||||
static const uint8_t unicode_prop_Emoji_Keycap_Sequence_table[4] = {
|
||||
0xa2, 0x05, 0x04, 0x89,
|
||||
};
|
||||
|
||||
static const uint8_t unicode_prop_ASCII_Hex_Digit_table[5] = {
|
||||
0xaf, 0x89, 0x35, 0x99, 0x85,
|
||||
};
|
||||
@ -4493,6 +4576,11 @@ typedef enum {
|
||||
UNICODE_PROP_Changes_When_Titlecased1,
|
||||
UNICODE_PROP_Changes_When_Casefolded1,
|
||||
UNICODE_PROP_Changes_When_NFKC_Casefolded1,
|
||||
UNICODE_PROP_Basic_Emoji1,
|
||||
UNICODE_PROP_Basic_Emoji2,
|
||||
UNICODE_PROP_RGI_Emoji_Modifier_Sequence,
|
||||
UNICODE_PROP_RGI_Emoji_Flag_Sequence,
|
||||
UNICODE_PROP_Emoji_Keycap_Sequence,
|
||||
UNICODE_PROP_ASCII_Hex_Digit,
|
||||
UNICODE_PROP_Bidi_Control,
|
||||
UNICODE_PROP_Dash,
|
||||
@ -4633,6 +4721,11 @@ static const uint8_t * const unicode_prop_table[] = {
|
||||
unicode_prop_Changes_When_Titlecased1_table,
|
||||
unicode_prop_Changes_When_Casefolded1_table,
|
||||
unicode_prop_Changes_When_NFKC_Casefolded1_table,
|
||||
unicode_prop_Basic_Emoji1_table,
|
||||
unicode_prop_Basic_Emoji2_table,
|
||||
unicode_prop_RGI_Emoji_Modifier_Sequence_table,
|
||||
unicode_prop_RGI_Emoji_Flag_Sequence_table,
|
||||
unicode_prop_Emoji_Keycap_Sequence_table,
|
||||
unicode_prop_ASCII_Hex_Digit_table,
|
||||
unicode_prop_Bidi_Control_table,
|
||||
unicode_prop_Dash_table,
|
||||
@ -4688,6 +4781,11 @@ static const uint16_t unicode_prop_len_table[] = {
|
||||
countof(unicode_prop_Changes_When_Titlecased1_table),
|
||||
countof(unicode_prop_Changes_When_Casefolded1_table),
|
||||
countof(unicode_prop_Changes_When_NFKC_Casefolded1_table),
|
||||
countof(unicode_prop_Basic_Emoji1_table),
|
||||
countof(unicode_prop_Basic_Emoji2_table),
|
||||
countof(unicode_prop_RGI_Emoji_Modifier_Sequence_table),
|
||||
countof(unicode_prop_RGI_Emoji_Flag_Sequence_table),
|
||||
countof(unicode_prop_Emoji_Keycap_Sequence_table),
|
||||
countof(unicode_prop_ASCII_Hex_Digit_table),
|
||||
countof(unicode_prop_Bidi_Control_table),
|
||||
countof(unicode_prop_Dash_table),
|
||||
@ -4726,5 +4824,325 @@ static const uint16_t unicode_prop_len_table[] = {
|
||||
countof(unicode_prop_Case_Ignorable_table),
|
||||
};
|
||||
|
||||
/* Identifiers of the Unicode sequence properties, i.e. the properties
   whose members are sequences of code points rather than single code
   points. The enumerators are listed in the same order as the names in
   unicode_sequence_prop_name_table. The values
   UNICODE_SEQUENCE_PROP_Basic_Emoji ..
   UNICODE_SEQUENCE_PROP_RGI_Emoji_ZWJ_Sequence must stay contiguous and
   precede UNICODE_SEQUENCE_PROP_RGI_Emoji: unicode_sequence_prop1()
   builds RGI_Emoji by iterating over that range. */
typedef enum {
|
||||
UNICODE_SEQUENCE_PROP_Basic_Emoji,
|
||||
UNICODE_SEQUENCE_PROP_Emoji_Keycap_Sequence,
|
||||
UNICODE_SEQUENCE_PROP_RGI_Emoji_Modifier_Sequence,
|
||||
UNICODE_SEQUENCE_PROP_RGI_Emoji_Flag_Sequence,
|
||||
UNICODE_SEQUENCE_PROP_RGI_Emoji_Tag_Sequence,
|
||||
UNICODE_SEQUENCE_PROP_RGI_Emoji_ZWJ_Sequence,
|
||||
/* all the sequences of the six properties above */
UNICODE_SEQUENCE_PROP_RGI_Emoji,
|
||||
/* not a property: number of enumerators above */
UNICODE_SEQUENCE_PROP_COUNT,
|
||||
} UnicodeSequencePropertyEnum;
|
||||
|
||||
/* Names of the Unicode sequence properties, each one terminated by
   '\0', in the same order as UnicodeSequencePropertyEnum.
   unicode_sequence_prop() searches this table with unicode_find_name()
   and uses the result as the enum value. */
static const char unicode_sequence_prop_name_table[] =
|
||||
"Basic_Emoji" "\0"
|
||||
"Emoji_Keycap_Sequence" "\0"
|
||||
"RGI_Emoji_Modifier_Sequence" "\0"
|
||||
"RGI_Emoji_Flag_Sequence" "\0"
|
||||
"RGI_Emoji_Tag_Sequence" "\0"
|
||||
"RGI_Emoji_ZWJ_Sequence" "\0"
|
||||
"RGI_Emoji" "\0"
|
||||
;
|
||||
|
||||
/* Data of the RGI_Emoji_Tag_Sequence property: consecutive
   NUL-terminated ASCII strings ("gbeng", "gbsct", "gbwls").
   unicode_sequence_prop1() expands each string into the sequence
   0x1F3F4, then 0xe0000 + b for every byte 'b' of the string, then
   0xe007f. */
static const uint8_t unicode_rgi_emoji_tag_sequence[18] = {
|
||||
0x67, 0x62, 0x65, 0x6e, 0x67, 0x00, 0x67, 0x62,
|
||||
0x73, 0x63, 0x74, 0x00, 0x67, 0x62, 0x77, 0x6c,
|
||||
0x73, 0x00,
|
||||
};
|
||||
|
||||
static const uint8_t unicode_rgi_emoji_zwj_sequence[2320] = {
|
||||
0x02, 0xb8, 0x19, 0x40, 0x86, 0x02, 0xd1, 0x39,
|
||||
0xb0, 0x19, 0x02, 0x26, 0x39, 0x42, 0x86, 0x02,
|
||||
0xb4, 0x36, 0x42, 0x86, 0x03, 0x68, 0x54, 0x64,
|
||||
0x87, 0x68, 0x54, 0x02, 0xdc, 0x39, 0x42, 0x86,
|
||||
0x02, 0xd1, 0x39, 0x73, 0x13, 0x02, 0x39, 0x39,
|
||||
0x40, 0x86, 0x02, 0x69, 0x34, 0xbd, 0x19, 0x03,
|
||||
0xb6, 0x36, 0x40, 0x86, 0xa1, 0x87, 0x03, 0x68,
|
||||
0x74, 0x1d, 0x19, 0x68, 0x74, 0x03, 0x68, 0x34,
|
||||
0xbd, 0x19, 0xa1, 0x87, 0x02, 0xf1, 0x7a, 0xf2,
|
||||
0x7a, 0x02, 0xca, 0x33, 0x42, 0x86, 0x02, 0x69,
|
||||
0x34, 0xb0, 0x19, 0x04, 0x68, 0x14, 0x68, 0x14,
|
||||
0x67, 0x14, 0x66, 0x14, 0x02, 0xf9, 0x26, 0x42,
|
||||
0x86, 0x03, 0x69, 0x74, 0x1d, 0x19, 0x69, 0x74,
|
||||
0x03, 0xd1, 0x19, 0xbc, 0x19, 0xa1, 0x87, 0x02,
|
||||
0x3c, 0x19, 0x40, 0x86, 0x02, 0x68, 0x34, 0xeb,
|
||||
0x13, 0x02, 0xc3, 0x33, 0xa1, 0x87, 0x02, 0x70,
|
||||
0x34, 0x40, 0x86, 0x02, 0xd4, 0x39, 0x42, 0x86,
|
||||
0x02, 0xcf, 0x39, 0x42, 0x86, 0x02, 0x47, 0x36,
|
||||
0x40, 0x86, 0x02, 0x39, 0x39, 0x42, 0x86, 0x04,
|
||||
0xd1, 0x79, 0x64, 0x87, 0x8b, 0x14, 0xd1, 0x79,
|
||||
0x02, 0xd1, 0x39, 0x95, 0x86, 0x02, 0x68, 0x34,
|
||||
0x93, 0x13, 0x02, 0x69, 0x34, 0xed, 0x13, 0x02,
|
||||
0xda, 0x39, 0x40, 0x86, 0x03, 0x69, 0x34, 0xaf,
|
||||
0x19, 0xa1, 0x87, 0x02, 0xd1, 0x39, 0x93, 0x13,
|
||||
0x03, 0xce, 0x39, 0x42, 0x86, 0xa1, 0x87, 0x03,
|
||||
0xd1, 0x79, 0x64, 0x87, 0xd1, 0x79, 0x03, 0xc3,
|
||||
0x33, 0x42, 0x86, 0xa1, 0x87, 0x03, 0x69, 0x74,
|
||||
0x1d, 0x19, 0x68, 0x74, 0x02, 0x69, 0x34, 0x92,
|
||||
0x16, 0x02, 0xd1, 0x39, 0x96, 0x86, 0x04, 0x69,
|
||||
0x14, 0x64, 0x87, 0x8b, 0x14, 0x68, 0x14, 0x02,
|
||||
0x68, 0x34, 0x7c, 0x13, 0x02, 0x47, 0x36, 0x42,
|
||||
0x86, 0x02, 0x86, 0x34, 0x42, 0x86, 0x02, 0xd1,
|
||||
0x39, 0x7c, 0x13, 0x02, 0x69, 0x14, 0xa4, 0x13,
|
||||
0x02, 0xda, 0x39, 0x42, 0x86, 0x02, 0x37, 0x39,
|
||||
0x40, 0x86, 0x02, 0xd1, 0x39, 0x08, 0x87, 0x04,
|
||||
0x68, 0x54, 0x64, 0x87, 0x8b, 0x14, 0x68, 0x54,
|
||||
0x02, 0x4d, 0x36, 0x40, 0x86, 0x02, 0x68, 0x34,
|
||||
0x2c, 0x15, 0x02, 0x69, 0x34, 0xaf, 0x19, 0x02,
|
||||
0x6e, 0x34, 0x40, 0x86, 0x02, 0xcd, 0x39, 0x42,
|
||||
0x86, 0x02, 0xd1, 0x39, 0x2c, 0x15, 0x02, 0x6f,
|
||||
0x14, 0x40, 0x86, 0x03, 0xd1, 0x39, 0xbc, 0x19,
|
||||
0xa1, 0x87, 0x02, 0x68, 0x34, 0xa8, 0x13, 0x02,
|
||||
0x69, 0x34, 0x73, 0x13, 0x04, 0x69, 0x54, 0x64,
|
||||
0x87, 0x8b, 0x14, 0x68, 0x54, 0x02, 0x71, 0x34,
|
||||
0x42, 0x86, 0x02, 0xd1, 0x39, 0xa8, 0x13, 0x02,
|
||||
0x45, 0x36, 0x40, 0x86, 0x03, 0x69, 0x54, 0x64,
|
||||
0x87, 0x68, 0x54, 0x03, 0x69, 0x54, 0x64, 0x87,
|
||||
0x69, 0x54, 0x03, 0xce, 0x39, 0x40, 0x86, 0xa1,
|
||||
0x87, 0x02, 0xd8, 0x39, 0x40, 0x86, 0x03, 0xc3,
|
||||
0x33, 0x40, 0x86, 0xa1, 0x87, 0x02, 0x4d, 0x36,
|
||||
0x42, 0x86, 0x02, 0xd1, 0x19, 0x92, 0x16, 0x02,
|
||||
0xd1, 0x39, 0xeb, 0x13, 0x02, 0x68, 0x34, 0xbc,
|
||||
0x14, 0x02, 0xd1, 0x39, 0xbc, 0x14, 0x02, 0x3d,
|
||||
0x39, 0x40, 0x86, 0x02, 0xb8, 0x39, 0x42, 0x86,
|
||||
0x02, 0xa3, 0x36, 0x40, 0x86, 0x02, 0x75, 0x35,
|
||||
0x40, 0x86, 0x02, 0xd8, 0x39, 0x42, 0x86, 0x02,
|
||||
0x69, 0x34, 0x93, 0x13, 0x02, 0x35, 0x39, 0x40,
|
||||
0x86, 0x02, 0x4b, 0x36, 0x40, 0x86, 0x02, 0x3d,
|
||||
0x39, 0x42, 0x86, 0x02, 0x38, 0x39, 0x42, 0x86,
|
||||
0x02, 0xa3, 0x36, 0x42, 0x86, 0x03, 0x69, 0x14,
|
||||
0x67, 0x14, 0x67, 0x14, 0x02, 0xb6, 0x36, 0x40,
|
||||
0x86, 0x02, 0x69, 0x34, 0x7c, 0x13, 0x02, 0x75,
|
||||
0x35, 0x42, 0x86, 0x02, 0xcc, 0x93, 0x40, 0x86,
|
||||
0x02, 0xcc, 0x33, 0x40, 0x86, 0x03, 0xd1, 0x39,
|
||||
0xbd, 0x19, 0xa1, 0x87, 0x02, 0x82, 0x34, 0x40,
|
||||
0x86, 0x02, 0x87, 0x34, 0x40, 0x86, 0x02, 0x69,
|
||||
0x14, 0x3e, 0x13, 0x02, 0xd6, 0x39, 0x40, 0x86,
|
||||
0x02, 0x68, 0x14, 0xbd, 0x19, 0x02, 0x46, 0x36,
|
||||
0x42, 0x86, 0x02, 0x4b, 0x36, 0x42, 0x86, 0x02,
|
||||
0x69, 0x34, 0x2c, 0x15, 0x03, 0xb6, 0x36, 0x42,
|
||||
0x86, 0xa1, 0x87, 0x02, 0xc4, 0x33, 0x40, 0x86,
|
||||
0x02, 0x26, 0x19, 0x40, 0x86, 0x02, 0x69, 0x14,
|
||||
0xb0, 0x19, 0x02, 0xde, 0x19, 0x42, 0x86, 0x02,
|
||||
0x69, 0x34, 0xa8, 0x13, 0x02, 0xcc, 0x33, 0x42,
|
||||
0x86, 0x02, 0x82, 0x34, 0x42, 0x86, 0x02, 0xd1,
|
||||
0x19, 0x93, 0x13, 0x02, 0x81, 0x14, 0x42, 0x86,
|
||||
0x02, 0x69, 0x34, 0x95, 0x86, 0x02, 0x68, 0x34,
|
||||
0xbb, 0x14, 0x02, 0xd1, 0x39, 0xbb, 0x14, 0x02,
|
||||
0x69, 0x34, 0xeb, 0x13, 0x02, 0xd1, 0x39, 0x84,
|
||||
0x13, 0x02, 0x69, 0x34, 0xbc, 0x14, 0x04, 0x69,
|
||||
0x54, 0x64, 0x87, 0x8b, 0x14, 0x69, 0x54, 0x02,
|
||||
0x26, 0x39, 0x40, 0x86, 0x02, 0xb4, 0x36, 0x40,
|
||||
0x86, 0x02, 0x47, 0x16, 0x42, 0x86, 0x02, 0xdc,
|
||||
0x39, 0x40, 0x86, 0x02, 0xca, 0x33, 0x40, 0x86,
|
||||
0x02, 0xf9, 0x26, 0x40, 0x86, 0x02, 0x69, 0x34,
|
||||
0x08, 0x87, 0x03, 0x69, 0x14, 0x69, 0x14, 0x66,
|
||||
0x14, 0x03, 0xd1, 0x59, 0x1d, 0x19, 0xd1, 0x59,
|
||||
0x02, 0xd4, 0x39, 0x40, 0x86, 0x02, 0xcf, 0x39,
|
||||
0x40, 0x86, 0x02, 0x68, 0x34, 0xa4, 0x13, 0x02,
|
||||
0xd1, 0x39, 0xa4, 0x13, 0x02, 0xd1, 0x19, 0xa8,
|
||||
0x13, 0x02, 0xd7, 0x39, 0x42, 0x86, 0x03, 0x69,
|
||||
0x34, 0xbc, 0x19, 0xa1, 0x87, 0x02, 0x68, 0x14,
|
||||
0xb0, 0x19, 0x02, 0x68, 0x14, 0x73, 0x13, 0x04,
|
||||
0x69, 0x14, 0x69, 0x14, 0x66, 0x14, 0x66, 0x14,
|
||||
0x03, 0x68, 0x34, 0xaf, 0x19, 0xa1, 0x87, 0x02,
|
||||
0x68, 0x34, 0x80, 0x16, 0x02, 0x73, 0x34, 0x42,
|
||||
0x86, 0x02, 0xd1, 0x39, 0x80, 0x16, 0x02, 0x68,
|
||||
0x34, 0xb0, 0x19, 0x02, 0x86, 0x34, 0x40, 0x86,
|
||||
0x02, 0x38, 0x19, 0x42, 0x86, 0x02, 0x69, 0x34,
|
||||
0xbb, 0x14, 0x02, 0xb5, 0x36, 0x42, 0x86, 0x02,
|
||||
0xcd, 0x39, 0x40, 0x86, 0x02, 0x68, 0x34, 0x95,
|
||||
0x86, 0x02, 0x68, 0x34, 0x27, 0x15, 0x03, 0x68,
|
||||
0x14, 0x68, 0x14, 0x66, 0x14, 0x02, 0x71, 0x34,
|
||||
0x40, 0x86, 0x02, 0xd1, 0x39, 0x27, 0x15, 0x02,
|
||||
0x2e, 0x16, 0xa8, 0x14, 0x02, 0xc3, 0x33, 0x42,
|
||||
0x86, 0x02, 0x69, 0x14, 0x66, 0x14, 0x02, 0x68,
|
||||
0x34, 0x96, 0x86, 0x02, 0x69, 0x34, 0xa4, 0x13,
|
||||
0x03, 0x69, 0x14, 0x64, 0x87, 0x68, 0x14, 0x02,
|
||||
0xb8, 0x39, 0x40, 0x86, 0x02, 0x68, 0x34, 0x3e,
|
||||
0x13, 0x03, 0xd1, 0x19, 0xaf, 0x19, 0xa1, 0x87,
|
||||
0x02, 0xd1, 0x39, 0x3e, 0x13, 0x02, 0x68, 0x34,
|
||||
0xbd, 0x19, 0x02, 0xd1, 0x19, 0xbb, 0x14, 0x02,
|
||||
0xd1, 0x19, 0x95, 0x86, 0x02, 0xdb, 0x39, 0x42,
|
||||
0x86, 0x02, 0x38, 0x39, 0x40, 0x86, 0x02, 0x69,
|
||||
0x34, 0x80, 0x16, 0x02, 0x69, 0x14, 0xeb, 0x13,
|
||||
0x04, 0x68, 0x14, 0x69, 0x14, 0x67, 0x14, 0x67,
|
||||
0x14, 0x02, 0x77, 0x34, 0x42, 0x86, 0x02, 0x46,
|
||||
0x36, 0x40, 0x86, 0x02, 0x68, 0x34, 0x92, 0x16,
|
||||
0x02, 0x4e, 0x36, 0x42, 0x86, 0x03, 0x69, 0x14,
|
||||
0xbd, 0x19, 0xa1, 0x87, 0x02, 0xde, 0x19, 0x40,
|
||||
0x86, 0x02, 0x69, 0x34, 0x27, 0x15, 0x03, 0xc3,
|
||||
0x13, 0x40, 0x86, 0xa1, 0x87, 0x02, 0x81, 0x14,
|
||||
0x40, 0x86, 0x03, 0xd1, 0x39, 0xaf, 0x19, 0xa1,
|
||||
0x87, 0x02, 0x68, 0x34, 0xbc, 0x19, 0x02, 0xd1,
|
||||
0x19, 0x80, 0x16, 0x02, 0xd9, 0x39, 0x42, 0x86,
|
||||
0x02, 0xd1, 0x39, 0xbc, 0x19, 0x02, 0xdc, 0x19,
|
||||
0x42, 0x86, 0x02, 0x68, 0x34, 0x73, 0x13, 0x02,
|
||||
0x69, 0x34, 0x3e, 0x13, 0x02, 0x47, 0x16, 0x40,
|
||||
0x86, 0x02, 0xd1, 0x39, 0xbd, 0x19, 0x02, 0x3e,
|
||||
0x39, 0x42, 0x86, 0x02, 0x69, 0x14, 0x95, 0x86,
|
||||
0x02, 0x68, 0x14, 0x96, 0x86, 0x03, 0x69, 0x34,
|
||||
0xbd, 0x19, 0xa1, 0x87, 0x02, 0xd7, 0x39, 0x40,
|
||||
0x86, 0x02, 0x45, 0x16, 0x42, 0x86, 0x02, 0x68,
|
||||
0x34, 0xed, 0x13, 0x03, 0x68, 0x34, 0xbc, 0x19,
|
||||
0xa1, 0x87, 0x02, 0xd1, 0x39, 0xed, 0x13, 0x02,
|
||||
0xd1, 0x39, 0x92, 0x16, 0x02, 0x73, 0x34, 0x40,
|
||||
0x86, 0x02, 0x38, 0x19, 0x40, 0x86, 0x02, 0xb5,
|
||||
0x36, 0x40, 0x86, 0x02, 0x68, 0x34, 0xaf, 0x19,
|
||||
0x02, 0xd1, 0x39, 0xaf, 0x19, 0x02, 0x69, 0x34,
|
||||
0xbc, 0x19, 0x02, 0xb6, 0x16, 0x42, 0x86, 0x02,
|
||||
0x26, 0x14, 0x25, 0x15, 0x02, 0xc3, 0x33, 0x40,
|
||||
0x86, 0x02, 0xdd, 0x39, 0x42, 0x86, 0x02, 0xcb,
|
||||
0x93, 0x42, 0x86, 0x02, 0xcb, 0x33, 0x42, 0x86,
|
||||
0x02, 0x81, 0x34, 0x42, 0x86, 0x02, 0xce, 0x39,
|
||||
0xa1, 0x87, 0x02, 0xdb, 0x39, 0x40, 0x86, 0x02,
|
||||
0x68, 0x34, 0x08, 0x87, 0x02, 0xd1, 0x19, 0xb0,
|
||||
0x19, 0x02, 0x77, 0x34, 0x40, 0x86, 0x02, 0x4e,
|
||||
0x36, 0x40, 0x86, 0x02, 0xce, 0x39, 0x42, 0x86,
|
||||
0x02, 0x4e, 0x16, 0x42, 0x86, 0x02, 0xd9, 0x39,
|
||||
0x40, 0x86, 0x02, 0xdc, 0x19, 0x40, 0x86, 0x02,
|
||||
0x3e, 0x39, 0x40, 0x86, 0x02, 0xb9, 0x39, 0x42,
|
||||
0x86, 0x02, 0xda, 0x19, 0x42, 0x86, 0x02, 0x42,
|
||||
0x16, 0x94, 0x81, 0x02, 0x45, 0x16, 0x40, 0x86,
|
||||
0x02, 0x69, 0x14, 0xbd, 0x19, 0x02, 0x70, 0x34,
|
||||
0x42, 0x86, 0x02, 0xce, 0x19, 0xa1, 0x87, 0x02,
|
||||
0xc3, 0x13, 0x42, 0x86, 0x02, 0x68, 0x14, 0x08,
|
||||
0x87, 0x02, 0xd1, 0x19, 0x7c, 0x13, 0x02, 0x68,
|
||||
0x14, 0x92, 0x16, 0x02, 0xb6, 0x16, 0x40, 0x86,
|
||||
0x02, 0x37, 0x39, 0x42, 0x86, 0x03, 0xce, 0x19,
|
||||
0x42, 0x86, 0xa1, 0x87, 0x03, 0x68, 0x14, 0x67,
|
||||
0x14, 0x67, 0x14, 0x02, 0xdd, 0x39, 0x40, 0x86,
|
||||
0x02, 0xcf, 0x19, 0x42, 0x86, 0x02, 0xd1, 0x19,
|
||||
0x2c, 0x15, 0x02, 0x4b, 0x13, 0xe9, 0x17, 0x02,
|
||||
0x68, 0x14, 0x67, 0x14, 0x02, 0xcb, 0x93, 0x40,
|
||||
0x86, 0x02, 0x6e, 0x34, 0x42, 0x86, 0x02, 0xcb,
|
||||
0x33, 0x40, 0x86, 0x02, 0x81, 0x34, 0x40, 0x86,
|
||||
0x02, 0xb6, 0x36, 0xa1, 0x87, 0x02, 0x45, 0x36,
|
||||
0x42, 0x86, 0x02, 0xb4, 0x16, 0x42, 0x86, 0x02,
|
||||
0x69, 0x14, 0x73, 0x13, 0x04, 0x69, 0x14, 0x69,
|
||||
0x14, 0x67, 0x14, 0x66, 0x14, 0x02, 0x35, 0x39,
|
||||
0x42, 0x86, 0x02, 0x68, 0x14, 0x93, 0x13, 0x02,
|
||||
0xb6, 0x36, 0x42, 0x86, 0x03, 0x68, 0x14, 0x69,
|
||||
0x14, 0x66, 0x14, 0x02, 0xce, 0x39, 0x40, 0x86,
|
||||
0x02, 0x4e, 0x16, 0x40, 0x86, 0x02, 0x87, 0x34,
|
||||
0x42, 0x86, 0x02, 0x86, 0x14, 0x42, 0x86, 0x02,
|
||||
0xd6, 0x39, 0x42, 0x86, 0x02, 0xc4, 0x33, 0x42,
|
||||
0x86, 0x02, 0x69, 0x34, 0x96, 0x86, 0x02, 0xb9,
|
||||
0x39, 0x40, 0x86, 0x02, 0x68, 0x14, 0xa8, 0x13,
|
||||
0x02, 0xd1, 0x19, 0x84, 0x13, 0x02, 0xda, 0x19,
|
||||
0x40, 0x86, 0x02, 0xd8, 0x19, 0x42, 0x86, 0x02,
|
||||
0xc3, 0x13, 0x40, 0x86, 0x02, 0xb9, 0x19, 0x42,
|
||||
0x86, 0x02, 0x3d, 0x19, 0x42, 0x86, 0x02, 0xcf,
|
||||
0x19, 0x40, 0x86, 0x04, 0x68, 0x14, 0x68, 0x14,
|
||||
0x67, 0x14, 0x67, 0x14, 0x03, 0xd1, 0x19, 0xd1,
|
||||
0x19, 0xd2, 0x19, 0x02, 0x68, 0x14, 0xbb, 0x14,
|
||||
0x02, 0x3b, 0x14, 0x44, 0x87, 0x02, 0xd1, 0x19,
|
||||
0x27, 0x15, 0x02, 0xb4, 0x16, 0x40, 0x86, 0x02,
|
||||
0xcd, 0x19, 0x42, 0x86, 0x02, 0xd3, 0x86, 0xa5,
|
||||
0x14, 0x02, 0x70, 0x14, 0x42, 0x86, 0x03, 0xb6,
|
||||
0x16, 0x42, 0x86, 0xa1, 0x87, 0x04, 0x69, 0x14,
|
||||
0x64, 0x87, 0x8b, 0x14, 0x69, 0x14, 0x02, 0x36,
|
||||
0x16, 0x2b, 0x93, 0x02, 0x68, 0x14, 0x80, 0x16,
|
||||
0x02, 0x86, 0x14, 0x40, 0x86, 0x02, 0x08, 0x14,
|
||||
0x1b, 0x0b, 0x02, 0xd1, 0x19, 0xbc, 0x19, 0x02,
|
||||
0xca, 0x13, 0x42, 0x86, 0x02, 0x41, 0x94, 0xe8,
|
||||
0x95, 0x02, 0xd8, 0x19, 0x40, 0x86, 0x02, 0xb9,
|
||||
0x19, 0x40, 0x86, 0x02, 0xd1, 0x19, 0xed, 0x13,
|
||||
0x02, 0xf9, 0x86, 0x42, 0x86, 0x03, 0xd1, 0x19,
|
||||
0xbd, 0x19, 0xa1, 0x87, 0x02, 0x3d, 0x19, 0x40,
|
||||
0x86, 0x02, 0xd6, 0x19, 0x42, 0x86, 0x03, 0x69,
|
||||
0x14, 0x66, 0x14, 0x66, 0x14, 0x02, 0xd1, 0x19,
|
||||
0xaf, 0x19, 0x03, 0x69, 0x14, 0x69, 0x14, 0x67,
|
||||
0x14, 0x02, 0xcd, 0x19, 0x40, 0x86, 0x02, 0x70,
|
||||
0x14, 0x40, 0x86, 0x03, 0x68, 0x14, 0xbc, 0x19,
|
||||
0xa1, 0x87, 0x02, 0x6e, 0x14, 0x42, 0x86, 0x02,
|
||||
0x69, 0x14, 0x92, 0x16, 0x03, 0x68, 0x14, 0x68,
|
||||
0x14, 0x67, 0x14, 0x02, 0x69, 0x14, 0x67, 0x14,
|
||||
0x02, 0x75, 0x95, 0x42, 0x86, 0x03, 0x69, 0x14,
|
||||
0x64, 0x87, 0x69, 0x14, 0x02, 0xd1, 0x19, 0xbc,
|
||||
0x14, 0x02, 0xdf, 0x19, 0x42, 0x86, 0x02, 0xca,
|
||||
0x13, 0x40, 0x86, 0x02, 0x82, 0x14, 0x42, 0x86,
|
||||
0x02, 0x69, 0x14, 0x93, 0x13, 0x02, 0x68, 0x14,
|
||||
0x7c, 0x13, 0x02, 0xf9, 0x86, 0x40, 0x86, 0x02,
|
||||
0xd6, 0x19, 0x40, 0x86, 0x02, 0x68, 0x14, 0x2c,
|
||||
0x15, 0x02, 0x69, 0x14, 0xa8, 0x13, 0x02, 0xd4,
|
||||
0x19, 0x42, 0x86, 0x04, 0x68, 0x14, 0x69, 0x14,
|
||||
0x66, 0x14, 0x66, 0x14, 0x02, 0x77, 0x14, 0x42,
|
||||
0x86, 0x02, 0x39, 0x19, 0x42, 0x86, 0x02, 0xd1,
|
||||
0x19, 0xa4, 0x13, 0x02, 0x6e, 0x14, 0x40, 0x86,
|
||||
0x03, 0xd1, 0x19, 0xd2, 0x19, 0xd2, 0x19, 0x02,
|
||||
0x69, 0x14, 0xbb, 0x14, 0x02, 0xd1, 0x19, 0x96,
|
||||
0x86, 0x02, 0x75, 0x95, 0x40, 0x86, 0x04, 0x68,
|
||||
0x14, 0x64, 0x87, 0x8b, 0x14, 0x68, 0x14, 0x02,
|
||||
0xd1, 0x19, 0x3e, 0x13, 0x02, 0xdf, 0x19, 0x40,
|
||||
0x86, 0x02, 0x82, 0x14, 0x40, 0x86, 0x02, 0x44,
|
||||
0x13, 0xeb, 0x17, 0x02, 0xdd, 0x19, 0x42, 0x86,
|
||||
0x02, 0x69, 0x14, 0x80, 0x16, 0x03, 0x68, 0x14,
|
||||
0xaf, 0x19, 0xa1, 0x87, 0x02, 0xa3, 0x16, 0x42,
|
||||
0x86, 0x02, 0x69, 0x14, 0x96, 0x86, 0x02, 0x46,
|
||||
0x16, 0x42, 0x86, 0x02, 0xb6, 0x16, 0xa1, 0x87,
|
||||
0x02, 0x68, 0x14, 0x27, 0x15, 0x02, 0x26, 0x14,
|
||||
0x1b, 0x0b, 0x02, 0xd4, 0x19, 0x40, 0x86, 0x02,
|
||||
0x77, 0x14, 0x40, 0x86, 0x02, 0x39, 0x19, 0x40,
|
||||
0x86, 0x02, 0x37, 0x19, 0x42, 0x86, 0x03, 0x69,
|
||||
0x14, 0x67, 0x14, 0x66, 0x14, 0x03, 0xc3, 0x13,
|
||||
0x42, 0x86, 0xa1, 0x87, 0x02, 0x68, 0x14, 0xbc,
|
||||
0x19, 0x02, 0xd1, 0x19, 0xeb, 0x13, 0x04, 0x69,
|
||||
0x14, 0x69, 0x14, 0x67, 0x14, 0x67, 0x14, 0x02,
|
||||
0xd1, 0x19, 0x08, 0x87, 0x02, 0x68, 0x14, 0xed,
|
||||
0x13, 0x03, 0x69, 0x14, 0xbc, 0x19, 0xa1, 0x87,
|
||||
0x02, 0xdd, 0x19, 0x40, 0x86, 0x02, 0xc3, 0x13,
|
||||
0xa1, 0x87, 0x03, 0x68, 0x14, 0x66, 0x14, 0x66,
|
||||
0x14, 0x03, 0x68, 0x14, 0x69, 0x14, 0x67, 0x14,
|
||||
0x02, 0xa3, 0x16, 0x40, 0x86, 0x02, 0xdb, 0x19,
|
||||
0x42, 0x86, 0x02, 0x68, 0x14, 0xaf, 0x19, 0x02,
|
||||
0x46, 0x16, 0x40, 0x86, 0x02, 0x35, 0x16, 0xab,
|
||||
0x14, 0x02, 0x68, 0x14, 0x95, 0x86, 0x02, 0x42,
|
||||
0x16, 0x95, 0x81, 0x02, 0xc4, 0x13, 0x42, 0x86,
|
||||
0x02, 0x15, 0x14, 0xba, 0x19, 0x02, 0x69, 0x14,
|
||||
0x08, 0x87, 0x03, 0xd1, 0x19, 0x1d, 0x19, 0xd1,
|
||||
0x19, 0x02, 0x69, 0x14, 0x7c, 0x13, 0x02, 0x37,
|
||||
0x19, 0x40, 0x86, 0x02, 0x73, 0x14, 0x42, 0x86,
|
||||
0x02, 0x69, 0x14, 0x2c, 0x15, 0x02, 0xb5, 0x16,
|
||||
0x42, 0x86, 0x02, 0x35, 0x19, 0x42, 0x86, 0x04,
|
||||
0x68, 0x14, 0x69, 0x14, 0x67, 0x14, 0x66, 0x14,
|
||||
0x02, 0x64, 0x87, 0x25, 0x15, 0x02, 0x64, 0x87,
|
||||
0x79, 0x1a, 0x02, 0x68, 0x14, 0xbc, 0x14, 0x03,
|
||||
0xce, 0x19, 0x40, 0x86, 0xa1, 0x87, 0x02, 0x87,
|
||||
0x14, 0x42, 0x86, 0x02, 0x4d, 0x16, 0x42, 0x86,
|
||||
0x04, 0x68, 0x14, 0x68, 0x14, 0x66, 0x14, 0x66,
|
||||
0x14, 0x02, 0xdb, 0x19, 0x40, 0x86, 0x02, 0xd9,
|
||||
0x19, 0x42, 0x86, 0x02, 0xc4, 0x13, 0x40, 0x86,
|
||||
0x02, 0xd1, 0x19, 0xbd, 0x19, 0x02, 0x68, 0x14,
|
||||
0xa4, 0x13, 0x02, 0x3e, 0x19, 0x42, 0x86, 0x02,
|
||||
0xf3, 0x93, 0xa7, 0x86, 0x03, 0x69, 0x14, 0xaf,
|
||||
0x19, 0xa1, 0x87, 0x02, 0xf3, 0x93, 0x08, 0x13,
|
||||
0x02, 0xd1, 0x19, 0xd2, 0x19, 0x02, 0x73, 0x14,
|
||||
0x40, 0x86, 0x02, 0xb5, 0x16, 0x40, 0x86, 0x02,
|
||||
0x35, 0x19, 0x40, 0x86, 0x02, 0x69, 0x14, 0x27,
|
||||
0x15, 0x02, 0xce, 0x19, 0x42, 0x86, 0x02, 0x71,
|
||||
0x14, 0x42, 0x86, 0x02, 0xd1, 0x19, 0x73, 0x13,
|
||||
0x02, 0x68, 0x14, 0x3e, 0x13, 0x02, 0xf4, 0x13,
|
||||
0x20, 0x86, 0x02, 0x87, 0x14, 0x40, 0x86, 0x03,
|
||||
0xb6, 0x16, 0x40, 0x86, 0xa1, 0x87, 0x02, 0x4d,
|
||||
0x16, 0x40, 0x86, 0x02, 0x69, 0x14, 0xbc, 0x19,
|
||||
0x02, 0x4b, 0x16, 0x42, 0x86, 0x02, 0xd9, 0x19,
|
||||
0x40, 0x86, 0x02, 0x3e, 0x19, 0x40, 0x86, 0x02,
|
||||
0x69, 0x14, 0xed, 0x13, 0x02, 0xd7, 0x19, 0x42,
|
||||
0x86, 0x02, 0xb8, 0x19, 0x42, 0x86, 0x03, 0x68,
|
||||
0x14, 0x67, 0x14, 0x66, 0x14, 0x02, 0x3c, 0x19,
|
||||
0x42, 0x86, 0x02, 0x68, 0x14, 0x66, 0x14, 0x03,
|
||||
0x68, 0x14, 0x64, 0x87, 0x68, 0x14, 0x02, 0x69,
|
||||
0x14, 0xaf, 0x19, 0x02, 0xce, 0x19, 0x40, 0x86,
|
||||
0x02, 0x71, 0x14, 0x40, 0x86, 0x02, 0x68, 0x14,
|
||||
0xeb, 0x13, 0x03, 0x68, 0x14, 0xbd, 0x19, 0xa1,
|
||||
0x87, 0x02, 0x6f, 0x14, 0x42, 0x86, 0x04, 0xd1,
|
||||
0x19, 0xd1, 0x19, 0xd2, 0x19, 0xd2, 0x19, 0x02,
|
||||
0x69, 0x14, 0xbc, 0x14, 0x02, 0xcc, 0x93, 0x42,
|
||||
0x86, 0x02, 0x4b, 0x16, 0x40, 0x86, 0x02, 0x26,
|
||||
0x19, 0x42, 0x86, 0x02, 0xd7, 0x19, 0x40, 0x86,
|
||||
};
|
||||
|
||||
#endif /* CONFIG_ALL_UNICODE */
|
||||
/* 64 tables / 33442 bytes, 5 index / 351 bytes */
|
||||
/* 71 tables / 36311 bytes, 5 index / 351 bytes */
|
||||
|
215
libunicode.c
215
libunicode.c
@ -499,6 +499,9 @@ int cr_op(CharRange *cr, const uint32_t *a_pt, int a_len,
|
||||
case CR_OP_XOR:
|
||||
is_in = (a_idx & 1) ^ (b_idx & 1);
|
||||
break;
|
||||
case CR_OP_SUB:
|
||||
is_in = (a_idx & 1) & ((b_idx & 1) ^ 1);
|
||||
break;
|
||||
default:
|
||||
abort();
|
||||
}
|
||||
@ -511,14 +514,14 @@ int cr_op(CharRange *cr, const uint32_t *a_pt, int a_len,
|
||||
return 0;
|
||||
}
|
||||
|
||||
int cr_union1(CharRange *cr, const uint32_t *b_pt, int b_len)
|
||||
int cr_op1(CharRange *cr, const uint32_t *b_pt, int b_len, int op)
|
||||
{
|
||||
CharRange a = *cr;
|
||||
int ret;
|
||||
cr->len = 0;
|
||||
cr->size = 0;
|
||||
cr->points = NULL;
|
||||
ret = cr_op(cr, a.points, a.len, b_pt, b_len, CR_OP_UNION);
|
||||
ret = cr_op(cr, a.points, a.len, b_pt, b_len, op);
|
||||
cr_free(&a);
|
||||
return ret;
|
||||
}
|
||||
@ -1554,6 +1557,7 @@ static int unicode_prop_ops(CharRange *cr, ...)
|
||||
cr2 = &stack[stack_len - 1];
|
||||
cr3 = &stack[stack_len++];
|
||||
cr_init(cr3, cr->mem_opaque, cr->realloc_func);
|
||||
/* CR_OP_XOR may be used here */
|
||||
if (cr_op(cr3, cr1->points, cr1->len,
|
||||
cr2->points, cr2->len, op - POP_UNION + CR_OP_UNION))
|
||||
goto fail;
|
||||
@ -1908,3 +1912,210 @@ BOOL lre_is_space_non_ascii(uint32_t c)
|
||||
}
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
/* size, in code points, of the buffer in which
   unicode_sequence_prop1() assembles each sequence */
#define SEQ_MAX_LEN 16
|
||||
|
||||
/* Enumerate the sequences of the sequence property 'seq_prop_idx' (a
   UnicodeSequencePropertyEnum value). Each sequence is written to the
   local buffer 'seq' and reported with cb(opaque, seq, n), 'n' being
   its number of code points.

   'cr' is scratch memory. For the properties derived from a set it is
   filled by unicode_prop1() and then read as a list of
   [points[i], points[i + 1]) intervals. cr->len is reset to 0 between
   the two Basic_Emoji tables and after each sub-property of RGI_Emoji;
   otherwise 'cr' is left as unicode_prop1() filled it.
   NOTE(review): unicode_prop1() is not visible here; it is presumably
   appending to 'cr', which would explain the resets - confirm.

   Return 0 on success, -1 if unicode_prop1() fails, -2 if
   'seq_prop_idx' is not a known sequence property. */
static int unicode_sequence_prop1(int seq_prop_idx, UnicodeSequencePropCB *cb, void *opaque,
|
||||
CharRange *cr)
|
||||
{
|
||||
int i, c, j;
|
||||
uint32_t seq[SEQ_MAX_LEN];
|
||||
|
||||
switch(seq_prop_idx) {
|
||||
case UNICODE_SEQUENCE_PROP_Basic_Emoji:
|
||||
/* first set: every code point is a sequence of length 1 */
if (unicode_prop1(cr, UNICODE_PROP_Basic_Emoji1) < 0)
|
||||
return -1;
|
||||
for(i = 0; i < cr->len; i += 2) {
|
||||
for(c = cr->points[i]; c < cr->points[i + 1]; c++) {
|
||||
seq[0] = c;
|
||||
cb(opaque, seq, 1);
|
||||
}
|
||||
}
|
||||
|
||||
/* drop the first set before loading the second one */
cr->len = 0;
|
||||
|
||||
/* second set: every code point is followed by 0xfe0f */
if (unicode_prop1(cr, UNICODE_PROP_Basic_Emoji2) < 0)
|
||||
return -1;
|
||||
for(i = 0; i < cr->len; i += 2) {
|
||||
for(c = cr->points[i]; c < cr->points[i + 1]; c++) {
|
||||
seq[0] = c;
|
||||
seq[1] = 0xfe0f;
|
||||
cb(opaque, seq, 2);
|
||||
}
|
||||
}
|
||||
|
||||
break;
|
||||
case UNICODE_SEQUENCE_PROP_RGI_Emoji_Modifier_Sequence:
|
||||
/* every Emoji_Modifier_Base code point combined with each of the
   5 code points 0x1f3fb .. 0x1f3ff */
if (unicode_prop1(cr, UNICODE_PROP_Emoji_Modifier_Base) < 0)
|
||||
return -1;
|
||||
for(i = 0; i < cr->len; i += 2) {
|
||||
for(c = cr->points[i]; c < cr->points[i + 1]; c++) {
|
||||
for(j = 0; j < 5; j++) {
|
||||
seq[0] = c;
|
||||
seq[1] = 0x1f3fb + j;
|
||||
cb(opaque, seq, 2);
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
case UNICODE_SEQUENCE_PROP_RGI_Emoji_Flag_Sequence:
|
||||
/* the values stored in this set are not code points but pair
   indices 26 * c0 + c1, where c0 and c1 are offsets from 0x1F1E6 */
if (unicode_prop1(cr, UNICODE_PROP_RGI_Emoji_Flag_Sequence) < 0)
|
||||
return -1;
|
||||
for(i = 0; i < cr->len; i += 2) {
|
||||
for(c = cr->points[i]; c < cr->points[i + 1]; c++) {
|
||||
int c0, c1;
|
||||
c0 = c / 26;
|
||||
c1 = c % 26;
|
||||
seq[0] = 0x1F1E6 + c0;
|
||||
seq[1] = 0x1F1E6 + c1;
|
||||
cb(opaque, seq, 2);
|
||||
}
|
||||
}
|
||||
break;
|
||||
case UNICODE_SEQUENCE_PROP_RGI_Emoji_ZWJ_Sequence:
|
||||
{
|
||||
int len, code, pres, k, mod, mod_count, mod_pos[2], hc_pos, n_mod, n_hc, mod1;
|
||||
int mod_idx, hc_idx, i0, i1;
|
||||
const uint8_t *tab = unicode_rgi_emoji_zwj_sequence;
|
||||
|
||||
/* Table layout: each entry is one byte 'len' followed by 'len'
   16 bit little endian elements:
     bit 15     : if set, 0xfe0f is inserted after the element
     bits 13-14 : if != 0, a modifier slot is reserved after the
                  element; the value selects how the slots are
                  enumerated (see 'mod' below)
     bits 0-12  : code point, 0x2000 + v if v < 0x1000,
                  0x1f000 + (v - 0x1000) otherwise
   The elements of an entry are joined with 0x200d. */
for(i = 0; i < countof(unicode_rgi_emoji_zwj_sequence);) {
|
||||
len = tab[i++];
|
||||
k = 0;
|
||||
mod = 0;
|
||||
mod_count = 0;
|
||||
hc_pos = -1;
|
||||
for(j = 0; j < len; j++) {
|
||||
code = tab[i++];
|
||||
code |= tab[i++] << 8;
|
||||
pres = code >> 15;
|
||||
mod1 = (code >> 13) & 3;
|
||||
code &= 0x1fff;
|
||||
if (code < 0x1000) {
|
||||
c = code + 0x2000;
|
||||
} else {
|
||||
c = 0x1f000 + (code - 0x1000);
|
||||
}
|
||||
/* 0x1f9b0 marks the element that is later replaced by each of
   0x1f9b0 .. 0x1f9b3 */
if (c == 0x1f9b0)
|
||||
hc_pos = k;
|
||||
seq[k++] = c;
|
||||
if (mod1 != 0) {
|
||||
/* at most two modifier slots per entry; 'mod' keeps the last
   non zero value seen in the entry */
assert(mod_count < 2);
|
||||
mod = mod1;
|
||||
mod_pos[mod_count++] = k;
|
||||
seq[k++] = 0; /* will be filled later */
|
||||
}
|
||||
if (pres) {
|
||||
seq[k++] = 0xfe0f;
|
||||
}
|
||||
/* joiner between elements, none after the last one */
if (j < len - 1) {
|
||||
seq[k++] = 0x200d;
|
||||
}
|
||||
}
|
||||
|
||||
/* generate all the variants */
/* mod = 1: one slot, 5 values. mod = 2: two slots, all 5 * 5
   combinations. mod = 3: two slots, the 5 * 4 combinations with
   different values. Otherwise: no slot, a single variant. */
|
||||
switch(mod) {
|
||||
case 1:
|
||||
n_mod = 5;
|
||||
break;
|
||||
case 2:
|
||||
n_mod = 25;
|
||||
break;
|
||||
case 3:
|
||||
n_mod = 20;
|
||||
break;
|
||||
default:
|
||||
n_mod = 1;
|
||||
break;
|
||||
}
|
||||
if (hc_pos >= 0)
|
||||
n_hc = 4;
|
||||
else
|
||||
n_hc = 1;
|
||||
for(hc_idx = 0; hc_idx < n_hc; hc_idx++) {
|
||||
for(mod_idx = 0; mod_idx < n_mod; mod_idx++) {
|
||||
if (hc_pos >= 0)
|
||||
seq[hc_pos] = 0x1f9b0 + hc_idx;
|
||||
|
||||
switch(mod) {
|
||||
case 1:
|
||||
seq[mod_pos[0]] = 0x1f3fb + mod_idx;
|
||||
break;
|
||||
/* NOTE(review): cases 2 and 3 write seq[mod_pos[1]], i.e. they
   assume that the entry reserved two slots; this is not checked
   here - confirm against the table generator */
case 2:
|
||||
case 3:
|
||||
i0 = mod_idx / 5;
|
||||
i1 = mod_idx % 5;
|
||||
/* avoid identical values */
|
||||
if (mod == 3 && i0 >= i1)
|
||||
i0++;
|
||||
seq[mod_pos[0]] = 0x1f3fb + i0;
|
||||
seq[mod_pos[1]] = 0x1f3fb + i1;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
#if 0
|
||||
for(j = 0; j < k; j++)
|
||||
printf(" %04x", seq[j]);
|
||||
printf("\n");
|
||||
#endif
|
||||
cb(opaque, seq, k);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
case UNICODE_SEQUENCE_PROP_RGI_Emoji_Tag_Sequence:
|
||||
{
|
||||
/* the table holds NUL-terminated byte strings; each one gives
   0x1F3F4, 0xe0000 + byte for every byte, then 0xe007f */
for(i = 0; i < countof(unicode_rgi_emoji_tag_sequence);) {
|
||||
j = 0;
|
||||
seq[j++] = 0x1F3F4;
|
||||
for(;;) {
|
||||
c = unicode_rgi_emoji_tag_sequence[i++];
|
||||
if (c == 0x00)
|
||||
break;
|
||||
seq[j++] = 0xe0000 + c;
|
||||
}
|
||||
seq[j++] = 0xe007f;
|
||||
cb(opaque, seq, j);
|
||||
}
|
||||
}
|
||||
break;
|
||||
case UNICODE_SEQUENCE_PROP_Emoji_Keycap_Sequence:
|
||||
/* every code point of the set followed by 0xfe0f and 0x20e3 */
if (unicode_prop1(cr, UNICODE_PROP_Emoji_Keycap_Sequence) < 0)
|
||||
return -1;
|
||||
for(i = 0; i < cr->len; i += 2) {
|
||||
for(c = cr->points[i]; c < cr->points[i + 1]; c++) {
|
||||
seq[0] = c;
|
||||
seq[1] = 0xfe0f;
|
||||
seq[2] = 0x20e3;
|
||||
cb(opaque, seq, 3);
|
||||
}
|
||||
}
|
||||
break;
|
||||
case UNICODE_SEQUENCE_PROP_RGI_Emoji:
|
||||
/* all previous sequences */
/* relies on the enum order: Basic_Emoji .. RGI_Emoji_ZWJ_Sequence
   are the values preceding RGI_Emoji; 'cr' is emptied after each
   sub-property */
|
||||
for(i = UNICODE_SEQUENCE_PROP_Basic_Emoji; i <= UNICODE_SEQUENCE_PROP_RGI_Emoji_ZWJ_Sequence; i++) {
|
||||
int ret;
|
||||
ret = unicode_sequence_prop1(i, cb, opaque, cr);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
cr->len = 0;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
return -2;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Build a unicode sequence property: 'prop_name' is looked up in
   unicode_sequence_prop_name_table and 'cb' is called as
   cb(opaque, buf, len) once for each sequence of the property, 'buf'
   holding 'len' code points. */
|
||||
/* Return 0 on success, -2 if the property name is not found, -1 if
   other error. 'cr' is used as temporary memory. */
|
||||
int unicode_sequence_prop(const char *prop_name, UnicodeSequencePropCB *cb, void *opaque,
|
||||
CharRange *cr)
|
||||
{
|
||||
int seq_prop_idx;
|
||||
seq_prop_idx = unicode_find_name(unicode_sequence_prop_name_table, prop_name);
|
||||
/* unknown property name */
if (seq_prop_idx < 0)
|
||||
return -2;
|
||||
return unicode_sequence_prop1(seq_prop_idx, cb, opaque, cr);
|
||||
}
|
||||
|
14
libunicode.h
14
libunicode.h
@ -45,6 +45,7 @@ typedef enum {
|
||||
CR_OP_UNION,
|
||||
CR_OP_INTER,
|
||||
CR_OP_XOR,
|
||||
CR_OP_SUB,
|
||||
} CharRangeOpEnum;
|
||||
|
||||
void cr_init(CharRange *cr, void *mem_opaque, void *(*realloc_func)(void *opaque, void *ptr, size_t size));
|
||||
@ -73,19 +74,18 @@ static inline int cr_add_interval(CharRange *cr, uint32_t c1, uint32_t c2)
|
||||
return 0;
|
||||
}
|
||||
|
||||
int cr_union1(CharRange *cr, const uint32_t *b_pt, int b_len);
|
||||
int cr_op(CharRange *cr, const uint32_t *a_pt, int a_len,
|
||||
const uint32_t *b_pt, int b_len, int op);
|
||||
int cr_op1(CharRange *cr, const uint32_t *b_pt, int b_len, int op);
|
||||
|
||||
static inline int cr_union_interval(CharRange *cr, uint32_t c1, uint32_t c2)
|
||||
{
|
||||
uint32_t b_pt[2];
|
||||
b_pt[0] = c1;
|
||||
b_pt[1] = c2 + 1;
|
||||
return cr_union1(cr, b_pt, 2);
|
||||
return cr_op1(cr, b_pt, 2, CR_OP_UNION);
|
||||
}
|
||||
|
||||
int cr_op(CharRange *cr, const uint32_t *a_pt, int a_len,
|
||||
const uint32_t *b_pt, int b_len, int op);
|
||||
|
||||
int cr_invert(CharRange *cr);
|
||||
|
||||
int cr_regexp_canonicalize(CharRange *cr, int is_unicode);
|
||||
@ -107,6 +107,10 @@ int unicode_script(CharRange *cr, const char *script_name, int is_ext);
|
||||
int unicode_general_category(CharRange *cr, const char *gc_name);
|
||||
int unicode_prop(CharRange *cr, const char *prop_name);
|
||||
|
||||
typedef void UnicodeSequencePropCB(void *opaque, const uint32_t *buf, int len);
|
||||
int unicode_sequence_prop(const char *prop_name, UnicodeSequencePropCB *cb, void *opaque,
|
||||
CharRange *cr);
|
||||
|
||||
int lre_case_conv(uint32_t *res, uint32_t c, int conv_type);
|
||||
int lre_canonicalize(uint32_t c, int is_unicode);
|
||||
|
||||
|
@ -177,6 +177,12 @@ DEF(minus_zero, "-0")
|
||||
DEF(Infinity, "Infinity")
|
||||
DEF(minus_Infinity, "-Infinity")
|
||||
DEF(NaN, "NaN")
|
||||
DEF(hasIndices, "hasIndices")
|
||||
DEF(ignoreCase, "ignoreCase")
|
||||
DEF(multiline, "multiline")
|
||||
DEF(dotAll, "dotAll")
|
||||
DEF(sticky, "sticky")
|
||||
DEF(unicodeSets, "unicodeSets")
|
||||
/* the following 3 atoms are only used with CONFIG_ATOMICS */
|
||||
DEF(not_equal, "not-equal")
|
||||
DEF(timed_out, "timed-out")
|
||||
|
93
quickjs.c
93
quickjs.c
@ -44179,6 +44179,9 @@ static JSValue js_compile_regexp(JSContext *ctx, JSValueConst pattern,
|
||||
case 'u':
|
||||
mask = LRE_FLAG_UNICODE;
|
||||
break;
|
||||
case 'v':
|
||||
mask = LRE_FLAG_UNICODE_SETS;
|
||||
break;
|
||||
case 'y':
|
||||
mask = LRE_FLAG_STICKY;
|
||||
break;
|
||||
@ -44188,14 +44191,20 @@ static JSValue js_compile_regexp(JSContext *ctx, JSValueConst pattern,
|
||||
if ((re_flags & mask) != 0) {
|
||||
bad_flags:
|
||||
JS_FreeCString(ctx, str);
|
||||
return JS_ThrowSyntaxError(ctx, "invalid regular expression flags");
|
||||
goto bad_flags1;
|
||||
}
|
||||
re_flags |= mask;
|
||||
}
|
||||
JS_FreeCString(ctx, str);
|
||||
}
|
||||
|
||||
str = JS_ToCStringLen2(ctx, &len, pattern, !(re_flags & LRE_FLAG_UNICODE));
|
||||
/* 'u' and 'v' cannot be both set */
|
||||
if ((re_flags & LRE_FLAG_UNICODE_SETS) && (re_flags & LRE_FLAG_UNICODE)) {
|
||||
bad_flags1:
|
||||
return JS_ThrowSyntaxError(ctx, "invalid regular expression flags");
|
||||
}
|
||||
|
||||
str = JS_ToCStringLen2(ctx, &len, pattern, !(re_flags & (LRE_FLAG_UNICODE | LRE_FLAG_UNICODE_SETS)));
|
||||
if (!str)
|
||||
return JS_EXCEPTION;
|
||||
re_bytecode_buf = lre_compile(&re_bytecode_len, error_msg,
|
||||
@ -44499,49 +44508,34 @@ static JSValue js_regexp_get_flag(JSContext *ctx, JSValueConst this_val, int mas
|
||||
return JS_NewBool(ctx, flags & mask);
|
||||
}
|
||||
|
||||
#define RE_FLAG_COUNT 8
|
||||
|
||||
static JSValue js_regexp_get_flags(JSContext *ctx, JSValueConst this_val)
|
||||
{
|
||||
char str[8], *p = str;
|
||||
int res;
|
||||
|
||||
char str[RE_FLAG_COUNT], *p = str;
|
||||
int res, i;
|
||||
static const int flag_atom[RE_FLAG_COUNT] = {
|
||||
JS_ATOM_hasIndices,
|
||||
JS_ATOM_global,
|
||||
JS_ATOM_ignoreCase,
|
||||
JS_ATOM_multiline,
|
||||
JS_ATOM_dotAll,
|
||||
JS_ATOM_unicode,
|
||||
JS_ATOM_unicodeSets,
|
||||
JS_ATOM_sticky,
|
||||
};
|
||||
static const char flag_char[RE_FLAG_COUNT] = { 'd', 'g', 'i', 'm', 's', 'u', 'v', 'y' };
|
||||
|
||||
if (JS_VALUE_GET_TAG(this_val) != JS_TAG_OBJECT)
|
||||
return JS_ThrowTypeErrorNotAnObject(ctx);
|
||||
|
||||
res = JS_ToBoolFree(ctx, JS_GetPropertyStr(ctx, this_val, "hasIndices"));
|
||||
if (res < 0)
|
||||
goto exception;
|
||||
if (res)
|
||||
*p++ = 'd';
|
||||
res = JS_ToBoolFree(ctx, JS_GetProperty(ctx, this_val, JS_ATOM_global));
|
||||
if (res < 0)
|
||||
goto exception;
|
||||
if (res)
|
||||
*p++ = 'g';
|
||||
res = JS_ToBoolFree(ctx, JS_GetPropertyStr(ctx, this_val, "ignoreCase"));
|
||||
if (res < 0)
|
||||
goto exception;
|
||||
if (res)
|
||||
*p++ = 'i';
|
||||
res = JS_ToBoolFree(ctx, JS_GetPropertyStr(ctx, this_val, "multiline"));
|
||||
if (res < 0)
|
||||
goto exception;
|
||||
if (res)
|
||||
*p++ = 'm';
|
||||
res = JS_ToBoolFree(ctx, JS_GetPropertyStr(ctx, this_val, "dotAll"));
|
||||
if (res < 0)
|
||||
goto exception;
|
||||
if (res)
|
||||
*p++ = 's';
|
||||
res = JS_ToBoolFree(ctx, JS_GetProperty(ctx, this_val, JS_ATOM_unicode));
|
||||
if (res < 0)
|
||||
goto exception;
|
||||
if (res)
|
||||
*p++ = 'u';
|
||||
res = JS_ToBoolFree(ctx, JS_GetPropertyStr(ctx, this_val, "sticky"));
|
||||
if (res < 0)
|
||||
goto exception;
|
||||
if (res)
|
||||
*p++ = 'y';
|
||||
for(i = 0; i < RE_FLAG_COUNT; i++) {
|
||||
res = JS_ToBoolFree(ctx, JS_GetProperty(ctx, this_val, flag_atom[i]));
|
||||
if (res < 0)
|
||||
goto exception;
|
||||
if (res)
|
||||
*p++ = flag_char[i];
|
||||
}
|
||||
return JS_NewStringLen(ctx, str, p - str);
|
||||
|
||||
exception:
|
||||
@ -45026,14 +45020,12 @@ static JSValue js_regexp_Symbol_match(JSContext *ctx, JSValueConst this_val,
|
||||
goto exception;
|
||||
p = JS_VALUE_GET_STRING(flags);
|
||||
|
||||
// TODO(bnoordhuis) query 'u' flag the same way?
|
||||
global = (-1 != string_indexof_char(p, 'g', 0));
|
||||
if (!global) {
|
||||
A = JS_RegExpExec(ctx, rx, S);
|
||||
} else {
|
||||
fullUnicode = JS_ToBoolFree(ctx, JS_GetProperty(ctx, rx, JS_ATOM_unicode));
|
||||
if (fullUnicode < 0)
|
||||
goto exception;
|
||||
fullUnicode = (string_indexof_char(p, 'u', 0) >= 0 ||
|
||||
string_indexof_char(p, 'v', 0) >= 0);
|
||||
|
||||
if (JS_SetProperty(ctx, rx, JS_ATOM_lastIndex, JS_NewInt32(ctx, 0)) < 0)
|
||||
goto exception;
|
||||
@ -45217,7 +45209,8 @@ static JSValue js_regexp_Symbol_matchAll(JSContext *ctx, JSValueConst this_val,
|
||||
it->iterated_string = S;
|
||||
strp = JS_VALUE_GET_STRING(flags);
|
||||
it->global = string_indexof_char(strp, 'g', 0) >= 0;
|
||||
it->unicode = string_indexof_char(strp, 'u', 0) >= 0;
|
||||
it->unicode = (string_indexof_char(strp, 'u', 0) >= 0 ||
|
||||
string_indexof_char(strp, 'v', 0) >= 0);
|
||||
it->done = FALSE;
|
||||
JS_SetOpaque(iter, it);
|
||||
|
||||
@ -45364,13 +45357,11 @@ static JSValue js_regexp_Symbol_replace(JSContext *ctx, JSValueConst this_val,
|
||||
goto exception;
|
||||
p = JS_VALUE_GET_STRING(flags);
|
||||
|
||||
// TODO(bnoordhuis) query 'u' flag the same way?
|
||||
fullUnicode = 0;
|
||||
is_global = (-1 != string_indexof_char(p, 'g', 0));
|
||||
if (is_global) {
|
||||
fullUnicode = JS_ToBoolFree(ctx, JS_GetProperty(ctx, rx, JS_ATOM_unicode));
|
||||
if (fullUnicode < 0)
|
||||
goto exception;
|
||||
fullUnicode = (string_indexof_char(p, 'u', 0) >= 0 ||
|
||||
string_indexof_char(p, 'v', 0) >= 0);
|
||||
if (JS_SetProperty(ctx, rx, JS_ATOM_lastIndex, JS_NewInt32(ctx, 0)) < 0)
|
||||
goto exception;
|
||||
}
|
||||
@ -45596,7 +45587,8 @@ static JSValue js_regexp_Symbol_split(JSContext *ctx, JSValueConst this_val,
|
||||
if (JS_IsException(flags))
|
||||
goto exception;
|
||||
strp = JS_VALUE_GET_STRING(flags);
|
||||
unicodeMatching = string_indexof_char(strp, 'u', 0) >= 0;
|
||||
unicodeMatching = (string_indexof_char(strp, 'u', 0) >= 0 ||
|
||||
string_indexof_char(strp, 'v', 0) >= 0);
|
||||
if (string_indexof_char(strp, 'y', 0) < 0) {
|
||||
flags = JS_ConcatString3(ctx, "", flags, "y");
|
||||
if (JS_IsException(flags))
|
||||
@ -45707,6 +45699,7 @@ static const JSCFunctionListEntry js_regexp_proto_funcs[] = {
|
||||
JS_CGETSET_MAGIC_DEF("multiline", js_regexp_get_flag, NULL, LRE_FLAG_MULTILINE ),
|
||||
JS_CGETSET_MAGIC_DEF("dotAll", js_regexp_get_flag, NULL, LRE_FLAG_DOTALL ),
|
||||
JS_CGETSET_MAGIC_DEF("unicode", js_regexp_get_flag, NULL, LRE_FLAG_UNICODE ),
|
||||
JS_CGETSET_MAGIC_DEF("unicodeSets", js_regexp_get_flag, NULL, LRE_FLAG_UNICODE_SETS ),
|
||||
JS_CGETSET_MAGIC_DEF("sticky", js_regexp_get_flag, NULL, LRE_FLAG_STICKY ),
|
||||
JS_CGETSET_MAGIC_DEF("hasIndices", js_regexp_get_flag, NULL, LRE_FLAG_INDICES ),
|
||||
JS_CFUNC_DEF("exec", 1, js_regexp_exec ),
|
||||
|
30
test262.conf
30
test262.conf
@ -180,7 +180,7 @@ regexp-match-indices
|
||||
regexp-modifiers=skip
|
||||
regexp-named-groups
|
||||
regexp-unicode-property-escapes
|
||||
regexp-v-flag=skip
|
||||
regexp-v-flag
|
||||
RegExp.escape
|
||||
resizable-arraybuffer=skip
|
||||
rest-parameters
|
||||
@ -250,32 +250,6 @@ test262/test/built-ins/ThrowTypeError/unique-per-realm-function-proto.js
|
||||
#test262/test/built-ins/RegExp/CharacterClassEscapes/
|
||||
#test262/test/built-ins/RegExp/property-escapes/
|
||||
|
||||
# feature regexp-v-flag is missing in the tests
|
||||
test262/test/built-ins/RegExp/CharacterClassEscapes/character-class-digit-class-escape-negative-cases.js
|
||||
test262/test/built-ins/RegExp/CharacterClassEscapes/character-class-digit-class-escape-negative-cases.js
|
||||
test262/test/built-ins/RegExp/CharacterClassEscapes/character-class-digit-class-escape-positive-cases.js
|
||||
test262/test/built-ins/RegExp/CharacterClassEscapes/character-class-digit-class-escape-positive-cases.js
|
||||
test262/test/built-ins/RegExp/CharacterClassEscapes/character-class-non-digit-class-escape-negative-cases.js
|
||||
test262/test/built-ins/RegExp/CharacterClassEscapes/character-class-non-digit-class-escape-negative-cases.js
|
||||
test262/test/built-ins/RegExp/CharacterClassEscapes/character-class-non-digit-class-escape-positive-cases.js
|
||||
test262/test/built-ins/RegExp/CharacterClassEscapes/character-class-non-digit-class-escape-positive-cases.js
|
||||
test262/test/built-ins/RegExp/CharacterClassEscapes/character-class-non-whitespace-class-escape-negative-cases.js
|
||||
test262/test/built-ins/RegExp/CharacterClassEscapes/character-class-non-whitespace-class-escape-negative-cases.js
|
||||
test262/test/built-ins/RegExp/CharacterClassEscapes/character-class-non-whitespace-class-escape-positive-cases.js
|
||||
test262/test/built-ins/RegExp/CharacterClassEscapes/character-class-non-whitespace-class-escape-positive-cases.js
|
||||
test262/test/built-ins/RegExp/CharacterClassEscapes/character-class-non-word-class-escape-negative-cases.js
|
||||
test262/test/built-ins/RegExp/CharacterClassEscapes/character-class-non-word-class-escape-negative-cases.js
|
||||
test262/test/built-ins/RegExp/CharacterClassEscapes/character-class-non-word-class-escape-positive-cases.js
|
||||
test262/test/built-ins/RegExp/CharacterClassEscapes/character-class-non-word-class-escape-positive-cases.js
|
||||
test262/test/built-ins/RegExp/CharacterClassEscapes/character-class-whitespace-class-escape-negative-cases.js
|
||||
test262/test/built-ins/RegExp/CharacterClassEscapes/character-class-whitespace-class-escape-negative-cases.js
|
||||
test262/test/built-ins/RegExp/CharacterClassEscapes/character-class-whitespace-class-escape-positive-cases.js
|
||||
test262/test/built-ins/RegExp/CharacterClassEscapes/character-class-whitespace-class-escape-positive-cases.js
|
||||
test262/test/built-ins/RegExp/CharacterClassEscapes/character-class-word-class-escape-negative-cases.js
|
||||
test262/test/built-ins/RegExp/CharacterClassEscapes/character-class-word-class-escape-negative-cases.js
|
||||
test262/test/built-ins/RegExp/CharacterClassEscapes/character-class-word-class-escape-positive-cases.js
|
||||
test262/test/built-ins/RegExp/CharacterClassEscapes/character-class-word-class-escape-positive-cases.js
|
||||
|
||||
# not yet in official specification
|
||||
test262/test/built-ins/String/prototype/match/cstm-matcher-on-bigint-primitive.js
|
||||
test262/test/built-ins/String/prototype/match/cstm-matcher-on-bigint-primitive.js
|
||||
@ -341,8 +315,6 @@ test262/test/staging/sm/Set/symmetric-difference.js
|
||||
test262/test/staging/sm/Set/union.js
|
||||
test262/test/staging/sm/extensions/censor-strict-caller.js
|
||||
test262/test/staging/sm/JSON/parse-with-source.js
|
||||
test262/test/staging/sm/RegExp/flags.js
|
||||
test262/test/staging/sm/RegExp/prototype.js
|
||||
|
||||
# not standard
|
||||
test262/test/staging/sm/Function/builtin-no-construct.js
|
||||
|
@ -16,13 +16,9 @@ test262/test/staging/sm/JSON/parse-number-syntax.js:39: Test262Error: parsing st
|
||||
test262/test/staging/sm/JSON/parse-syntax-errors-02.js:51: Test262Error: parsing string <["Illegal backslash escape: \x15"]> threw a non-SyntaxError exception: Test262Error: string <["Illegal backslash escape: \x15"]> shouldn't have parsed as JSON Expected SameValue(«false», «true») to be true Expected SameValue(«true», «false») to be true
|
||||
test262/test/staging/sm/Math/cbrt-approx.js:26: Error: got 1.39561242508609, expected a number near 1.3956124250860895 (relative error: 2)
|
||||
test262/test/staging/sm/RegExp/constructor-ordering-2.js:15: Test262Error: Expected SameValue(«false», «true») to be true
|
||||
test262/test/staging/sm/RegExp/match-trace.js:13: Test262Error: Expected SameValue(«"get:flags,get:unicode,set:lastIndex,get:exec,call:exec,get:result[0],get:exec,call:exec,get:result[0],get:exec,call:exec,"», «"get:flags,set:lastIndex,get:exec,call:exec,get:result[0],get:exec,call:exec,get:result[0],get:exec,call:exec,"») to be true
|
||||
test262/test/staging/sm/RegExp/regress-613820-1.js:13: Test262Error: Expected SameValue(«"aaa"», «"aa"») to be true
|
||||
test262/test/staging/sm/RegExp/regress-613820-2.js:13: Test262Error: Expected SameValue(«"f"», «undefined») to be true
|
||||
test262/test/staging/sm/RegExp/regress-613820-3.js:13: Test262Error: Expected SameValue(«"aab"», «"aa"») to be true
|
||||
test262/test/staging/sm/RegExp/replace-trace.js:13: Test262Error: Expected SameValue(«"get:flags,get:unicode,set:lastIndex,get:exec,call:exec,get:result[0],get:exec,call:exec,get:result[length],get:result[0],get:result[index],get:result[groups],"», «"get:flags,set:lastIndex,get:exec,call:exec,get:result[0],get:exec,call:exec,get:result[length],get:result[0],get:result[index],get:result[groups],"») to be true
|
||||
test262/test/staging/sm/RegExp/unicode-ignoreCase-escape.js:22: Test262Error: Actual argument shouldn't be nullish.
|
||||
test262/test/staging/sm/RegExp/unicode-ignoreCase-word-boundary.js:13: Test262Error: Expected SameValue(«false», «true») to be true
|
||||
test262/test/staging/sm/String/match-defines-match-elements.js:52: Test262Error: Expected SameValue(«true», «false») to be true
|
||||
test262/test/staging/sm/TypedArray/constructor-buffer-sequence.js:73: Error: Assertion failed: expected exception ExpectedError, got Error: Poisoned Value
|
||||
test262/test/staging/sm/TypedArray/prototype-constructor-identity.js:17: Test262Error: Expected SameValue(«2», «6») to be true
|
||||
|
@ -751,6 +751,34 @@ function test_regexp()
|
||||
assert(a, ["123a23", "3"]);
|
||||
a = /()*?a/.exec(",");
|
||||
assert(a, null);
|
||||
|
||||
/* test \b escape */
|
||||
assert(/[\q{a\b}]/.test("a\b"), true);
|
||||
assert(/[\b]/.test("\b"), true);
|
||||
|
||||
/* test case insensitive matching (test262 hardly tests it) */
|
||||
assert("aAbBcC#4".replace(/\p{Lower}/gu,"X"), "XAXBXC#4");
|
||||
|
||||
assert("aAbBcC#4".replace(/\p{Lower}/gui,"X"), "XXXXXX#4");
|
||||
assert("aAbBcC#4".replace(/\p{Upper}/gui,"X"), "XXXXXX#4");
|
||||
assert("aAbBcC#4".replace(/\P{Lower}/gui,"X"), "XXXXXXXX");
|
||||
assert("aAbBcC#4".replace(/\P{Upper}/gui,"X"), "XXXXXXXX");
|
||||
assert("aAbBcC".replace(/[^b]/gui, "X"), "XXbBXX");
|
||||
assert("aAbBcC".replace(/[^A-B]/gui, "X"), "aAbBXX");
|
||||
|
||||
assert("aAbBcC#4".replace(/\p{Lower}/gvi,"X"), "XXXXXX#4");
|
||||
assert("aAbBcC#4".replace(/\P{Lower}/gvi,"X"), "aAbBcCXX");
|
||||
assert("aAbBcC#4".replace(/[^\P{Lower}]/gvi,"X"), "XXXXXX#4");
|
||||
assert("aAbBcC#4".replace(/\P{Upper}/gvi,"X"), "aAbBcCXX");
|
||||
assert("aAbBcC".replace(/[^b]/gvi, "X"), "XXbBXX");
|
||||
assert("aAbBcC".replace(/[^A-B]/gvi, "X"), "aAbBXX");
|
||||
assert("aAbBcC".replace(/[[a-c]&&B]/gvi, "X"), "aAXXcC");
|
||||
assert("aAbBcC".replace(/[[a-c]--B]/gvi, "X"), "XXbBXX");
|
||||
|
||||
assert("abcAbC".replace(/[\q{AbC}]/gvi,"X"), "XX");
|
||||
/* Note: SpiderMonkey and v8 may not be correct */
|
||||
assert("abcAbC".replace(/[\q{BC|A}]/gvi,"X"), "XXXX");
|
||||
assert("abcAbC".replace(/[\q{BC|A}--a]/gvi,"X"), "aXAX");
|
||||
}
|
||||
|
||||
function test_symbol()
|
||||
|
@ -1,8 +1,9 @@
|
||||
#!/bin/sh
|
||||
set -e
|
||||
|
||||
url="ftp://ftp.unicode.org/Public/16.0.0/ucd"
|
||||
emoji_url="${url}/emoji/emoji-data.txt"
|
||||
version="16.0.0"
|
||||
emoji_version="16.0"
|
||||
url="ftp://ftp.unicode.org/Public"
|
||||
|
||||
files="CaseFolding.txt DerivedNormalizationProps.txt PropList.txt \
|
||||
SpecialCasing.txt CompositionExclusions.txt ScriptExtensions.txt \
|
||||
@ -12,8 +13,11 @@ PropertyValueAliases.txt"
|
||||
mkdir -p unicode
|
||||
|
||||
for f in $files; do
|
||||
g="${url}/${f}"
|
||||
g="${url}/${version}/ucd/${f}"
|
||||
wget $g -O unicode/$f
|
||||
done
|
||||
|
||||
wget $emoji_url -O unicode/emoji-data.txt
|
||||
wget "${url}/${version}/ucd/emoji/emoji-data.txt" -O unicode/emoji-data.txt
|
||||
|
||||
wget "${url}/emoji/${emoji_version}/emoji-sequences.txt" -O unicode/emoji-sequences.txt
|
||||
wget "${url}/emoji/${emoji_version}/emoji-zwj-sequences.txt" -O unicode/emoji-zwj-sequences.txt
|
||||
|
541
unicode_gen.c
541
unicode_gen.c
@ -156,6 +156,153 @@ char *get_line(char *buf, int buf_size, FILE *f)
|
||||
return buf;
|
||||
}
|
||||
|
||||
/* One string of code points stored in an REStringList bucket chain. */
typedef struct REString {
|
||||
/* next entry of the same hash bucket */
struct REString *next;
|
||||
/* full 32-bit hash given to re_string_find2() when the entry was created */
uint32_t hash;
|
||||
/* number of elements in buf[] */
uint32_t len;
|
||||
/* cleared on creation; bit 0 is set by mark_zwj_string() */
uint32_t flags;
|
||||
/* the code points (flexible array member) */
uint32_t buf[];
|
||||
} REString;
|
||||
|
||||
/* Chained hash table of REString entries. */
typedef struct {
|
||||
/* number of strings currently stored */
uint32_t n_strings;
|
||||
/* number of buckets in hash_table (1 << hash_bits once allocated) */
uint32_t hash_size;
|
||||
/* a bucket index is the top hash_bits bits of the 32-bit hash */
int hash_bits;
|
||||
/* bucket array; NULL after re_string_list_init() */
REString **hash_table;
|
||||
} REStringList;
|
||||
|
||||
/* Hash the 'len' code points of 'buf'. Starting from 1, each element is
   folded in as acc = acc * 263 + element; the result is finally
   multiplied by 0x61C88647. All arithmetic wraps modulo 2^32. */
static uint32_t re_string_hash(int len, const uint32_t *buf)
{
    uint32_t acc = 1;
    int pos = 0;

    while (pos < len) {
        acc = acc * 263 + buf[pos];
        pos++;
    }
    return acc * 0x61C88647;
}
|
||||
|
||||
/* Put 's' in the empty state: no strings, no buckets, no bucket array. */
static void re_string_list_init(REStringList *s)
{
    s->hash_table = NULL;
    s->hash_bits = 0;
    s->hash_size = 0;
    s->n_strings = 0;
}
|
||||
|
||||
/* Free every string stored in 's' and its bucket array, then leave 's'
   in the empty state so that a later free or reuse of the list does not
   touch freed memory. */
static __maybe_unused void re_string_list_free(REStringList *s)
{
    uint32_t i;

    for(i = 0; i < s->hash_size; i++) {
        REString *p = s->hash_table[i];
        while (p != NULL) {
            /* read the link before the node is released */
            REString *p_next = p->next;
            free(p);
            p = p_next;
        }
    }
    free(s->hash_table);
    s->hash_table = NULL;
    s->hash_size = 0;
    s->hash_bits = 0;
    s->n_strings = 0;
}
|
||||
|
||||
/* Print the character 'c' on stdout. A quote or a backslash is written
   with a leading backslash, and so are '-' and ']' when 'is_range' is
   set. Other characters from ' ' to 126 are written as is; anything
   else is written as \u{xxxx} in hexadecimal. */
static void lre_print_char(int c, BOOL is_range)
{
    BOOL need_escape = (c == '\'' || c == '\\') ||
        (is_range && (c == '-' || c == ']'));

    if (need_escape)
        printf("\\%c", c);
    else if (c < ' ' || c > 126)
        printf("\\u{%04x}", c);
    else
        printf("%c", c);
}
|
||||
|
||||
static __maybe_unused void re_string_list_dump(const char *str, const REStringList *s)
|
||||
{
|
||||
REString *p;
|
||||
int i, j, k;
|
||||
|
||||
printf("%s:\n", str);
|
||||
|
||||
j = 0;
|
||||
for(i = 0; i < s->hash_size; i++) {
|
||||
for(p = s->hash_table[i]; p != NULL; p = p->next) {
|
||||
printf(" %d/%d: '", j, s->n_strings);
|
||||
for(k = 0; k < p->len; k++) {
|
||||
lre_print_char(p->buf[k], FALSE);
|
||||
}
|
||||
printf("'\n");
|
||||
j++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Search 's' for the string buf[0..len-1] whose hash is 'h0'.
   Return the entry when it is present. When it is absent, return NULL
   if 'add_flag' is false; otherwise insert a copy of the string (flags
   cleared) at the head of its bucket and return it. NULL is also
   returned when a memory allocation fails. */
static REString *re_string_find2(REStringList *s, int len, const uint32_t *buf,
                                 uint32_t h0, BOOL add_flag)
{
    uint32_t bucket = 0; /* avoid warning */
    REString *ent;

    if (s->n_strings != 0) {
        bucket = h0 >> (32 - s->hash_bits);
        ent = s->hash_table[bucket];
        while (ent != NULL) {
            if (ent->hash == h0 && ent->len == len &&
                memcmp(ent->buf, buf, len * sizeof(buf[0])) == 0)
                return ent;
            ent = ent->next;
        }
    }

    /* not found */
    if (!add_flag)
        return NULL;

    /* grow the table when the new string would exceed the bucket count */
    if (unlikely((s->n_strings + 1) > s->hash_size)) {
        int grown_bits = max_int(s->hash_bits + 1, 4);
        uint32_t grown_size = 1 << grown_bits;
        REString **grown;
        uint32_t idx;

        grown = calloc(grown_size, sizeof(grown[0]));
        if (!grown)
            return NULL;
        /* move every entry to the bucket selected by the new bit count */
        for(idx = 0; idx < s->hash_size; idx++) {
            REString *following;
            for(ent = s->hash_table[idx]; ent != NULL; ent = following) {
                following = ent->next;
                bucket = ent->hash >> (32 - grown_bits);
                ent->next = grown[bucket];
                grown[bucket] = ent;
            }
        }
        free(s->hash_table);
        s->hash_table = grown;
        s->hash_size = grown_size;
        s->hash_bits = grown_bits;
        /* the bucket of the new string depends on the new bit count */
        bucket = h0 >> (32 - s->hash_bits);
    }

    ent = malloc(sizeof(REString) + len * sizeof(buf[0]));
    if (!ent)
        return NULL;
    ent->hash = h0;
    ent->len = len;
    ent->flags = 0;
    memcpy(ent->buf, buf, sizeof(buf[0]) * len);
    ent->next = s->hash_table[bucket];
    s->hash_table[bucket] = ent;
    s->n_strings++;
    return ent;
}
|
||||
|
||||
static REString *re_string_find(REStringList *s, int len, const uint32_t *buf,
|
||||
BOOL add_flag)
|
||||
{
|
||||
uint32_t h0;
|
||||
h0 = re_string_hash(len, buf);
|
||||
return re_string_find2(s, len, buf, h0, add_flag);
|
||||
}
|
||||
|
||||
/* Add the string buf[0..len-1] to 's' unless it is already present.
   With add_flag set, re_string_find() returns NULL only when an
   allocation fails; that case terminates the program instead of
   silently dropping the string. */
static void re_string_add(REStringList *s, int len, const uint32_t *buf)
{
    if (!re_string_find(s, len, buf, TRUE)) {
        fprintf(stderr, "re_string_add: out of memory\n");
        exit(1);
    }
}
|
||||
|
||||
#define UNICODE_GENERAL_CATEGORY
|
||||
|
||||
typedef enum {
|
||||
@ -225,6 +372,23 @@ static const char *unicode_prop_short_name[] = {
|
||||
|
||||
#undef UNICODE_PROP_LIST
|
||||
|
||||
/* Sequence property identifiers and names, both generated from
   "unicode_gen_def.h" by redefining DEF().
   NOTE(review): UNICODE_SEQUENCE_PROP_LIST presumably selects the
   sequence property section of that header - confirm there. */
#define UNICODE_SEQUENCE_PROP_LIST
|
||||
|
||||
typedef enum {
|
||||
/* each DEF(id) becomes the enumerator SEQUENCE_PROP_id */
#define DEF(id) SEQUENCE_PROP_ ## id,
|
||||
#include "unicode_gen_def.h"
|
||||
#undef DEF
|
||||
/* number of enumerators generated above */
SEQUENCE_PROP_COUNT,
|
||||
} UnicodeSequencePropEnum1;
|
||||
|
||||
/* property names, generated from the same DEF() list as the enum */
static const char *unicode_sequence_prop_name[] = {
|
||||
/* each DEF(id) becomes the string "id" */
#define DEF(id) #id,
|
||||
#include "unicode_gen_def.h"
|
||||
#undef DEF
|
||||
};
|
||||
|
||||
#undef UNICODE_SEQUENCE_PROP_LIST
|
||||
|
||||
typedef struct {
|
||||
/* case conv */
|
||||
uint8_t u_len;
|
||||
@ -247,7 +411,15 @@ typedef struct {
|
||||
int *decomp_data;
|
||||
} CCInfo;
|
||||
|
||||
/* NOTE(review): not referenced by the code visible in this part of the
   file; presumably a growable array of int (count used, size
   allocated) - confirm against its users. */
typedef struct {
|
||||
int count;
|
||||
int size;
|
||||
int *tab;
|
||||
} UnicodeSequenceProperties;
|
||||
|
||||
CCInfo *unicode_db;
|
||||
/* RGI_Emoji_ZWJ_Sequence strings collected by add_sequence_prop() */
REStringList rgi_emoji_zwj_sequence;
|
||||
/* RGI_Emoji_Tag_Sequence data filled by add_sequence_prop(): for each
   sequence, the inner code points minus 0xe0000 followed by a 0 byte */
DynBuf rgi_emoji_tag_sequence;
|
||||
|
||||
int find_name(const char **tab, int tab_len, const char *name)
|
||||
{
|
||||
@ -751,6 +923,147 @@ void parse_prop_list(const char *filename)
|
||||
fclose(f);
|
||||
}
|
||||
|
||||
#define SEQ_MAX_LEN 16
|
||||
|
||||
/* Return true if 'c' is in the range 0x1f3fb..0x1f3ff. */
static BOOL is_emoji_modifier(uint32_t c)
{
    if (c < 0x1f3fb)
        return 0;
    return c <= 0x1f3ff;
}
|
||||
|
||||
static void add_sequence_prop(int idx, int seq_len, int *seq)
|
||||
{
|
||||
int i;
|
||||
|
||||
assert(idx < SEQUENCE_PROP_COUNT);
|
||||
switch(idx) {
|
||||
case SEQUENCE_PROP_Basic_Emoji:
|
||||
/* convert to 2 properties lists */
|
||||
if (seq_len == 1) {
|
||||
set_prop(seq[0], PROP_Basic_Emoji1, 1);
|
||||
} else if (seq_len == 2 && seq[1] == 0xfe0f) {
|
||||
set_prop(seq[0], PROP_Basic_Emoji2, 1);
|
||||
} else {
|
||||
abort();
|
||||
}
|
||||
break;
|
||||
case SEQUENCE_PROP_RGI_Emoji_Modifier_Sequence:
|
||||
assert(seq_len == 2);
|
||||
assert(is_emoji_modifier(seq[1]));
|
||||
assert(get_prop(seq[0], PROP_Emoji_Modifier_Base));
|
||||
set_prop(seq[0], PROP_RGI_Emoji_Modifier_Sequence, 1);
|
||||
break;
|
||||
case SEQUENCE_PROP_RGI_Emoji_Flag_Sequence:
|
||||
{
|
||||
int code;
|
||||
assert(seq_len == 2);
|
||||
assert(seq[0] >= 0x1F1E6 && seq[0] <= 0x1F1FF);
|
||||
assert(seq[1] >= 0x1F1E6 && seq[1] <= 0x1F1FF);
|
||||
code = (seq[0] - 0x1F1E6) * 26 + (seq[1] - 0x1F1E6);
|
||||
/* XXX: would be more compact with a simple bitmap -> 676 bits */
|
||||
set_prop(code, PROP_RGI_Emoji_Flag_Sequence, 1);
|
||||
}
|
||||
break;
|
||||
case SEQUENCE_PROP_RGI_Emoji_ZWJ_Sequence:
|
||||
re_string_add(&rgi_emoji_zwj_sequence, seq_len, (uint32_t *)seq);
|
||||
break;
|
||||
case SEQUENCE_PROP_RGI_Emoji_Tag_Sequence:
|
||||
{
|
||||
assert(seq_len >= 3);
|
||||
assert(seq[0] == 0x1F3F4);
|
||||
assert(seq[seq_len - 1] == 0xE007F);
|
||||
for(i = 1; i < seq_len - 1; i++) {
|
||||
assert(seq[i] >= 0xe0001 && seq[i] <= 0xe007e);
|
||||
dbuf_putc(&rgi_emoji_tag_sequence, seq[i] - 0xe0000);
|
||||
}
|
||||
dbuf_putc(&rgi_emoji_tag_sequence, 0);
|
||||
}
|
||||
break;
|
||||
case SEQUENCE_PROP_Emoji_Keycap_Sequence:
|
||||
assert(seq_len == 3);
|
||||
assert(seq[1] == 0xfe0f);
|
||||
assert(seq[2] == 0x20e3);
|
||||
set_prop(seq[0], PROP_Emoji_Keycap_Sequence, 1);
|
||||
break;
|
||||
default:
|
||||
assert(0);
|
||||
}
|
||||
}
|
||||
|
||||
void parse_sequence_prop_list(const char *filename)
|
||||
{
|
||||
FILE *f;
|
||||
char line[4096], *p, buf[256], *q, *p_start;
|
||||
uint32_t c0, c1, c;
|
||||
int idx, seq_len;
|
||||
int seq[SEQ_MAX_LEN];
|
||||
|
||||
f = fopen(filename, "rb");
|
||||
if (!f) {
|
||||
perror(filename);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
for(;;) {
|
||||
if (!get_line(line, sizeof(line), f))
|
||||
break;
|
||||
p = line;
|
||||
while (isspace(*p))
|
||||
p++;
|
||||
if (*p == '#' || *p == '@' || *p == '\0')
|
||||
continue;
|
||||
p_start = p;
|
||||
|
||||
/* find the sequence property name */
|
||||
p = strchr(p, ';');
|
||||
if (!p)
|
||||
continue;
|
||||
p++;
|
||||
p += strspn(p, " \t");
|
||||
q = buf;
|
||||
while (*p != '\0' && *p != ' ' && *p != '#' && *p != '\t' && *p != ';') {
|
||||
if ((q - buf) < sizeof(buf) - 1)
|
||||
*q++ = *p;
|
||||
p++;
|
||||
}
|
||||
*q = '\0';
|
||||
idx = find_name(unicode_sequence_prop_name,
|
||||
countof(unicode_sequence_prop_name), buf);
|
||||
if (idx < 0) {
|
||||
fprintf(stderr, "Property not found: %s\n", buf);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
p = p_start;
|
||||
c0 = strtoul(p, (char **)&p, 16);
|
||||
assert(c0 <= CHARCODE_MAX);
|
||||
|
||||
if (*p == '.' && p[1] == '.') {
|
||||
p += 2;
|
||||
c1 = strtoul(p, (char **)&p, 16);
|
||||
assert(c1 <= CHARCODE_MAX);
|
||||
for(c = c0; c <= c1; c++) {
|
||||
seq[0] = c;
|
||||
add_sequence_prop(idx, 1, seq);
|
||||
}
|
||||
} else {
|
||||
seq_len = 0;
|
||||
seq[seq_len++] = c0;
|
||||
for(;;) {
|
||||
while (isspace(*p))
|
||||
p++;
|
||||
if (*p == ';' || *p == '\0')
|
||||
break;
|
||||
c0 = strtoul(p, (char **)&p, 16);
|
||||
assert(c0 <= CHARCODE_MAX);
|
||||
assert(seq_len < countof(seq));
|
||||
seq[seq_len++] = c0;
|
||||
}
|
||||
add_sequence_prop(idx, seq_len, seq);
|
||||
}
|
||||
}
|
||||
fclose(f);
|
||||
}
|
||||
|
||||
void parse_scripts(const char *filename)
|
||||
{
|
||||
FILE *f;
|
||||
@ -1654,7 +1967,7 @@ void dump_name_table(FILE *f, const char *cname, const char **tab_name, int len,
|
||||
maxw = 0;
|
||||
for(i = 0; i < len; i++) {
|
||||
w = strlen(tab_name[i]);
|
||||
if (tab_short_name[i][0] != '\0') {
|
||||
if (tab_short_name && tab_short_name[i][0] != '\0') {
|
||||
w += 1 + strlen(tab_short_name[i]);
|
||||
}
|
||||
if (maxw < w)
|
||||
@ -1666,7 +1979,7 @@ void dump_name_table(FILE *f, const char *cname, const char **tab_name, int len,
|
||||
for(i = 0; i < len; i++) {
|
||||
fprintf(f, " \"");
|
||||
w = fprintf(f, "%s", tab_name[i]);
|
||||
if (tab_short_name[i][0] != '\0') {
|
||||
if (tab_short_name && tab_short_name[i][0] != '\0') {
|
||||
w += fprintf(f, ",%s", tab_short_name[i]);
|
||||
}
|
||||
fprintf(f, "\"%*s\"\\0\"\n", 1 + maxw - w, "");
|
||||
@ -1930,6 +2243,218 @@ void build_prop_list_table(FILE *f)
|
||||
fprintf(f, "};\n\n");
|
||||
}
|
||||
|
||||
/* Return true if 'c' is in the range 0x1F9B0..0x1F9B3. */
static BOOL is_emoji_hair_color(uint32_t c)
{
    if (c < 0x1F9B0)
        return 0;
    return c <= 0x1F9B3;
}
|
||||
|
||||
#define EMOJI_MOD_NONE 0
|
||||
#define EMOJI_MOD_TYPE1 1
|
||||
#define EMOJI_MOD_TYPE2 2
|
||||
#define EMOJI_MOD_TYPE2D 3
|
||||
|
||||
/* Check that every variant of the string buf[0..len-1] is present in
   'sl', and set bit 0 of the flags of each variant found when
   'mark_flag' is set.
   The variants are generated by writing into 'buf':
   - EMOJI_MOD_NONE: no modifier substitution (1 variant);
   - EMOJI_MOD_TYPE1: the 5 modifiers 0x1f3fb.. at mod_pos[0];
   - EMOJI_MOD_TYPE2: the 25 modifier pairs at mod_pos[0], mod_pos[1];
   - EMOJI_MOD_TYPE2D: the 20 pairs made of two different modifiers;
   and, when hc_pos >= 0, the 4 values 0x1F9B0.. at hc_pos.
   Return FALSE as soon as a variant is missing (buf then holds that
   variant), TRUE when all of them are present. */
static BOOL mark_zwj_string(REStringList *sl, uint32_t *buf, int len, int mod_type, int *mod_pos,
                            int hc_pos, BOOL mark_flag)
{
    REString *ent;
    int variant, n_variants, hair, n_hair, first, second;

    if (mod_type == EMOJI_MOD_NONE)
        n_variants = 1;
    else if (mod_type == EMOJI_MOD_TYPE1)
        n_variants = 5;
    else if (mod_type == EMOJI_MOD_TYPE2)
        n_variants = 25;
    else if (mod_type == EMOJI_MOD_TYPE2D)
        n_variants = 20;
    else
        assert(0);

    n_hair = (hc_pos >= 0) ? 4 : 1;

    /* check that all the related strings are present */
    for(hair = 0; hair < n_hair; hair++) {
        for(variant = 0; variant < n_variants; variant++) {
            if (mod_type == EMOJI_MOD_TYPE1) {
                buf[mod_pos[0]] = 0x1f3fb + variant;
            } else if (mod_type == EMOJI_MOD_TYPE2 ||
                       mod_type == EMOJI_MOD_TYPE2D) {
                first = variant / 5;
                second = variant % 5;
                /* avoid identical values */
                if (mod_type == EMOJI_MOD_TYPE2D && first >= second)
                    first++;
                buf[mod_pos[0]] = 0x1f3fb + first;
                buf[mod_pos[1]] = 0x1f3fb + second;
            } else if (mod_type != EMOJI_MOD_NONE) {
                assert(0);
            }

            if (hc_pos >= 0)
                buf[hc_pos] = 0x1F9B0 + hair;

            ent = re_string_find(sl, len, buf, FALSE);
            if (!ent)
                return FALSE;
            if (mark_flag)
                ent->flags |= 1;
        }
    }
    return TRUE;
}
|
||||
|
||||
/* Append the compact encoding of the sequence buf[0..len-1] to 'dbuf'.
   Each element that starts a group is turned into a 16 bit code:
   - bits 0..12: the code point minus 0x2000 for 0x2000..0x2fff, or the
     code point minus 0x1f000 plus 0x1000 for 0x1f000..0x1ffff (any
     other value triggers assert(0));
   - 'mod_type' shifted left by 13 when an emoji modifier follows (the
     modifier itself is skipped);
   - 0x8000 when 0xfe0f follows (skipped too).
   The 0x200d joining two groups is checked with assert() and skipped.
   The output is the number of codes as one byte, then each code as two
   bytes, low byte first. 'mod_pos' and 'hc_pos' are not used here. */
static void zwj_encode_string(DynBuf *dbuf, const uint32_t *buf, int len, int mod_type, int *mod_pos,
                              int hc_pos)
{
    uint32_t codes[SEQ_MAX_LEN];
    int pos, n_codes, k;

    n_codes = 0;
    pos = 0;
    while (pos < len) {
        int c = buf[pos++];
        int code;

        if (c >= 0x2000 && c <= 0x2fff) {
            code = c - 0x2000;
        } else if (c >= 0x1f000 && c <= 0x1ffff) {
            code = c - 0x1f000 + 0x1000;
        } else {
            assert(0);
        }
        /* modifier */
        if (pos < len && is_emoji_modifier(buf[pos])) {
            code |= (mod_type << 13);
            pos++;
        }
        /* presentation selector present */
        if (pos < len && buf[pos] == 0xfe0f) {
            code |= 0x8000;
            pos++;
        }
        /* zero width join */
        if (pos < len) {
            assert(buf[pos] == 0x200d);
            pos++;
        }
        codes[n_codes++] = code;
    }

    dbuf_putc(dbuf, n_codes);
    for(k = 0; k < n_codes; k++) {
        dbuf_putc(dbuf, codes[k]);
        dbuf_putc(dbuf, codes[k] >> 8);
    }
}
|
||||
|
||||
/* Encode the strings of 'sl' with zwj_encode_string() and write the
   resulting bytes to 'f' as the byte table
   "unicode_rgi_emoji_zwj_sequence". A string containing emoji modifiers
   or a hair color is emitted once for its whole family of variants when
   mark_zwj_string() finds all of them in 'sl'; the other members of the
   family are flagged and skipped when the iteration reaches them. */
static void build_rgi_emoji_zwj_sequence(FILE *f, REStringList *sl)
|
||||
{
|
||||
int mod_pos[2], mod_count, hair_color_pos, j, h;
|
||||
REString *p;
|
||||
/* scratch copy of the current string; mark_zwj_string() writes into it */
uint32_t buf[SEQ_MAX_LEN];
|
||||
DynBuf dbuf;
|
||||
|
||||
/* disabled debug code: print all the strings and exit */
#if 0
|
||||
{
|
||||
for(h = 0; h < sl->hash_size; h++) {
|
||||
for(p = sl->hash_table[h]; p != NULL; p = p->next) {
|
||||
for(j = 0; j < p->len; j++)
|
||||
printf(" %04x", p->buf[j]);
|
||||
printf("\n");
|
||||
}
|
||||
}
|
||||
exit(0);
|
||||
}
|
||||
#endif
|
||||
// printf("rgi_emoji_zwj_sequence: n=%d\n", sl->n_strings);
|
||||
|
||||
dbuf_init(&dbuf);
|
||||
|
||||
/* avoid duplicating strings with emoji modifiers or hair colors */
|
||||
for(h = 0; h < sl->hash_size; h++) {
|
||||
for(p = sl->hash_table[h]; p != NULL; p = p->next) {
|
||||
if (p->flags) /* already examined */
|
||||
continue;
|
||||
mod_count = 0;
|
||||
hair_color_pos = -1;
|
||||
/* copy the string to buf and record the positions of the
   modifiers (at most 2) and of the last hair color */
for(j = 0; j < p->len; j++) {
|
||||
if (is_emoji_modifier(p->buf[j])) {
|
||||
assert(mod_count < 2);
|
||||
mod_pos[mod_count++] = j;
|
||||
} else if (is_emoji_hair_color(p->buf[j])) {
|
||||
hair_color_pos = j;
|
||||
}
|
||||
buf[j] = p->buf[j];
|
||||
}
|
||||
|
||||
if (mod_count != 0 || hair_color_pos >= 0) {
|
||||
int mod_type;
|
||||
if (mod_count == 0)
|
||||
mod_type = EMOJI_MOD_NONE;
|
||||
else if (mod_count == 1)
|
||||
mod_type = EMOJI_MOD_TYPE1;
|
||||
else
|
||||
mod_type = EMOJI_MOD_TYPE2;
|
||||
|
||||
/* first pass only checks that all the variants exist, the second
   pass flags them */
if (mark_zwj_string(sl, buf, p->len, mod_type, mod_pos, hair_color_pos, FALSE)) {
|
||||
mark_zwj_string(sl, buf, p->len, mod_type, mod_pos, hair_color_pos, TRUE);
|
||||
} else if (mod_type == EMOJI_MOD_TYPE2) {
|
||||
/* retry with the pairs made of two different modifiers */
mod_type = EMOJI_MOD_TYPE2D;
|
||||
if (mark_zwj_string(sl, buf, p->len, mod_type, mod_pos, hair_color_pos, FALSE)) {
|
||||
mark_zwj_string(sl, buf, p->len, mod_type, mod_pos, hair_color_pos, TRUE);
|
||||
} else {
|
||||
dump_str("not_found", (int *)p->buf, p->len);
|
||||
/* NOTE(review): the failed checks above have written variant values
   into buf, so buf may differ from p->buf when 'keep' encodes it */
goto keep;
|
||||
}
|
||||
}
|
||||
/* NOTE(review): when the first check fails and mod_type is not
   EMOJI_MOD_TYPE2, execution also reaches this point without any
   string being flagged - confirm this is intended */
/* the family is stored with the hair color 0x1f9b0 */
if (hair_color_pos >= 0)
|
||||
buf[hair_color_pos] = 0x1f9b0;
|
||||
/* encode the string */
|
||||
zwj_encode_string(&dbuf, buf, p->len, mod_type, mod_pos, hair_color_pos);
|
||||
} else {
|
||||
/* also reached by the goto above */
keep:
|
||||
zwj_encode_string(&dbuf, buf, p->len, EMOJI_MOD_NONE, NULL, -1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Encode */
|
||||
dump_byte_table(f, "unicode_rgi_emoji_zwj_sequence", dbuf.buf, dbuf.size);
|
||||
|
||||
dbuf_free(&dbuf);
|
||||
}
|
||||
|
||||
void build_sequence_prop_list_table(FILE *f)
|
||||
{
|
||||
int i;
|
||||
fprintf(f, "typedef enum {\n");
|
||||
for(i = 0; i < SEQUENCE_PROP_COUNT; i++)
|
||||
fprintf(f, " UNICODE_SEQUENCE_PROP_%s,\n", unicode_sequence_prop_name[i]);
|
||||
fprintf(f, " UNICODE_SEQUENCE_PROP_COUNT,\n");
|
||||
fprintf(f, "} UnicodeSequencePropertyEnum;\n\n");
|
||||
|
||||
dump_name_table(f, "unicode_sequence_prop_name_table",
|
||||
unicode_sequence_prop_name, SEQUENCE_PROP_COUNT, NULL);
|
||||
|
||||
dump_byte_table(f, "unicode_rgi_emoji_tag_sequence", rgi_emoji_tag_sequence.buf, rgi_emoji_tag_sequence.size);
|
||||
|
||||
build_rgi_emoji_zwj_sequence(f, &rgi_emoji_zwj_sequence);
|
||||
}
|
||||
|
||||
#ifdef USE_TEST
|
||||
int check_conv(uint32_t *res, uint32_t c, int conv_type)
|
||||
{
|
||||
@ -3156,6 +3681,8 @@ int main(int argc, char *argv[])
|
||||
outfilename = argv[arg++];
|
||||
|
||||
unicode_db = mallocz(sizeof(unicode_db[0]) * (CHARCODE_MAX + 1));
|
||||
re_string_list_init(&rgi_emoji_zwj_sequence);
|
||||
dbuf_init(&rgi_emoji_tag_sequence);
|
||||
|
||||
snprintf(filename, sizeof(filename), "%s/UnicodeData.txt", unicode_db_path);
|
||||
|
||||
@ -3190,6 +3717,14 @@ int main(int argc, char *argv[])
|
||||
unicode_db_path);
|
||||
parse_prop_list(filename);
|
||||
|
||||
snprintf(filename, sizeof(filename), "%s/emoji-sequences.txt",
|
||||
unicode_db_path);
|
||||
parse_sequence_prop_list(filename);
|
||||
|
||||
snprintf(filename, sizeof(filename), "%s/emoji-zwj-sequences.txt",
|
||||
unicode_db_path);
|
||||
parse_sequence_prop_list(filename);
|
||||
|
||||
// dump_unicode_data(unicode_db);
|
||||
build_conv_table(unicode_db);
|
||||
|
||||
@ -3234,10 +3769,12 @@ int main(int argc, char *argv[])
|
||||
build_script_table(fo);
|
||||
build_script_ext_table(fo);
|
||||
build_prop_list_table(fo);
|
||||
build_sequence_prop_list_table(fo);
|
||||
fprintf(fo, "#endif /* CONFIG_ALL_UNICODE */\n");
|
||||
fprintf(fo, "/* %u tables / %u bytes, %u index / %u bytes */\n",
|
||||
total_tables, total_table_bytes, total_index, total_index_bytes);
|
||||
fclose(fo);
|
||||
}
|
||||
re_string_list_free(&rgi_emoji_zwj_sequence);
|
||||
return 0;
|
||||
}
|
||||
|
@ -234,6 +234,11 @@ DEF(XID_Continue1, "")
|
||||
DEF(Changes_When_Titlecased1, "")
|
||||
DEF(Changes_When_Casefolded1, "")
|
||||
DEF(Changes_When_NFKC_Casefolded1, "")
|
||||
DEF(Basic_Emoji1, "")
|
||||
DEF(Basic_Emoji2, "")
|
||||
DEF(RGI_Emoji_Modifier_Sequence, "")
|
||||
DEF(RGI_Emoji_Flag_Sequence, "")
|
||||
DEF(Emoji_Keycap_Sequence, "")
|
||||
|
||||
/* Prop list exported to JS */
|
||||
DEF(ASCII_Hex_Digit, "AHex")
|
||||
@ -301,3 +306,13 @@ DEF(XID_Start, "XIDS")
|
||||
DEF(Cased1, "")
|
||||
|
||||
#endif
|
||||
|
||||
/* List of the sequence properties, one DEF(name) entry per property. It is
   only expanded when UNICODE_SEQUENCE_PROP_LIST is defined; DEF() must be
   defined by the including file.
   NOTE(review): the entry order presumably defines the values of the
   generated UNICODE_SEQUENCE_PROP_* enumeration - confirm against the
   includer before reordering. */
#ifdef UNICODE_SEQUENCE_PROP_LIST
|
||||
DEF(Basic_Emoji)
|
||||
DEF(Emoji_Keycap_Sequence)
|
||||
DEF(RGI_Emoji_Modifier_Sequence)
|
||||
DEF(RGI_Emoji_Flag_Sequence)
|
||||
DEF(RGI_Emoji_Tag_Sequence)
|
||||
DEF(RGI_Emoji_ZWJ_Sequence)
|
||||
DEF(RGI_Emoji)
|
||||
#endif
|
||||
|
Loading…
x
Reference in New Issue
Block a user