Do not break within certain combinations with Indic_Conjunct_Break (InCB)=Linker.

https://www.unicode.org/reports/tr29/tr29-43.html#GB9c
This commit is contained in:
Mari Imaizumi 2025-02-24 12:09:26 +09:00
parent e63c516046
commit 6670926a91
Notes: git 2025-03-18 12:18:35 +00:00

View File

@ -5952,17 +5952,19 @@ create_node_from_array(int kind, Node **np, Node **node_array)
* *
* Target Array name Index * Target Array name Index
* *
* node_array 0 1 2 3 4 5 6 7 8 9 A B C D E F * node_array 0 1 2 3 4 5 6 7 8 9 A B C D E F G H
* top_alts alts[5] 0 1 2 3 4* * top_alts alts[5] 0 1 2 3 4*
* alts+1 list[4] 0 1 2 3* * alts+2 list[4] 0 1 2 3*
* list+1 core_alts[7] 0 1 2 3 4 5 6* * list+1 core_alts[8] 0 1 2 3 4 5 6 7*
* core_alts+0 H_list[4] 0 1 2 3* * core_alts+0 H_list[4] 0 1 2 3*
* H_list+1 H_alt2[4] 0 1 2 3* * H_list+1 H_alt2[4] 0 1 2 3*
* h_alt2+1 H_list2[3] 0 1 2* * H_alt2+1 H_list2[3] 0 1 2*
* core_alts+4 XP_list[4] 0 1 2 3* * core_alts+4 XP_list[3] 0 1 2*
* XP_list+1 Ex_list[4] 0 1 2 3* * XP_list+1 Ex_list[4] 0 1 2 3*
* core_alts+5 CC_list[3] 0 1 2*
* CC_list+1 CC_inner_list[5] 0 1 2 3 4*
*/ */
#define NODE_COMMON_SIZE 16 #define NODE_COMMON_SIZE 18
static int static int
node_extended_grapheme_cluster(Node** np, ScanEnv* env) node_extended_grapheme_cluster(Node** np, ScanEnv* env)
@ -6029,9 +6031,10 @@ node_extended_grapheme_cluster(Node** np, ScanEnv* env)
/* core := hangul-syllable /* core := hangul-syllable
* | ri-sequence * | ri-sequence
* | xpicto-sequence * | xpicto-sequence
* | conjunctCluster
* | [^Control CR LF] */ * | [^Control CR LF] */
{ {
Node **core_alts = list + 2; /* size: 7 */ Node **core_alts = list + 2; /* size: 8 */
/* hangul-syllable := /* hangul-syllable :=
* L* (V+ | LV V* | LVT) T* * L* (V+ | LV V* | LVT) T*
@ -6099,10 +6102,49 @@ node_extended_grapheme_cluster(Node** np, ScanEnv* env)
R_ERR(create_node_from_array(LIST, core_alts+4, XP_list)); R_ERR(create_node_from_array(LIST, core_alts+4, XP_list));
} }
/* conjunctCluster := \p{InCB=Consonant} ([\p{InCB=Extend} \p{InCB=Linker}]* \p{InCB=Linker} [\p{InCB=Extend} \p{InCB=Linker}]* \p{InCB=Consonant})+ */
{
// \p{InCB=Consonant}
Node **CC_list = core_alts + 6; /* size: 3 */
R_ERR(create_property_node(CC_list+0, env, "InCB=Consonant"));
{
Node **CC_inner_list = CC_list + 2; /* size: 5 */
{
// [\p{InCB=Extend} \p{InCB=Linker}]*
R_ERR(create_property_node(CC_inner_list+0, env, "InCB=Extend"));
R_ERR(add_property_to_cc(NCCLASS(CC_inner_list[0]), "InCB=Linker", 0, env));
R_ERR(quantify_node(CC_inner_list+0, 0, REPEAT_INFINITE));
}
// \p{InCB=Linker}
R_ERR(create_property_node(CC_inner_list+1, env, "InCB=Linker"));
{
// [\p{InCB=Extend} \p{InCB=Linker}]*
R_ERR(create_property_node(CC_inner_list+2, env, "InCB=Extend"));
R_ERR(add_property_to_cc(NCCLASS(CC_inner_list[2]), "InCB=Linker", 0, env));
R_ERR(quantify_node(CC_inner_list+2, 0, REPEAT_INFINITE));
}
// \p{InCB=Consonant}
R_ERR(create_property_node(CC_inner_list+3, env, "InCB=Consonant"));
// ([\p{InCB=Extend} \p{InCB=Linker}]* \p{InCB=Linker} [\p{InCB=Extend} \p{InCB=Linker}]* \p{InCB=Consonant})
R_ERR(create_node_from_array(LIST, CC_list+1, CC_inner_list));
// (...)+
R_ERR(quantify_node(CC_list+1, 1, REPEAT_INFINITE));
}
// \p{InCB=Consonant} ([\p{InCB=Extend} \p{InCB=Linker}]* \p{InCB=Linker} [\p{InCB=Extend} \p{InCB=Linker}]* \p{InCB=Consonant})+
R_ERR(create_node_from_array(LIST, core_alts+5, CC_list));
}
/* [^Control CR LF] */ /* [^Control CR LF] */
core_alts[5] = node_new_cclass(); core_alts[6] = node_new_cclass();
if (IS_NULL(core_alts[5])) goto err; if (IS_NULL(core_alts[6])) goto err;
cc = NCCLASS(core_alts[5]); cc = NCCLASS(core_alts[6]);
if (ONIGENC_MBC_MINLEN(env->enc) > 1) { /* UTF-16/UTF-32 */ if (ONIGENC_MBC_MINLEN(env->enc) > 1) { /* UTF-16/UTF-32 */
BBuf *inverted_buf = NULL; BBuf *inverted_buf = NULL;