mirror of
https://github.com/vim/vim.git
synced 2025-09-23 03:43:49 -04:00
patch 9.1.0011: regexp cannot match combining chars in collection
Problem: regexp cannot match combining chars in collection Solution: Check for combining characters in regex collections for the NFA and BT Regex Engine Also, while at it, make debug mode work again. fixes #10286 closes: #12871 Signed-off-by: Christian Brabandt <cb@256bit.org>
This commit is contained in:
111
src/regexp_nfa.c
111
src/regexp_nfa.c
@@ -1764,6 +1764,7 @@ collection:
|
||||
endp = skip_anyof(p);
|
||||
if (*endp == ']')
|
||||
{
|
||||
int plen;
|
||||
/*
|
||||
* Try to reverse engineer character classes. For example,
|
||||
* recognize that [0-9] stands for \d and [A-Za-z_] for \h,
|
||||
@@ -2033,13 +2034,43 @@ collection:
|
||||
else
|
||||
{
|
||||
if (got_coll_char == TRUE && startc == 0)
|
||||
{
|
||||
EMIT(0x0a);
|
||||
EMIT(NFA_CONCAT);
|
||||
}
|
||||
else
|
||||
{
|
||||
EMIT(startc);
|
||||
EMIT(NFA_CONCAT);
|
||||
if (!(enc_utf8 && (utf_ptr2len(regparse) != (plen = utfc_ptr2len(regparse)))))
|
||||
{
|
||||
EMIT(NFA_CONCAT);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (enc_utf8 && (utf_ptr2len(regparse) != (plen = utfc_ptr2len(regparse))))
|
||||
{
|
||||
int i = utf_ptr2len(regparse);
|
||||
|
||||
c = utf_ptr2char(regparse + i);
|
||||
|
||||
// Add composing characters
|
||||
for (;;)
|
||||
{
|
||||
if (c == 0)
|
||||
// \x00 is translated to \x0a, start at \x01.
|
||||
EMIT(1);
|
||||
else
|
||||
EMIT(c);
|
||||
EMIT(NFA_CONCAT);
|
||||
if ((i += utf_char2len(c)) >= plen)
|
||||
break;
|
||||
c = utf_ptr2char(regparse + i);
|
||||
}
|
||||
EMIT(NFA_COMPOSING);
|
||||
EMIT(NFA_CONCAT);
|
||||
}
|
||||
MB_PTR_ADV(regparse);
|
||||
} // while (p < endp)
|
||||
|
||||
@@ -6418,6 +6449,84 @@ nfa_regmatch(
|
||||
result_if_matched = (t->state->c == NFA_START_COLL);
|
||||
for (;;)
|
||||
{
|
||||
if (state->c == NFA_COMPOSING)
|
||||
{
|
||||
int mc = curc;
|
||||
int len = 0;
|
||||
nfa_state_T *end;
|
||||
nfa_state_T *sta;
|
||||
int cchars[MAX_MCO];
|
||||
int ccount = 0;
|
||||
int j;
|
||||
|
||||
sta = t->state->out->out;
|
||||
len = 0;
|
||||
if (utf_iscomposing(sta->c))
|
||||
{
|
||||
// Only match composing character(s), ignore base
|
||||
// character. Used for ".{composing}" and "{composing}"
|
||||
// (no preceding character).
|
||||
len += mb_char2len(mc);
|
||||
}
|
||||
if (rex.reg_icombine && len == 0)
|
||||
{
|
||||
// If \Z was present, then ignore composing characters.
|
||||
// When ignoring the base character this always matches.
|
||||
if (sta->c != curc)
|
||||
result = FAIL;
|
||||
else
|
||||
result = OK;
|
||||
while (sta->c != NFA_END_COMPOSING)
|
||||
sta = sta->out;
|
||||
}
|
||||
// Check base character matches first, unless ignored.
|
||||
else if (len > 0 || mc == sta->c)
|
||||
// if (len > 0 || mc == sta->c)
|
||||
{
|
||||
if (len == 0)
|
||||
{
|
||||
len += mb_char2len(mc);
|
||||
sta = sta->out;
|
||||
}
|
||||
|
||||
// We don't care about the order of composing characters.
|
||||
// Get them into cchars[] first.
|
||||
while (len < clen)
|
||||
{
|
||||
mc = mb_ptr2char(rex.input + len);
|
||||
cchars[ccount++] = mc;
|
||||
len += mb_char2len(mc);
|
||||
if (ccount == MAX_MCO)
|
||||
break;
|
||||
}
|
||||
|
||||
// Check that each composing char in the pattern matches a
|
||||
// composing char in the text. We do not check if all
|
||||
// composing chars are matched.
|
||||
result = OK;
|
||||
while (sta->c != NFA_END_COMPOSING)
|
||||
{
|
||||
for (j = 0; j < ccount; ++j)
|
||||
if (cchars[j] == sta->c)
|
||||
break;
|
||||
if (j == ccount)
|
||||
{
|
||||
result = FAIL;
|
||||
break;
|
||||
}
|
||||
sta = sta->out;
|
||||
}
|
||||
}
|
||||
else
|
||||
result = FAIL;
|
||||
|
||||
if (t->state->out->out1->c == NFA_END_COMPOSING)
|
||||
{
|
||||
end = t->state->out->out1;
|
||||
ADD_STATE_IF_MATCH(end);
|
||||
}
|
||||
break;
|
||||
}
|
||||
if (state->c == NFA_END_COLL)
|
||||
{
|
||||
result = !result_if_matched;
|
||||
|
Reference in New Issue
Block a user