0
0
mirror of https://github.com/vim/vim.git synced 2025-07-04 23:07:33 -04:00

updated for version 7.4.293

Problem:    It is not possible to ignore composing characters at a specific
            point in a pattern.
Solution:   Add the %C item.
This commit is contained in:
Bram Moolenaar 2014-05-13 19:37:29 +02:00
parent 6082bea6ac
commit 8df5acfda9
4 changed files with 69 additions and 13 deletions

View File

@ -545,6 +545,7 @@ Character classes {not in Vi}: */character-classes*
|/\%u| \%u \%u match specified multibyte character (eg \%u20ac)
|/\%U| \%U \%U match specified large multibyte character (eg
\%U12345678)
|/\%C| \%C \%C match any composing characters
Example matches ~
\<\I\i* or
@ -1207,12 +1208,18 @@ will probably never match.
8. Composing characters *patterns-composing*
*/\Z*
When "\Z" appears anywhere in the pattern, composing characters are ignored.
Thus only the base characters need to match, the composing characters may be
different and the number of composing characters may differ. Only relevant
when 'encoding' is "utf-8".
When "\Z" appears anywhere in the pattern, all composing characters are
ignored. Thus only the base characters need to match, the composing
characters may be different and the number of composing characters may differ.
Only relevant when 'encoding' is "utf-8".
Exception: If the pattern starts with one or more composing characters, these
must match.
*/\%C*
Use "\%C" to skip any composing characters. For example, the pattern "a" does
not match in "càt" (where the a has the composing character 0x0300), but
"a\%C" does. Note that this does not match "cát" (where the á is character
0xe1, it does not have a composing character). It does match "cat" (where
the a is just an a).
When a composing character appears at the start of the pattern or after an
item that doesn't include the composing character, a match is found at any

View File

@ -244,6 +244,7 @@
#define RE_MARK 207 /* mark cmp Match mark position */
#define RE_VISUAL 208 /* Match Visual area */
#define RE_COMPOSING 209 /* any composing characters */
/*
* Magic characters have a special meaning, they don't match literally.
@ -2208,6 +2209,10 @@ regatom(flagp)
ret = regnode(RE_VISUAL);
break;
case 'C':
ret = regnode(RE_COMPOSING);
break;
/* \%[abc]: Emit as a list of branches, all ending at the last
* branch which matches nothing. */
case '[':
@ -4710,11 +4715,13 @@ regmatch(scan)
status = RA_NOMATCH;
}
#ifdef FEAT_MBYTE
/* Check for following composing character. */
/* Check for following composing character, unless %C
* follows (skips over all composing chars). */
if (status != RA_NOMATCH
&& enc_utf8
&& UTF_COMPOSINGLIKE(reginput, reginput + len)
&& !ireg_icombine)
&& !ireg_icombine
&& OP(next) != RE_COMPOSING)
{
/* raaron: This code makes a composing character get
* ignored, which is the correct behavior (sometimes)
@ -4791,6 +4798,16 @@ regmatch(scan)
status = RA_NOMATCH;
break;
#endif
case RE_COMPOSING:
#ifdef FEAT_MBYTE
if (enc_utf8)
{
/* Skip composing characters. */
while (utf_iscomposing(utf_ptr2char(reginput)))
mb_cptr_adv(reginput);
}
#endif
break;
case NOTHING:
break;

View File

@ -81,6 +81,7 @@ enum
NFA_COMPOSING, /* Next nodes in NFA are part of the
composing multibyte char */
NFA_END_COMPOSING, /* End of a composing char in the NFA */
NFA_ANY_COMPOSING, /* \%C: Any composing characters. */
NFA_OPT_CHARS, /* \%[abc] */
/* The following are used only in the postfix form, not in the NFA */
@ -1418,6 +1419,10 @@ nfa_regatom()
EMIT(NFA_VISUAL);
break;
case 'C':
EMIT(NFA_ANY_COMPOSING);
break;
case '[':
{
int n;
@ -2429,6 +2434,7 @@ nfa_set_code(c)
case NFA_MARK_LT: STRCPY(code, "NFA_MARK_LT "); break;
case NFA_CURSOR: STRCPY(code, "NFA_CURSOR "); break;
case NFA_VISUAL: STRCPY(code, "NFA_VISUAL "); break;
case NFA_ANY_COMPOSING: STRCPY(code, "NFA_ANY_COMPOSING "); break;
case NFA_STAR: STRCPY(code, "NFA_STAR "); break;
case NFA_STAR_NONGREEDY: STRCPY(code, "NFA_STAR_NONGREEDY "); break;
@ -2967,6 +2973,7 @@ nfa_max_width(startstate, depth)
case NFA_NLOWER_IC:
case NFA_UPPER_IC:
case NFA_NUPPER_IC:
case NFA_ANY_COMPOSING:
/* possibly non-ascii */
#ifdef FEAT_MBYTE
if (has_mbyte)
@ -4152,6 +4159,7 @@ match_follows(startstate, depth)
continue;
case NFA_ANY:
case NFA_ANY_COMPOSING:
case NFA_IDENT:
case NFA_SIDENT:
case NFA_KWORD:
@ -4395,7 +4403,7 @@ skip_add:
switch (state->c)
{
case NFA_MATCH:
nfa_match = TRUE;
// nfa_match = TRUE;
break;
case NFA_SPLIT:
@ -5151,6 +5159,7 @@ failure_chance(state, depth)
case NFA_MATCH:
case NFA_MCLOSE:
case NFA_ANY_COMPOSING:
/* empty match works always */
return 0;
@ -5573,6 +5582,12 @@ nfa_regmatch(prog, start, submatch, m)
{
case NFA_MATCH:
{
#ifdef FEAT_MBYTE
/* If the match ends before a composing character and
* ireg_icombine is not set, that is not really a match. */
if (enc_utf8 && !ireg_icombine && utf_iscomposing(curc))
break;
#endif
nfa_match = TRUE;
copy_sub(&submatch->norm, &t->subs.norm);
#ifdef FEAT_SYN_HL
@ -6120,6 +6135,23 @@ nfa_regmatch(prog, start, submatch, m)
}
break;
case NFA_ANY_COMPOSING:
/* On a composing character skip over it. Otherwise do
* nothing. Always matches. */
#ifdef FEAT_MBYTE
if (enc_utf8 && utf_iscomposing(curc))
{
add_off = clen;
}
else
#endif
{
add_here = TRUE;
add_off = 0;
}
add_state = t->state->out;
break;
/*
* Character classes like \a for alpha, \d for digit etc.
*/
@ -6484,12 +6516,10 @@ nfa_regmatch(prog, start, submatch, m)
if (!result && ireg_ic)
result = MB_TOLOWER(c) == MB_TOLOWER(curc);
#ifdef FEAT_MBYTE
/* If there is a composing character which is not being
* ignored there can be no match. Match with composing
* character uses NFA_COMPOSING above. */
if (result && enc_utf8 && !ireg_icombine
&& clen != utf_char2len(curc))
result = FALSE;
/* If ireg_icombine is not set only skip over the character
* itself. When it is set skip over composing characters. */
if (result && enc_utf8 && !ireg_icombine)
clen = utf_char2len(curc);
#endif
ADD_STATE_IF_MATCH(t->state);
break;

View File

@ -734,6 +734,8 @@ static char *(features[]) =
static int included_patches[] =
{ /* Add new patch number below this line */
/**/
293,
/**/
292,
/**/