updated for version 7.4.293

Problem: It is not possible to ignore composing characters at a specific point in a pattern. Solution: Add the %C item.
2025-07-04 23:07:33 -04:00 · 2014-05-13 19:37:29 +02:00 · 2014-05-13 19:37:29 +02:00 · 8df5acfda9
commit 8df5acfda9
parent 6082bea6ac
4 changed files with 69 additions and 13 deletions
--- a/runtime/doc/pattern.txt
+++ b/runtime/doc/pattern.txt
@ -545,6 +545,7 @@ Character classes {not in Vi}:				*/character-classes*
 |/\%u|	\%u	\%u	match specified multibyte character (eg \%u20ac)
 |/\%U|	\%U	\%U	match specified large multibyte character (eg
 			\%U12345678)
 |/\%C|	\%C	\%C	match any composing characters
 Example			matches ~
 \<\I\i*		or
@ -1207,12 +1208,18 @@ will probably never match.
 8. Composing characters					*patterns-composing*
 							*/\Z*
-When "\Z" appears anywhere in the pattern, composing characters are ignored.
+When "\Z" appears anywhere in the pattern, all composing characters are
-Thus only the base characters need to match, the composing characters may be
+ignored.  Thus only the base characters need to match, the composing
-different and the number of composing characters may differ.  Only relevant
+characters may be different and the number of composing characters may differ.
-when 'encoding' is "utf-8".
+Only relevant when 'encoding' is "utf-8".
 Exception: If the pattern starts with one or more composing characters, these
 must match.
 							*/\%C*
 Use "\%C" to skip any composing characters.  For example, the pattern "a" does
 not match in "càt" (where the a has the composing character 0x0300), but
 "a\%C" does.  Note that this does not match "cát" (where the á is character
 0xe1, it does not have a composing character).  It does match "cat" (where
 the a is just an a).
 When a composing character appears at the start of the pattern or after an
 item that doesn't include the composing character, a match is found at any
--- a/src/regexp.c
+++ b/src/regexp.c
@ -244,6 +244,7 @@
 #define RE_MARK		207	/* mark cmp  Match mark position */
 #define RE_VISUAL	208	/*	Match Visual area */
 #define RE_COMPOSING	209	/* any composing characters */
 /*
 * Magic characters have a special meaning, they don't match literally.
@ -2208,6 +2209,10 @@ regatom(flagp)
 		    ret = regnode(RE_VISUAL);
 		    break;
 		case 'C':
 		    ret = regnode(RE_COMPOSING);
 		    break;
 		/* \%[abc]: Emit as a list of branches, all ending at the last
 		 * branch which matches nothing. */
 		case '[':
@ -4710,11 +4715,13 @@ regmatch(scan)
 			    status = RA_NOMATCH;
 		    }
 #ifdef FEAT_MBYTE
-		    /* Check for following composing character. */
+		    /* Check for following composing character, unless %C
 		     * follows (skips over all composing chars). */
 		    if (status != RA_NOMATCH
 			    && enc_utf8
 			    && UTF_COMPOSINGLIKE(reginput, reginput + len)
-			    && !ireg_icombine)
+			    && !ireg_icombine
 			    && OP(next) != RE_COMPOSING)
 		    {
 			/* raaron: This code makes a composing character get
 			 * ignored, which is the correct behavior (sometimes)
@ -4791,6 +4798,16 @@ regmatch(scan)
 		status = RA_NOMATCH;
 	    break;
 #endif
 	  case RE_COMPOSING:
 #ifdef FEAT_MBYTE
 	    if (enc_utf8)
 	    {
 		/* Skip composing characters. */
 		while (utf_iscomposing(utf_ptr2char(reginput)))
 		    mb_cptr_adv(reginput);
 	    }
 #endif
 	    break;
 	  case NOTHING:
 	    break;
--- a/src/regexp_nfa.c
+++ b/src/regexp_nfa.c
@ -81,6 +81,7 @@ enum
    NFA_COMPOSING,		    /* Next nodes in NFA are part of the
 				       composing multibyte char */
    NFA_END_COMPOSING,		    /* End of a composing char in the NFA */
    NFA_ANY_COMPOSING,		    /* \%C: Any composing characters. */
    NFA_OPT_CHARS,		    /* \%[abc] */
    /* The following are used only in the postfix form, not in the NFA */
@ -1418,6 +1419,10 @@ nfa_regatom()
 		    EMIT(NFA_VISUAL);
 		    break;
 		case 'C':
 		    EMIT(NFA_ANY_COMPOSING);
 		    break;
 		case '[':
 		    {
 			int	    n;
@ -2429,6 +2434,7 @@ nfa_set_code(c)
 	case NFA_MARK_LT:	STRCPY(code, "NFA_MARK_LT "); break;
 	case NFA_CURSOR:	STRCPY(code, "NFA_CURSOR "); break;
 	case NFA_VISUAL:	STRCPY(code, "NFA_VISUAL "); break;
 	case NFA_ANY_COMPOSING:	STRCPY(code, "NFA_ANY_COMPOSING "); break;
 	case NFA_STAR:		STRCPY(code, "NFA_STAR "); break;
 	case NFA_STAR_NONGREEDY: STRCPY(code, "NFA_STAR_NONGREEDY "); break;
@ -2967,6 +2973,7 @@ nfa_max_width(startstate, depth)
 	    case NFA_NLOWER_IC:
 	    case NFA_UPPER_IC:
 	    case NFA_NUPPER_IC:
 	    case NFA_ANY_COMPOSING:
 		/* possibly non-ascii */
 #ifdef FEAT_MBYTE
 		if (has_mbyte)
@ -4152,6 +4159,7 @@ match_follows(startstate, depth)
 		continue;
 	    case NFA_ANY:
 	    case NFA_ANY_COMPOSING:
 	    case NFA_IDENT:
 	    case NFA_SIDENT:
 	    case NFA_KWORD:
@ -4395,7 +4403,7 @@ skip_add:
    switch (state->c)
    {
 	case NFA_MATCH:
-	    nfa_match = TRUE;
+//	    nfa_match = TRUE;
 	    break;
 	case NFA_SPLIT:
@ -5151,6 +5159,7 @@ failure_chance(state, depth)
 	case NFA_MATCH:
 	case NFA_MCLOSE:
 	case NFA_ANY_COMPOSING:
 	    /* empty match works always */
 	    return 0;
@ -5573,6 +5582,12 @@ nfa_regmatch(prog, start, submatch, m)
 	    {
 	    case NFA_MATCH:
 	      {
 #ifdef FEAT_MBYTE
 		/* If the match ends before a composing character and
 		 * ireg_icombine is not set, that is not really a match. */
 		if (enc_utf8 && !ireg_icombine && utf_iscomposing(curc))
 		    break;
 #endif
 		nfa_match = TRUE;
 		copy_sub(&submatch->norm, &t->subs.norm);
 #ifdef FEAT_SYN_HL
@ -6120,6 +6135,23 @@ nfa_regmatch(prog, start, submatch, m)
 		}
 		break;
 	    case NFA_ANY_COMPOSING:
 		/* On a composing character skip over it.  Otherwise do
 		 * nothing.  Always matches. */
 #ifdef FEAT_MBYTE
 		if (enc_utf8 && utf_iscomposing(curc))
 		{
 		    add_off = clen;
 		}
 		else
 #endif
 		{
 		    add_here = TRUE;
 		    add_off = 0;
 		}
 		add_state = t->state->out;
 		break;
 	    /*
 	     * Character classes like \a for alpha, \d for digit etc.
 	     */
@ -6484,12 +6516,10 @@ nfa_regmatch(prog, start, submatch, m)
 		if (!result && ireg_ic)
 		    result = MB_TOLOWER(c) == MB_TOLOWER(curc);
 #ifdef FEAT_MBYTE
-		/* If there is a composing character which is not being
+		/* If ireg_icombine is not set only skip over the character
-		 * ignored there can be no match. Match with composing
+		 * itself.  When it is set skip over composing characters. */
-		 * character uses NFA_COMPOSING above. */
+		if (result && enc_utf8 && !ireg_icombine)
-		if (result && enc_utf8 && !ireg_icombine
+		    clen = utf_char2len(curc);
 						&& clen != utf_char2len(curc))
 		    result = FALSE;
 #endif
 		ADD_STATE_IF_MATCH(t->state);
 		break;
--- a/src/version.c
+++ b/src/version.c
@ -734,6 +734,8 @@ static char *(features[]) =
 static int included_patches[] =
 {   /* Add new patch number below this line */
 /**/
    293,
 /**/
    292,
 /**/