1
0
forked from aniani/vim

updated for version 7.4.001

Problem:    Character classes such as [a-z] do not react to 'ignorecase'.
            Breaks man page highlighting. (Mario Grgic)
Solution:   Add separate items for classes that react to 'ignorecase'.  Clean
            up logic handling character classes.  Add more tests.
This commit is contained in:
Bram Moolenaar
2013-08-14 12:06:49 +02:00
parent 3b1db36689
commit 1cfad52a03
4 changed files with 147 additions and 46 deletions

View File

@@ -29,6 +29,9 @@
# define NFA_REGEXP_DEBUG_LOG "nfa_regexp_debug.log" # define NFA_REGEXP_DEBUG_LOG "nfa_regexp_debug.log"
#endif #endif
/* Added to NFA_ANY - NFA_NUPPER_IC to include a NL. */
#define NFA_ADD_NL 31
enum enum
{ {
NFA_SPLIT = -1024, NFA_SPLIT = -1024,
@@ -183,6 +186,13 @@ enum
NFA_NLOWER, /* Match non-lowercase char */ NFA_NLOWER, /* Match non-lowercase char */
NFA_UPPER, /* Match uppercase char */ NFA_UPPER, /* Match uppercase char */
NFA_NUPPER, /* Match non-uppercase char */ NFA_NUPPER, /* Match non-uppercase char */
NFA_LOWER_IC, /* Match [a-z] */
NFA_NLOWER_IC, /* Match [^a-z] */
NFA_UPPER_IC, /* Match [A-Z] */
NFA_NUPPER_IC, /* Match [^A-Z] */
NFA_FIRST_NL = NFA_ANY + NFA_ADD_NL,
NFA_LAST_NL = NFA_NUPPER_IC + NFA_ADD_NL,
NFA_CURSOR, /* Match cursor pos */ NFA_CURSOR, /* Match cursor pos */
NFA_LNUM, /* Match line number */ NFA_LNUM, /* Match line number */
@@ -199,9 +209,6 @@ enum
NFA_MARK_LT, /* Match < mark */ NFA_MARK_LT, /* Match < mark */
NFA_VISUAL, /* Match Visual area */ NFA_VISUAL, /* Match Visual area */
NFA_FIRST_NL = NFA_ANY + ADD_NL,
NFA_LAST_NL = NFA_NUPPER + ADD_NL,
/* Character classes [:alnum:] etc */ /* Character classes [:alnum:] etc */
NFA_CLASS_ALNUM, NFA_CLASS_ALNUM,
NFA_CLASS_ALPHA, NFA_CLASS_ALPHA,
@@ -578,6 +585,8 @@ realloc_post_list()
* On failure, return 0 (=FAIL) * On failure, return 0 (=FAIL)
* Start points to the first char of the range, while end should point * Start points to the first char of the range, while end should point
* to the closing brace. * to the closing brace.
* Keep in mind that 'ignorecase' applies at execution time, thus [a-z] may
* need to be interpreted as [a-zA-Z].
*/ */
static int static int
nfa_recognize_char_class(start, end, extra_newl) nfa_recognize_char_class(start, end, extra_newl)
@@ -681,7 +690,7 @@ nfa_recognize_char_class(start, end, extra_newl)
return FAIL; return FAIL;
if (newl == TRUE) if (newl == TRUE)
extra_newl = ADD_NL; extra_newl = NFA_ADD_NL;
switch (config) switch (config)
{ {
@@ -710,13 +719,13 @@ nfa_recognize_char_class(start, end, extra_newl)
case CLASS_not | CLASS_az | CLASS_AZ: case CLASS_not | CLASS_az | CLASS_AZ:
return extra_newl + NFA_NALPHA; return extra_newl + NFA_NALPHA;
case CLASS_az: case CLASS_az:
return extra_newl + NFA_LOWER; return extra_newl + NFA_LOWER_IC;
case CLASS_not | CLASS_az: case CLASS_not | CLASS_az:
return extra_newl + NFA_NLOWER; return extra_newl + NFA_NLOWER_IC;
case CLASS_AZ: case CLASS_AZ:
return extra_newl + NFA_UPPER; return extra_newl + NFA_UPPER_IC;
case CLASS_not | CLASS_AZ: case CLASS_not | CLASS_AZ:
return extra_newl + NFA_NUPPER; return extra_newl + NFA_NUPPER_IC;
} }
return FAIL; return FAIL;
} }
@@ -914,7 +923,7 @@ nfa_regatom()
break; break;
} }
extra = ADD_NL; extra = NFA_ADD_NL;
/* "\_[" is collection plus newline */ /* "\_[" is collection plus newline */
if (c == '[') if (c == '[')
@@ -970,7 +979,7 @@ nfa_regatom()
} }
#endif #endif
EMIT(nfa_classcodes[p - classchars]); EMIT(nfa_classcodes[p - classchars]);
if (extra == ADD_NL) if (extra == NFA_ADD_NL)
{ {
EMIT(NFA_NEWL); EMIT(NFA_NEWL);
EMIT(NFA_OR); EMIT(NFA_OR);
@@ -1240,21 +1249,21 @@ collection:
{ {
/* /*
* Try to reverse engineer character classes. For example, * Try to reverse engineer character classes. For example,
* recognize that [0-9] stands for \d and [A-Za-z_] with \h, * recognize that [0-9] stands for \d and [A-Za-z_] for \h,
* and perform the necessary substitutions in the NFA. * and perform the necessary substitutions in the NFA.
*/ */
result = nfa_recognize_char_class(regparse, endp, result = nfa_recognize_char_class(regparse, endp,
extra == ADD_NL); extra == NFA_ADD_NL);
if (result != FAIL) if (result != FAIL)
{ {
if (result >= NFA_DIGIT && result <= NFA_NUPPER) if (result >= NFA_FIRST_NL && result <= NFA_LAST_NL)
EMIT(result);
else /* must be char class + newline */
{ {
EMIT(result - ADD_NL); EMIT(result - NFA_ADD_NL);
EMIT(NFA_NEWL); EMIT(NFA_NEWL);
EMIT(NFA_OR); EMIT(NFA_OR);
} }
else
EMIT(result);
regparse = endp; regparse = endp;
mb_ptr_adv(regparse); mb_ptr_adv(regparse);
return OK; return OK;
@@ -1504,7 +1513,7 @@ collection:
* collection, add an OR below. But not for negated * collection, add an OR below. But not for negated
* range. */ * range. */
if (!negated) if (!negated)
extra = ADD_NL; extra = NFA_ADD_NL;
} }
else else
{ {
@@ -1537,7 +1546,7 @@ collection:
EMIT(NFA_END_COLL); EMIT(NFA_END_COLL);
/* \_[] also matches \n but it's not negated */ /* \_[] also matches \n but it's not negated */
if (extra == ADD_NL) if (extra == NFA_ADD_NL)
{ {
EMIT(reg_string ? NL : NFA_NEWL); EMIT(reg_string ? NL : NFA_NEWL);
EMIT(NFA_OR); EMIT(NFA_OR);
@@ -2011,7 +2020,7 @@ nfa_set_code(c)
if (c >= NFA_FIRST_NL && c <= NFA_LAST_NL) if (c >= NFA_FIRST_NL && c <= NFA_LAST_NL)
{ {
addnl = TRUE; addnl = TRUE;
c -= ADD_NL; c -= NFA_ADD_NL;
} }
STRCPY(code, ""); STRCPY(code, "");
@@ -2217,6 +2226,10 @@ nfa_set_code(c)
case NFA_NLOWER:STRCPY(code, "NFA_NLOWER"); break; case NFA_NLOWER:STRCPY(code, "NFA_NLOWER"); break;
case NFA_UPPER: STRCPY(code, "NFA_UPPER"); break; case NFA_UPPER: STRCPY(code, "NFA_UPPER"); break;
case NFA_NUPPER:STRCPY(code, "NFA_NUPPER"); break; case NFA_NUPPER:STRCPY(code, "NFA_NUPPER"); break;
case NFA_LOWER_IC: STRCPY(code, "NFA_LOWER_IC"); break;
case NFA_NLOWER_IC: STRCPY(code, "NFA_NLOWER_IC"); break;
case NFA_UPPER_IC: STRCPY(code, "NFA_UPPER_IC"); break;
case NFA_NUPPER_IC: STRCPY(code, "NFA_NUPPER_IC"); break;
default: default:
STRCPY(code, "CHAR(x)"); STRCPY(code, "CHAR(x)");
@@ -2687,6 +2700,10 @@ nfa_max_width(startstate, depth)
case NFA_NLOWER: case NFA_NLOWER:
case NFA_UPPER: case NFA_UPPER:
case NFA_NUPPER: case NFA_NUPPER:
case NFA_LOWER_IC:
case NFA_NLOWER_IC:
case NFA_UPPER_IC:
case NFA_NUPPER_IC:
/* possibly non-ascii */ /* possibly non-ascii */
#ifdef FEAT_MBYTE #ifdef FEAT_MBYTE
if (has_mbyte) if (has_mbyte)
@@ -3841,6 +3858,10 @@ match_follows(startstate, depth)
case NFA_NLOWER: case NFA_NLOWER:
case NFA_UPPER: case NFA_UPPER:
case NFA_NUPPER: case NFA_NUPPER:
case NFA_LOWER_IC:
case NFA_NLOWER_IC:
case NFA_UPPER_IC:
case NFA_NUPPER_IC:
case NFA_START_COLL: case NFA_START_COLL:
case NFA_START_NEG_COLL: case NFA_START_NEG_COLL:
case NFA_NEWL: case NFA_NEWL:
@@ -5872,6 +5893,28 @@ nfa_regmatch(prog, start, submatch, m)
ADD_STATE_IF_MATCH(t->state); ADD_STATE_IF_MATCH(t->state);
break; break;
case NFA_LOWER_IC: /* [a-z] */
result = ri_lower(curc) || (ireg_ic && ri_upper(curc));
ADD_STATE_IF_MATCH(t->state);
break;
case NFA_NLOWER_IC: /* [^a-z] */
result = curc != NUL
&& !(ri_lower(curc) || (ireg_ic && ri_upper(curc)));
ADD_STATE_IF_MATCH(t->state);
break;
case NFA_UPPER_IC: /* [A-Z] */
result = ri_upper(curc) || (ireg_ic && ri_lower(curc));
ADD_STATE_IF_MATCH(t->state);
break;
case NFA_NUPPER_IC: /* [^A-Z] */
result = curc != NUL
&& !(ri_upper(curc) || (ireg_ic && ri_lower(curc)));
ADD_STATE_IF_MATCH(t->state);
break;
case NFA_BACKREF1: case NFA_BACKREF1:
case NFA_BACKREF2: case NFA_BACKREF2:
case NFA_BACKREF3: case NFA_BACKREF3:

View File

@@ -289,15 +289,29 @@ STARTTEST
:call add(tl, [2, '.a\%$', " a\n "]) :call add(tl, [2, '.a\%$', " a\n "])
:call add(tl, [2, '.a\%$', " a\n_a", "_a"]) :call add(tl, [2, '.a\%$', " a\n_a", "_a"])
:" :"
:"""" Test recognition of some character classes :"""" Test recognition of character classes
:call add(tl, [2, '[0-9]', '8', '8']) :call add(tl, [2, '[0-7]\+', 'x0123456789x', '01234567'])
:call add(tl, [2, '[^0-9]', '8']) :call add(tl, [2, '[^0-7]\+', '0a;X+% 897', 'a;X+% 89'])
:call add(tl, [2, '[0-9a-fA-F]*', '0a7', '0a7']) :call add(tl, [2, '[0-9]\+', 'x0123456789x', '0123456789'])
:call add(tl, [2, '[^0-9A-Fa-f]\+', '0a7']) :call add(tl, [2, '[^0-9]\+', '0a;X+% 9', 'a;X+% '])
:call add(tl, [2, '[a-z_A-Z0-9]\+', 'aso_sfoij', 'aso_sfoij']) :call add(tl, [2, '[0-9a-fA-F]\+', 'x0189abcdefg', '0189abcdef'])
:call add(tl, [2, '[a-z]', 'a', 'a']) :call add(tl, [2, '[^0-9A-Fa-f]\+', '0189g;X+% ab', 'g;X+% '])
:call add(tl, [2, '[a-zA-Z]', 'a', 'a']) :call add(tl, [2, '[a-z_A-Z0-9]\+', ';+aso_SfOij ', 'aso_SfOij'])
:call add(tl, [2, '[A-Z]', 'a']) :call add(tl, [2, '[^a-z_A-Z0-9]\+', 'aSo_;+% sfOij', ';+% '])
:call add(tl, [2, '[a-z_A-Z]\+', '0abyz_ABYZ;', 'abyz_ABYZ'])
:call add(tl, [2, '[^a-z_A-Z]\+', 'abAB_09;+% yzYZ', '09;+% '])
:call add(tl, [2, '[a-z]\+', '0abcxyz1', 'abcxyz'])
:call add(tl, [2, '[a-z]\+', 'AabxyzZ', 'abxyz'])
:call add(tl, [2, '[^a-z]\+', 'a;X09+% x', ';X09+% '])
:call add(tl, [2, '[^a-z]\+', 'abX0;%yz', 'X0;%'])
:call add(tl, [2, '[a-zA-Z]\+', '0abABxzXZ9', 'abABxzXZ'])
:call add(tl, [2, '[^a-zA-Z]\+', 'ab09_;+ XZ', '09_;+ '])
:call add(tl, [2, '[A-Z]\+', 'aABXYZz', 'ABXYZ'])
:call add(tl, [2, '[^A-Z]\+', 'ABx0;%YZ', 'x0;%'])
:call add(tl, [2, '[a-z]\+\c', '0abxyzABXYZ;', 'abxyzABXYZ'])
:call add(tl, [2, '[A-Z]\+\c', '0abABxzXZ9', 'abABxzXZ'])
:call add(tl, [2, '\c[^a-z]\+', 'ab09_;+ XZ', '09_;+ '])
:call add(tl, [2, '\c[^A-Z]\+', 'ab09_;+ XZ', '09_;+ '])
:call add(tl, [2, '\C[^A-Z]\+', 'ABCOIJDEOIFNSD jsfoij sa', ' jsfoij sa']) :call add(tl, [2, '\C[^A-Z]\+', 'ABCOIJDEOIFNSD jsfoij sa', ' jsfoij sa'])
:" :"
:"""" Tests for \z features :"""" Tests for \z features

View File

@@ -650,30 +650,72 @@ OK 2 - .a\%$
OK 0 - .a\%$ OK 0 - .a\%$
OK 1 - .a\%$ OK 1 - .a\%$
OK 2 - .a\%$ OK 2 - .a\%$
OK 0 - [0-9] OK 0 - [0-7]\+
OK 1 - [0-9] OK 1 - [0-7]\+
OK 2 - [0-9] OK 2 - [0-7]\+
OK 0 - [^0-9] OK 0 - [^0-7]\+
OK 1 - [^0-9] OK 1 - [^0-7]\+
OK 2 - [^0-9] OK 2 - [^0-7]\+
OK 0 - [0-9a-fA-F]* OK 0 - [0-9]\+
OK 1 - [0-9a-fA-F]* OK 1 - [0-9]\+
OK 2 - [0-9a-fA-F]* OK 2 - [0-9]\+
OK 0 - [^0-9]\+
OK 1 - [^0-9]\+
OK 2 - [^0-9]\+
OK 0 - [0-9a-fA-F]\+
OK 1 - [0-9a-fA-F]\+
OK 2 - [0-9a-fA-F]\+
OK 0 - [^0-9A-Fa-f]\+ OK 0 - [^0-9A-Fa-f]\+
OK 1 - [^0-9A-Fa-f]\+ OK 1 - [^0-9A-Fa-f]\+
OK 2 - [^0-9A-Fa-f]\+ OK 2 - [^0-9A-Fa-f]\+
OK 0 - [a-z_A-Z0-9]\+ OK 0 - [a-z_A-Z0-9]\+
OK 1 - [a-z_A-Z0-9]\+ OK 1 - [a-z_A-Z0-9]\+
OK 2 - [a-z_A-Z0-9]\+ OK 2 - [a-z_A-Z0-9]\+
OK 0 - [a-z] OK 0 - [^a-z_A-Z0-9]\+
OK 1 - [a-z] OK 1 - [^a-z_A-Z0-9]\+
OK 2 - [a-z] OK 2 - [^a-z_A-Z0-9]\+
OK 0 - [a-zA-Z] OK 0 - [a-z_A-Z]\+
OK 1 - [a-zA-Z] OK 1 - [a-z_A-Z]\+
OK 2 - [a-zA-Z] OK 2 - [a-z_A-Z]\+
OK 0 - [A-Z] OK 0 - [^a-z_A-Z]\+
OK 1 - [A-Z] OK 1 - [^a-z_A-Z]\+
OK 2 - [A-Z] OK 2 - [^a-z_A-Z]\+
OK 0 - [a-z]\+
OK 1 - [a-z]\+
OK 2 - [a-z]\+
OK 0 - [a-z]\+
OK 1 - [a-z]\+
OK 2 - [a-z]\+
OK 0 - [^a-z]\+
OK 1 - [^a-z]\+
OK 2 - [^a-z]\+
OK 0 - [^a-z]\+
OK 1 - [^a-z]\+
OK 2 - [^a-z]\+
OK 0 - [a-zA-Z]\+
OK 1 - [a-zA-Z]\+
OK 2 - [a-zA-Z]\+
OK 0 - [^a-zA-Z]\+
OK 1 - [^a-zA-Z]\+
OK 2 - [^a-zA-Z]\+
OK 0 - [A-Z]\+
OK 1 - [A-Z]\+
OK 2 - [A-Z]\+
OK 0 - [^A-Z]\+
OK 1 - [^A-Z]\+
OK 2 - [^A-Z]\+
OK 0 - [a-z]\+\c
OK 1 - [a-z]\+\c
OK 2 - [a-z]\+\c
OK 0 - [A-Z]\+\c
OK 1 - [A-Z]\+\c
OK 2 - [A-Z]\+\c
OK 0 - \c[^a-z]\+
OK 1 - \c[^a-z]\+
OK 2 - \c[^a-z]\+
OK 0 - \c[^A-Z]\+
OK 1 - \c[^A-Z]\+
OK 2 - \c[^A-Z]\+
OK 0 - \C[^A-Z]\+ OK 0 - \C[^A-Z]\+
OK 1 - \C[^A-Z]\+ OK 1 - \C[^A-Z]\+
OK 2 - \C[^A-Z]\+ OK 2 - \C[^A-Z]\+

View File

@@ -727,6 +727,8 @@ static char *(features[]) =
static int included_patches[] = static int included_patches[] =
{ /* Add new patch number below this line */ { /* Add new patch number below this line */
/**/
1,
/**/ /**/
0 0
}; };