1
0
forked from aniani/vim

patch 9.0.1617: charidx() result is not consistent with byteidx()

Problem:    charidx() and utf16idx() result is not consistent with byteidx().
Solution:   When the index is equal to the length of the text return the
            length of the text instead of -1. (Yegappan Lakshmanan,
            closes #12503)
This commit is contained in:
Yegappan Lakshmanan
2023-06-08 17:09:45 +01:00
committed by Bram Moolenaar
parent 5bf042810b
commit 577922b917
6 changed files with 132 additions and 52 deletions

View File

@@ -1528,11 +1528,13 @@ charidx({string}, {idx} [, {countcc} [, {utf16}]])
When {utf16} is present and TRUE, {idx} is used as the UTF-16
index in the String {expr} instead of as the byte index.
Returns -1 if the arguments are invalid or if {idx} is greater
than the index of the last byte in {string}. An error is
given if the first argument is not a string, the second
argument is not a number or when the third argument is present
and is not zero or one.
Returns -1 if the arguments are invalid or if there are less
than {idx} bytes. If there are exactly {idx} bytes the length
of the string in characters is returned.
An error is given and -1 is returned if the first argument is
not a string, the second argument is not a number or when the
third argument is present and is not zero or one.
See |byteidx()| and |byteidxcomp()| for getting the byte index
from the character index and |utf16idx()| for getting the
@@ -10119,8 +10121,8 @@ uniq({list} [, {func} [, {dict}]]) *uniq()* *E882*
<
*utf16idx()*
utf16idx({string}, {idx} [, {countcc} [, {charidx}]])
Same as |charidx()| but returns the UTF-16 index of the byte
at {idx} in {string} (after converting it to UTF-16).
Same as |charidx()| but returns the UTF-16 code unit index of
the byte at {idx} in {string} (after converting it to UTF-16).
When {charidx} is present and TRUE, {idx} is used as the
character index in the String {string} instead of as the byte
@@ -10128,6 +10130,10 @@ utf16idx({string}, {idx} [, {countcc} [, {charidx}]])
An {idx} in the middle of a UTF-8 sequence is rounded upwards
to the end of that sequence.
Returns -1 if the arguments are invalid or if there are less
than {idx} bytes in {string}. If there are exactly {idx} bytes
the length of the string in UTF-16 code units is returned.
See |byteidx()| and |byteidxcomp()| for getting the byte index
from the UTF-16 index and |charidx()| for getting the
character index from the UTF-16 index.

View File

@@ -1099,6 +1099,7 @@ static argcheck_T arg3_string_string_number[] = {arg_string, arg_string, arg_num
static argcheck_T arg4_number_number_string_any[] = {arg_number, arg_number, arg_string, NULL};
static argcheck_T arg4_string_string_any_string[] = {arg_string, arg_string, NULL, arg_string};
static argcheck_T arg4_string_string_number_string[] = {arg_string, arg_string, arg_number, arg_string};
static argcheck_T arg4_string_number_bool_bool[] = {arg_string, arg_number, arg_bool, arg_bool};
/* Function specific argument types (not covered by the above) */
static argcheck_T arg15_assert_fails[] = {arg_string_or_nr, arg_string_or_list_any, NULL, arg_number, arg_string};
static argcheck_T arg34_assert_inrange[] = {arg_float_or_nr, arg_float_or_nr, arg_float_or_nr, arg_string};
@@ -1814,7 +1815,7 @@ static funcentry_T global_functions[] =
ret_number, f_charclass},
{"charcol", 1, 2, FEARG_1, arg2_string_or_list_number,
ret_number, f_charcol},
{"charidx", 2, 4, FEARG_1, arg3_string_number_bool,
{"charidx", 2, 4, FEARG_1, arg4_string_number_bool_bool,
ret_number, f_charidx},
{"chdir", 1, 1, FEARG_1, arg1_string,
ret_string, f_chdir},
@@ -2798,7 +2799,7 @@ static funcentry_T global_functions[] =
ret_dict_any, f_undotree},
{"uniq", 1, 3, FEARG_1, arg13_sortuniq,
ret_first_arg, f_uniq},
{"utf16idx", 2, 4, FEARG_1, arg3_string_number_bool,
{"utf16idx", 2, 4, FEARG_1, arg4_string_number_bool_bool,
ret_number, f_utf16idx},
{"values", 1, 1, FEARG_1, arg1_dict_any,
ret_list_member, f_values},
@@ -3630,7 +3631,7 @@ f_copy(typval_T *argvars, typval_T *rettv)
/*
* Set the cursor position.
* If 'charcol' is TRUE, then use the column number as a character offset.
* If "charcol" is TRUE, then use the column number as a character offset.
* Otherwise use the column number as a byte offset.
*/
static void

View File

@@ -1054,7 +1054,8 @@ byteidx_common(typval_T *argvars, typval_T *rettv, int comp UNUSED)
if (in_vim9script()
&& (check_for_string_arg(argvars, 0) == FAIL
|| check_for_number_arg(argvars, 1) == FAIL))
|| check_for_number_arg(argvars, 1) == FAIL
|| check_for_opt_bool_arg(argvars, 2) == FAIL))
return;
char_u *str = tv_get_string_chk(&argvars[0]);
@@ -1158,7 +1159,14 @@ f_charidx(typval_T *argvars, typval_T *rettv)
for (p = str, len = 0; utf16idx ? idx >= 0 : p <= str + idx; len++)
{
if (*p == NUL)
{
// If the index is exactly the number of bytes or utf-16 code units
// in the string then return the length of the string in
// characters.
if (utf16idx ? (idx == 0) : (p == (str + idx)))
rettv->vval.v_number = len;
return;
}
if (utf16idx)
{
idx--;
@@ -1775,7 +1783,14 @@ f_utf16idx(typval_T *argvars, typval_T *rettv)
for (p = str, len = 0; charidx ? idx >= 0 : p <= str + idx; len++)
{
if (*p == NUL)
{
// If the index is exactly the number of bytes or characters in the
// string then return the length of the string in utf-16 code
// units.
if (charidx ? (idx == 0) : (p == (str + idx)))
rettv->vval.v_number = len;
return;
}
int clen = ptr2len(p);
int c = (clen > 1) ? utf_ptr2char(p) : *p;
if (c > 0xFFFF)

View File

@@ -1395,7 +1395,8 @@ func Test_charidx()
call assert_equal(1, charidx(a, 3))
call assert_equal(2, charidx(a, 4))
call assert_equal(3, charidx(a, 7))
call assert_equal(-1, charidx(a, 8))
call assert_equal(4, charidx(a, 8))
call assert_equal(-1, charidx(a, 9))
call assert_equal(-1, charidx(a, -1))
" count composing characters
@@ -1403,14 +1404,18 @@ func Test_charidx()
call assert_equal(2, a->charidx(2, 1))
call assert_equal(3, a->charidx(4, 1))
call assert_equal(5, a->charidx(7, 1))
call assert_equal(-1, a->charidx(8, 1))
call assert_equal(6, a->charidx(8, 1))
call assert_equal(-1, a->charidx(9, 1))
" empty string
call assert_equal(-1, charidx('', 0))
call assert_equal(-1, charidx('', 0, 1))
call assert_equal(0, charidx('', 0))
call assert_equal(-1, charidx('', 1))
call assert_equal(0, charidx('', 0, 1))
call assert_equal(-1, charidx('', 1, 1))
" error cases
call assert_equal(-1, charidx(test_null_string(), 0))
call assert_equal(0, charidx(test_null_string(), 0))
call assert_equal(-1, charidx(test_null_string(), 1))
call assert_fails('let x = charidx([], 1)', 'E1174:')
call assert_fails('let x = charidx("abc", [])', 'E1210:')
call assert_fails('let x = charidx("abc", 1, [])', 'E1212:')
@@ -1422,10 +1427,10 @@ endfunc
func Test_charidx_from_utf16_index()
" string with single byte characters
let str = "abc"
for i in range(3)
for i in range(4)
call assert_equal(i, charidx(str, i, v:false, v:true))
endfor
call assert_equal(-1, charidx(str, 3, v:false, v:true))
call assert_equal(-1, charidx(str, 4, v:false, v:true))
" string with two byte characters
let str = "a©©b"
@@ -1433,7 +1438,8 @@ func Test_charidx_from_utf16_index()
call assert_equal(1, charidx(str, 1, v:false, v:true))
call assert_equal(2, charidx(str, 2, v:false, v:true))
call assert_equal(3, charidx(str, 3, v:false, v:true))
call assert_equal(-1, charidx(str, 4, v:false, v:true))
call assert_equal(4, charidx(str, 4, v:false, v:true))
call assert_equal(-1, charidx(str, 5, v:false, v:true))
" string with four byte characters
let str = "a😊😊b"
@@ -1443,38 +1449,48 @@ func Test_charidx_from_utf16_index()
call assert_equal(2, charidx(str, 3, v:false, v:true))
call assert_equal(2, charidx(str, 4, v:false, v:true))
call assert_equal(3, charidx(str, 5, v:false, v:true))
call assert_equal(-1, charidx(str, 6, v:false, v:true))
call assert_equal(4, charidx(str, 6, v:false, v:true))
call assert_equal(-1, charidx(str, 7, v:false, v:true))
" string with composing characters
let str = '-á-b́'
for i in str->strcharlen()->range()
call assert_equal(i, charidx(str, i, v:false, v:true))
endfor
call assert_equal(-1, charidx(str, 4, v:false, v:true))
call assert_equal(4, charidx(str, 4, v:false, v:true))
call assert_equal(-1, charidx(str, 5, v:false, v:true))
for i in str->strchars()->range()
call assert_equal(i, charidx(str, i, v:true, v:true))
endfor
call assert_equal(-1, charidx(str, 6, v:true, v:true))
call assert_equal(6, charidx(str, 6, v:true, v:true))
call assert_equal(-1, charidx(str, 7, v:true, v:true))
" string with multiple composing characters
let str = '-ą́-ą́'
for i in str->strcharlen()->range()
call assert_equal(i, charidx(str, i, v:false, v:true))
endfor
call assert_equal(-1, charidx(str, 4, v:false, v:true))
call assert_equal(4, charidx(str, 4, v:false, v:true))
call assert_equal(-1, charidx(str, 5, v:false, v:true))
for i in str->strchars()->range()
call assert_equal(i, charidx(str, i, v:true, v:true))
endfor
call assert_equal(-1, charidx(str, 8, v:true, v:true))
call assert_equal(8, charidx(str, 8, v:true, v:true))
call assert_equal(-1, charidx(str, 9, v:true, v:true))
" empty string
call assert_equal(-1, charidx('', 0, v:false, v:true))
call assert_equal(-1, charidx('', 0, v:true, v:true))
call assert_equal(0, charidx('', 0, v:false, v:true))
call assert_equal(-1, charidx('', 1, v:false, v:true))
call assert_equal(0, charidx('', 0, v:true, v:true))
call assert_equal(-1, charidx('', 1, v:true, v:true))
" error cases
call assert_equal(-1, charidx('', 0, v:false, v:true))
call assert_equal(-1, charidx('', 0, v:true, v:true))
call assert_equal(-1, charidx(test_null_string(), 0, v:false, v:true))
call assert_equal(0, charidx('', 0, v:false, v:true))
call assert_equal(-1, charidx('', 1, v:false, v:true))
call assert_equal(0, charidx('', 0, v:true, v:true))
call assert_equal(-1, charidx('', 1, v:true, v:true))
call assert_equal(0, charidx(test_null_string(), 0, v:false, v:true))
call assert_equal(-1, charidx(test_null_string(), 1, v:false, v:true))
call assert_fails('let x = charidx("abc", 1, v:false, [])', 'E1212:')
call assert_fails('let x = charidx("abc", 1, v:true, [])', 'E1212:')
endfunc
@@ -1483,10 +1499,10 @@ endfunc
func Test_utf16idx_from_byteidx()
" UTF-16 index of a string with single byte characters
let str = "abc"
for i in range(3)
for i in range(4)
call assert_equal(i, utf16idx(str, i))
endfor
call assert_equal(-1, utf16idx(str, 3))
call assert_equal(-1, utf16idx(str, 4))
" UTF-16 index of a string with two byte characters
let str = 'a©©b'
@@ -1496,7 +1512,8 @@ func Test_utf16idx_from_byteidx()
call assert_equal(2, str->utf16idx(3))
call assert_equal(2, str->utf16idx(4))
call assert_equal(3, str->utf16idx(5))
call assert_equal(-1, str->utf16idx(6))
call assert_equal(4, str->utf16idx(6))
call assert_equal(-1, str->utf16idx(7))
" UTF-16 index of a string with four byte characters
let str = 'a😊😊b'
@@ -1510,7 +1527,8 @@ func Test_utf16idx_from_byteidx()
call assert_equal(4, utf16idx(str, 7))
call assert_equal(4, utf16idx(str, 8))
call assert_equal(5, utf16idx(str, 9))
call assert_equal(-1, utf16idx(str, 10))
call assert_equal(6, utf16idx(str, 10))
call assert_equal(-1, utf16idx(str, 11))
" UTF-16 index of a string with composing characters
let str = '-á-b́'
@@ -1522,7 +1540,8 @@ func Test_utf16idx_from_byteidx()
call assert_equal(3, utf16idx(str, 5))
call assert_equal(3, utf16idx(str, 6))
call assert_equal(3, utf16idx(str, 7))
call assert_equal(-1, utf16idx(str, 8))
call assert_equal(4, utf16idx(str, 8))
call assert_equal(-1, utf16idx(str, 9))
call assert_equal(0, utf16idx(str, 0, v:true))
call assert_equal(1, utf16idx(str, 1, v:true))
call assert_equal(2, utf16idx(str, 2, v:true))
@@ -1531,7 +1550,8 @@ func Test_utf16idx_from_byteidx()
call assert_equal(4, utf16idx(str, 5, v:true))
call assert_equal(5, utf16idx(str, 6, v:true))
call assert_equal(5, utf16idx(str, 7, v:true))
call assert_equal(-1, utf16idx(str, 8, v:true))
call assert_equal(6, utf16idx(str, 8, v:true))
call assert_equal(-1, utf16idx(str, 9, v:true))
" string with multiple composing characters
let str = '-ą́-ą́'
@@ -1547,7 +1567,8 @@ func Test_utf16idx_from_byteidx()
call assert_equal(3, utf16idx(str, 9))
call assert_equal(3, utf16idx(str, 10))
call assert_equal(3, utf16idx(str, 11))
call assert_equal(-1, utf16idx(str, 12))
call assert_equal(4, utf16idx(str, 12))
call assert_equal(-1, utf16idx(str, 13))
call assert_equal(0, utf16idx(str, 0, v:true))
call assert_equal(1, utf16idx(str, 1, v:true))
call assert_equal(2, utf16idx(str, 2, v:true))
@@ -1560,16 +1581,21 @@ func Test_utf16idx_from_byteidx()
call assert_equal(6, utf16idx(str, 9, v:true))
call assert_equal(7, utf16idx(str, 10, v:true))
call assert_equal(7, utf16idx(str, 11, v:true))
call assert_equal(-1, utf16idx(str, 12, v:true))
call assert_equal(8, utf16idx(str, 12, v:true))
call assert_equal(-1, utf16idx(str, 13, v:true))
" empty string
call assert_equal(-1, utf16idx('', 0))
call assert_equal(-1, utf16idx('', 0, v:true))
call assert_equal(0, utf16idx('', 0))
call assert_equal(-1, utf16idx('', 1))
call assert_equal(0, utf16idx('', 0, v:true))
call assert_equal(-1, utf16idx('', 1, v:true))
" error cases
call assert_equal(-1, utf16idx("", 0))
call assert_equal(0, utf16idx("", 0))
call assert_equal(-1, utf16idx("", 1))
call assert_equal(-1, utf16idx("abc", -1))
call assert_equal(-1, utf16idx(test_null_string(), 0))
call assert_equal(0, utf16idx(test_null_string(), 0))
call assert_equal(-1, utf16idx(test_null_string(), 1))
call assert_fails('let l = utf16idx([], 0)', 'E1174:')
call assert_fails('let l = utf16idx("ab", [])', 'E1210:')
call assert_fails('let l = utf16idx("ab", 0, [])', 'E1212:')
@@ -1581,14 +1607,16 @@ func Test_utf16idx_from_charidx()
for i in str->strcharlen()->range()
call assert_equal(i, utf16idx(str, i, v:false, v:true))
endfor
call assert_equal(-1, utf16idx(str, 3, v:false, v:true))
call assert_equal(3, utf16idx(str, 3, v:false, v:true))
call assert_equal(-1, utf16idx(str, 4, v:false, v:true))
" UTF-16 index of a string with two byte characters
let str = "a©©b"
for i in str->strcharlen()->range()
call assert_equal(i, utf16idx(str, i, v:false, v:true))
endfor
call assert_equal(-1, utf16idx(str, 4, v:false, v:true))
call assert_equal(4, utf16idx(str, 4, v:false, v:true))
call assert_equal(-1, utf16idx(str, 5, v:false, v:true))
" UTF-16 index of a string with four byte characters
let str = "a😊😊b"
@@ -1596,36 +1624,44 @@ func Test_utf16idx_from_charidx()
call assert_equal(2, utf16idx(str, 1, v:false, v:true))
call assert_equal(4, utf16idx(str, 2, v:false, v:true))
call assert_equal(5, utf16idx(str, 3, v:false, v:true))
call assert_equal(-1, utf16idx(str, 4, v:false, v:true))
call assert_equal(6, utf16idx(str, 4, v:false, v:true))
call assert_equal(-1, utf16idx(str, 5, v:false, v:true))
" UTF-16 index of a string with composing characters
let str = '-á-b́'
for i in str->strcharlen()->range()
call assert_equal(i, utf16idx(str, i, v:false, v:true))
endfor
call assert_equal(-1, utf16idx(str, 4, v:false, v:true))
call assert_equal(4, utf16idx(str, 4, v:false, v:true))
call assert_equal(-1, utf16idx(str, 5, v:false, v:true))
for i in str->strchars()->range()
call assert_equal(i, utf16idx(str, i, v:true, v:true))
endfor
call assert_equal(-1, utf16idx(str, 6, v:true, v:true))
call assert_equal(6, utf16idx(str, 6, v:true, v:true))
call assert_equal(-1, utf16idx(str, 7, v:true, v:true))
" string with multiple composing characters
let str = '-ą́-ą́'
for i in str->strcharlen()->range()
call assert_equal(i, utf16idx(str, i, v:false, v:true))
endfor
call assert_equal(-1, utf16idx(str, 4, v:false, v:true))
call assert_equal(4, utf16idx(str, 4, v:false, v:true))
call assert_equal(-1, utf16idx(str, 5, v:false, v:true))
for i in str->strchars()->range()
call assert_equal(i, utf16idx(str, i, v:true, v:true))
endfor
call assert_equal(-1, utf16idx(str, 8, v:true, v:true))
call assert_equal(8, utf16idx(str, 8, v:true, v:true))
call assert_equal(-1, utf16idx(str, 9, v:true, v:true))
" empty string
call assert_equal(-1, utf16idx('', 0, v:false, v:true))
call assert_equal(-1, utf16idx('', 0, v:true, v:true))
call assert_equal(0, utf16idx('', 0, v:false, v:true))
call assert_equal(-1, utf16idx('', 1, v:false, v:true))
call assert_equal(0, utf16idx('', 0, v:true, v:true))
call assert_equal(-1, utf16idx('', 1, v:true, v:true))
" error cases
call assert_equal(-1, utf16idx(test_null_string(), 0, v:true, v:true))
call assert_equal(0, utf16idx(test_null_string(), 0, v:true, v:true))
call assert_equal(-1, utf16idx(test_null_string(), 1, v:true, v:true))
call assert_fails('let l = utf16idx("ab", 0, v:false, [])', 'E1212:')
endfunc

View File

@@ -460,6 +460,7 @@ enddef
def Test_byteidx()
v9.CheckDefAndScriptFailure(['byteidx(1, 2)'], ['E1013: Argument 1: type mismatch, expected string but got number', 'E1174: String required for argument 1'])
v9.CheckDefAndScriptFailure(['byteidx("a", "b")'], ['E1013: Argument 2: type mismatch, expected number but got string', 'E1210: Number required for argument 2'])
v9.CheckDefAndScriptFailure(['byteidx("a", 0, "")'], ['E1013: Argument 3: type mismatch, expected bool but got string', 'E1212: Bool required for argument 3'])
byteidx('', 0)->assert_equal(0)
byteidx('', 1)->assert_equal(-1)
enddef
@@ -467,6 +468,7 @@ enddef
def Test_byteidxcomp()
v9.CheckDefAndScriptFailure(['byteidxcomp(1, 2)'], ['E1013: Argument 1: type mismatch, expected string but got number', 'E1174: String required for argument 1'])
v9.CheckDefAndScriptFailure(['byteidxcomp("a", "b")'], ['E1013: Argument 2: type mismatch, expected number but got string', 'E1210: Number required for argument 2'])
v9.CheckDefAndScriptFailure(['byteidxcomp("a", 0, "")'], ['E1013: Argument 3: type mismatch, expected bool but got string', 'E1212: Bool required for argument 3'])
enddef
def Test_call_call()
@@ -702,7 +704,8 @@ def Test_charidx()
v9.CheckDefAndScriptFailure(['charidx(0z10, 1)'], ['E1013: Argument 1: type mismatch, expected string but got blob', 'E1174: String required for argument 1'])
v9.CheckDefAndScriptFailure(['charidx("a", "b")'], ['E1013: Argument 2: type mismatch, expected number but got string', 'E1210: Number required for argument 2'])
v9.CheckDefAndScriptFailure(['charidx("a", 1, "")'], ['E1013: Argument 3: type mismatch, expected bool but got string', 'E1212: Bool required for argument 3'])
charidx('', 0)->assert_equal(-1)
v9.CheckDefAndScriptFailure(['charidx("a", 1, 0, "")'], ['E1013: Argument 4: type mismatch, expected bool but got string', 'E1212: Bool required for argument 4'])
charidx('', 0)->assert_equal(0)
charidx('', 1)->assert_equal(-1)
enddef
@@ -4305,6 +4308,14 @@ def Test_strtrans()
strtrans('')->assert_equal('')
enddef
def Test_strutf16len()
v9.CheckDefAndScriptFailure(['strutf16len([])'], ['E1013: Argument 1: type mismatch, expected string but got list<unknown>', 'E1174: String required for argument 1'])
v9.CheckDefAndScriptFailure(['strutf16len("a", "")'], ['E1013: Argument 2: type mismatch, expected bool but got string', 'E1212: Bool required for argument 2'])
""->strutf16len()->assert_equal(0)
'-ą́-ą́'->strutf16len(true)->assert_equal(8)
'-ą́-ą́'->strutf16len(false)->assert_equal(4)
enddef
def Test_strwidth()
v9.CheckDefAndScriptFailure(['strwidth(10)'], ['E1013: Argument 1: type mismatch, expected string but got number', 'E1174: String required for argument 1'])
assert_equal(4, strwidth('abcd'))
@@ -4727,6 +4738,15 @@ def Test_uniq()
v9.CheckDefFailure(['var l: list<number> = uniq(["a", "b"])'], 'E1012: Type mismatch; expected list<number> but got list<string>')
enddef
def Test_utf16idx()
v9.CheckDefAndScriptFailure(['utf16idx(0z10, 1)'], ['E1013: Argument 1: type mismatch, expected string but got blob', 'E1174: String required for argument 1'])
v9.CheckDefAndScriptFailure(['utf16idx("a", "b")'], ['E1013: Argument 2: type mismatch, expected number but got string', 'E1210: Number required for argument 2'])
v9.CheckDefAndScriptFailure(['utf16idx("a", 1, "")'], ['E1013: Argument 3: type mismatch, expected bool but got string', 'E1212: Bool required for argument 3'])
v9.CheckDefAndScriptFailure(['utf16idx("a", 1, 0, "")'], ['E1013: Argument 4: type mismatch, expected bool but got string', 'E1212: Bool required for argument 4'])
utf16idx('', 0)->assert_equal(0)
utf16idx('', 1)->assert_equal(-1)
enddef
def Test_uniq_const()
var lines =<< trim END
const l = [1, 2, 3, 4]

View File

@@ -695,6 +695,8 @@ static char *(features[]) =
static int included_patches[] =
{ /* Add new patch number below this line */
/**/
1617,
/**/
1616,
/**/