1
0
forked from aniani/vim

patch 9.0.1485: no functions for converting from/to UTF-16 index

Problem:    no functions for converting from/to UTF-16 index.
Solution:   Add UTF-16 flag to existing functions and add strutf16len() and
            utf16idx(). (Yegappan Lakshmanan, closes #12216)
This commit is contained in:
Christian Brabandt
2023-04-24 21:09:54 +01:00
committed by Bram Moolenaar
parent e1b4822137
commit 67672ef097
8 changed files with 677 additions and 56 deletions

View File

@@ -81,8 +81,10 @@ bufnr([{buf} [, {create}]]) Number Number of the buffer {buf}
bufwinid({buf}) Number window ID of buffer {buf} bufwinid({buf}) Number window ID of buffer {buf}
bufwinnr({buf}) Number window number of buffer {buf} bufwinnr({buf}) Number window number of buffer {buf}
byte2line({byte}) Number line number at byte count {byte} byte2line({byte}) Number line number at byte count {byte}
byteidx({expr}, {nr}) Number byte index of {nr}'th char in {expr} byteidx({expr}, {nr} [, {utf16}])
byteidxcomp({expr}, {nr}) Number byte index of {nr}'th char in {expr} Number byte index of {nr}'th char in {expr}
byteidxcomp({expr}, {nr} [, {utf16}])
Number byte index of {nr}'th char in {expr}
call({func}, {arglist} [, {dict}]) call({func}, {arglist} [, {dict}])
any call {func} with arguments {arglist} any call {func} with arguments {arglist}
ceil({expr}) Float round {expr} up ceil({expr}) Float round {expr} up
@@ -117,7 +119,7 @@ changenr() Number current change number
char2nr({expr} [, {utf8}]) Number ASCII/UTF-8 value of first char in {expr} char2nr({expr} [, {utf8}]) Number ASCII/UTF-8 value of first char in {expr}
charclass({string}) Number character class of {string} charclass({string}) Number character class of {string}
charcol({expr} [, {winid}]) Number column number of cursor or mark charcol({expr} [, {winid}]) Number column number of cursor or mark
charidx({string}, {idx} [, {countcc}]) charidx({string}, {idx} [, {countcc} [, {utf16}]])
Number char index of byte {idx} in {string} Number char index of byte {idx} in {string}
chdir({dir}) String change current working directory chdir({dir}) String change current working directory
cindent({lnum}) Number C indent for line {lnum} cindent({lnum}) Number C indent for line {lnum}
@@ -604,6 +606,8 @@ strptime({format}, {timestring})
strridx({haystack}, {needle} [, {start}]) strridx({haystack}, {needle} [, {start}])
Number last index of {needle} in {haystack} Number last index of {needle} in {haystack}
strtrans({expr}) String translate string to make it printable strtrans({expr}) String translate string to make it printable
strutf16len({string} [, {countcc}])
Number number of UTF-16 code units in {string}
strwidth({expr}) Number display cell length of the String {expr} strwidth({expr}) Number display cell length of the String {expr}
submatch({nr} [, {list}]) String or List submatch({nr} [, {list}]) String or List
specific match in ":s" or substitute() specific match in ":s" or substitute()
@@ -704,6 +708,8 @@ undofile({name}) String undo file name for {name}
undotree() List undo file tree undotree() List undo file tree
uniq({list} [, {func} [, {dict}]]) uniq({list} [, {func} [, {dict}]])
List remove adjacent duplicates from a list List remove adjacent duplicates from a list
utf16idx({string}, {idx} [, {countcc} [, {charidx}]])
Number UTF-16 index of byte {idx} in {string}
values({dict}) List values in {dict} values({dict}) List values in {dict}
virtcol({expr} [, {list}]) Number or List virtcol({expr} [, {list}]) Number or List
screen column of cursor or mark screen column of cursor or mark
@@ -1363,7 +1369,7 @@ byte2line({byte}) *byte2line()*
< {not available when compiled without the |+byte_offset| < {not available when compiled without the |+byte_offset|
feature} feature}
byteidx({expr}, {nr}) *byteidx()* byteidx({expr}, {nr} [, {utf16}]) *byteidx()*
Return byte index of the {nr}'th character in the String Return byte index of the {nr}'th character in the String
{expr}. Use zero for the first character, it then returns {expr}. Use zero for the first character, it then returns
zero. zero.
@@ -1373,6 +1379,13 @@ byteidx({expr}, {nr}) *byteidx()*
length is added to the preceding base character. See length is added to the preceding base character. See
|byteidxcomp()| below for counting composing characters |byteidxcomp()| below for counting composing characters
separately. separately.
When {utf16} is present and TRUE, {nr} is used as the UTF-16
index in the String {expr} instead of as the character index.
The UTF-16 index is the index in the string when it is encoded
with 16-bit words. If the specified UTF-16 index is in the
middle of a character (e.g. in a 4-byte character), then the
byte index of the first byte in the character is returned.
Refer to |string-offset-encoding| for more information.
Example : > Example : >
echo matchstr(str, ".", byteidx(str, 3)) echo matchstr(str, ".", byteidx(str, 3))
< will display the fourth character. Another way to do the < will display the fourth character. Another way to do the
@@ -1384,11 +1397,17 @@ byteidx({expr}, {nr}) *byteidx()*
If there are less than {nr} characters -1 is returned. If there are less than {nr} characters -1 is returned.
If there are exactly {nr} characters the length of the string If there are exactly {nr} characters the length of the string
in bytes is returned. in bytes is returned.
See |charidx()| and |utf16idx()| for getting the character and
UTF-16 index respectively from the byte index.
Examples: >
echo byteidx('a😊😊', 2) returns 5
echo byteidx('a😊😊', 2, 1) returns 1
echo byteidx('a😊😊', 3, 1) returns 5
<
Can also be used as a |method|: > Can also be used as a |method|: >
GetName()->byteidx(idx) GetName()->byteidx(idx)
byteidxcomp({expr}, {nr}) *byteidxcomp()* byteidxcomp({expr}, {nr} [, {utf16}]) *byteidxcomp()*
Like byteidx(), except that a composing character is counted Like byteidx(), except that a composing character is counted
as a separate character. Example: > as a separate character. Example: >
let s = 'e' .. nr2char(0x301) let s = 'e' .. nr2char(0x301)
@@ -1493,27 +1512,36 @@ charcol({expr} [, {winid}]) *charcol()*
GetPos()->col() GetPos()->col()
< <
*charidx()* *charidx()*
charidx({string}, {idx} [, {countcc}]) charidx({string}, {idx} [, {countcc} [, {utf16}]])
Return the character index of the byte at {idx} in {string}. Return the character index of the byte at {idx} in {string}.
The index of the first character is zero. The index of the first character is zero.
If there are no multibyte characters the returned value is If there are no multibyte characters the returned value is
equal to {idx}. equal to {idx}.
When {countcc} is omitted or |FALSE|, then composing characters When {countcc} is omitted or |FALSE|, then composing characters
are not counted separately, their byte length is are not counted separately, their byte length is added to the
added to the preceding base character. preceding base character.
When {countcc} is |TRUE|, then composing characters are When {countcc} is |TRUE|, then composing characters are
counted as separate characters. counted as separate characters.
When {utf16} is present and TRUE, {idx} is used as the UTF-16
index in the String {string} instead of as the byte index.
Returns -1 if the arguments are invalid or if {idx} is greater Returns -1 if the arguments are invalid or if {idx} is greater
than the index of the last byte in {string}. An error is than the index of the last byte in {string}. An error is
given if the first argument is not a string, the second given if the first argument is not a string, the second
argument is not a number or when the third argument is present argument is not a number or when the third argument is present
and is not zero or one. and is not zero or one.
See |byteidx()| and |byteidxcomp()| for getting the byte index See |byteidx()| and |byteidxcomp()| for getting the byte index
from the character index. from the character index and |utf16idx()| for getting the
UTF-16 index from the character index.
Refer to |string-offset-encoding| for more information.
Examples: > Examples: >
echo charidx('áb́ć', 3) returns 1 echo charidx('áb́ć', 3) returns 1
echo charidx('áb́ć', 6, 1) returns 4 echo charidx('áb́ć', 6, 1) returns 4
echo charidx('áb́ć', 16) returns -1 echo charidx('áb́ć', 16) returns -1
echo charidx('a😊😊', 4, 0, 1) returns 2
< <
Can also be used as a |method|: > Can also be used as a |method|: >
GetName()->charidx(idx) GetName()->charidx(idx)
@@ -9244,6 +9272,28 @@ strtrans({string}) *strtrans()*
Can also be used as a |method|: > Can also be used as a |method|: >
GetString()->strtrans() GetString()->strtrans()
strutf16len({string} [, {countcc}]) *strutf16len()*
The result is a Number, which is the number of UTF-16 code
units in String {string} (after converting it to UTF-16).
When {countcc} is TRUE, composing characters are counted
separately.
When {countcc} is omitted or FALSE, composing characters are
ignored.
Returns zero on error.
Also see |strlen()| and |strcharlen()|.
Examples: >
echo strutf16len('a') returns 1
echo strutf16len('©') returns 1
echo strutf16len('😊') returns 2
echo strutf16len('ą́') returns 1
echo strutf16len('ą́', v:true) returns 3
<
Can also be used as a |method|: >
GetText()->strutf16len()
<
strwidth({string}) *strwidth()* strwidth({string}) *strwidth()*
The result is a Number, which is the number of display cells The result is a Number, which is the number of display cells
String {string} occupies. A Tab character is counted as one String {string} occupies. A Tab character is counted as one
@@ -10059,6 +10109,34 @@ uniq({list} [, {func} [, {dict}]]) *uniq()* *E882*
Can also be used as a |method|: > Can also be used as a |method|: >
mylist->uniq() mylist->uniq()
<
*utf16idx()*
utf16idx({string}, {idx} [, {countcc} [, {charidx}]])
Same as |charidx()| but returns the UTF-16 index of the byte
at {idx} in {string} (after converting it to UTF-16).
When {charidx} is present and TRUE, {idx} is used as the
character index in the String {string} instead of as the byte
index.
An {idx} in the middle of a UTF-8 sequence is rounded upwards
to the end of that sequence.
See |byteidx()| and |byteidxcomp()| for getting the byte index
from the UTF-16 index and |charidx()| for getting the
character index from the UTF-16 index.
Refer to |string-offset-encoding| for more information.
Examples: >
echo utf16idx('a😊😊', 3) returns 2
echo utf16idx('a😊😊', 7) returns 4
echo utf16idx('a😊😊', 1, 0, 1) returns 2
echo utf16idx('a😊😊', 2, 0, 1) returns 4
echo utf16idx('aą́c', 6) returns 2
echo utf16idx('aą́c', 6, 1) returns 4
echo utf16idx('a😊😊', 9) returns -1
<
Can also be used as a |method|: >
GetName()->utf16idx(idx)
values({dict}) *values()* values({dict}) *values()*
Return a |List| with all the values of {dict}. The |List| is Return a |List| with all the values of {dict}. The |List| is

View File

@@ -1580,6 +1580,33 @@ Examples: >
echo $"The square root of {{9}} is {sqrt(9)}" echo $"The square root of {{9}} is {sqrt(9)}"
< The square root of {9} is 3.0 ~ < The square root of {9} is 3.0 ~
*string-offset-encoding*
A string consists of multiple characters. How the characters are stored
depends on 'encoding'. Most common is UTF-8, which uses one byte for ASCII
characters, two bytes for other Latin characters and more bytes for other
characters.
A string offset can count characters or bytes. Other programs may use
UTF-16 encoding (16-bit words) and an offset of UTF-16 words. Some functions
use byte offsets, usually for UTF-8 encoding. Other functions use character
offsets, in which case the encoding doesn't matter.
The different offsets for the string "a©😊" are below:
UTF-8 offsets:
[0]: 61, [1]: C2, [2]: A9, [3]: F0, [4]: 9F, [5]: 98, [6]: 8A
UTF-16 offsets:
[0]: 0061, [1]: 00A9, [2]: D83D, [3]: DE0A
UTF-32 (character) offsets:
[0]: 00000061, [1]: 000000A9, [2]: 0001F60A
You can use the "g8" and "ga" commands on a character to see the
decimal/hex/octal values.
The functions |byteidx()|, |utf16idx()| and |charidx()| can be used to convert
between these indices. The functions |strlen()|, |strutf16len()| and
|strcharlen()| return the number of bytes, UTF-16 code units and characters in
a string respectively.
option *expr-option* *E112* *E113* option *expr-option* *E112* *E113*
------ ------

View File

@@ -754,6 +754,7 @@ String manipulation: *string-functions*
strlen() length of a string in bytes strlen() length of a string in bytes
strcharlen() length of a string in characters strcharlen() length of a string in characters
strchars() number of characters in a string strchars() number of characters in a string
strutf16len() number of UTF-16 code units in a string
strwidth() size of string when displayed strwidth() size of string when displayed
strdisplaywidth() size of string when displayed, deals with tabs strdisplaywidth() size of string when displayed, deals with tabs
setcellwidths() set character cell width overrides setcellwidths() set character cell width overrides
@@ -771,6 +772,7 @@ String manipulation: *string-functions*
byteidx() byte index of a character in a string byteidx() byte index of a character in a string
byteidxcomp() like byteidx() but count composing characters byteidxcomp() like byteidx() but count composing characters
charidx() character index of a byte in a string charidx() character index of a byte in a string
utf16idx() UTF-16 index of a byte in a string
repeat() repeat a string multiple times repeat() repeat a string multiple times
eval() evaluate a string expression eval() evaluate a string expression
execute() execute an Ex command and get the output execute() execute an Ex command and get the output

View File

@@ -1751,9 +1751,9 @@ static funcentry_T global_functions[] =
ret_number, f_bufwinnr}, ret_number, f_bufwinnr},
{"byte2line", 1, 1, FEARG_1, arg1_number, {"byte2line", 1, 1, FEARG_1, arg1_number,
ret_number, f_byte2line}, ret_number, f_byte2line},
{"byteidx", 2, 2, FEARG_1, arg2_string_number, {"byteidx", 2, 3, FEARG_1, arg3_string_number_bool,
ret_number, f_byteidx}, ret_number, f_byteidx},
{"byteidxcomp", 2, 2, FEARG_1, arg2_string_number, {"byteidxcomp", 2, 3, FEARG_1, arg3_string_number_bool,
ret_number, f_byteidxcomp}, ret_number, f_byteidxcomp},
{"call", 2, 3, FEARG_1, arg3_any_list_dict, {"call", 2, 3, FEARG_1, arg3_any_list_dict,
ret_any, f_call}, ret_any, f_call},
@@ -1803,7 +1803,7 @@ static funcentry_T global_functions[] =
ret_number, f_charclass}, ret_number, f_charclass},
{"charcol", 1, 2, FEARG_1, arg2_string_or_list_number, {"charcol", 1, 2, FEARG_1, arg2_string_or_list_number,
ret_number, f_charcol}, ret_number, f_charcol},
{"charidx", 2, 3, FEARG_1, arg3_string_number_bool, {"charidx", 2, 4, FEARG_1, arg3_string_number_bool,
ret_number, f_charidx}, ret_number, f_charidx},
{"chdir", 1, 1, FEARG_1, arg1_string, {"chdir", 1, 1, FEARG_1, arg1_string,
ret_string, f_chdir}, ret_string, f_chdir},
@@ -2601,6 +2601,8 @@ static funcentry_T global_functions[] =
ret_number, f_strridx}, ret_number, f_strridx},
{"strtrans", 1, 1, FEARG_1, arg1_string, {"strtrans", 1, 1, FEARG_1, arg1_string,
ret_string, f_strtrans}, ret_string, f_strtrans},
{"strutf16len", 1, 2, FEARG_1, arg2_string_bool,
ret_number, f_strutf16len},
{"strwidth", 1, 1, FEARG_1, arg1_string, {"strwidth", 1, 1, FEARG_1, arg1_string,
ret_number, f_strwidth}, ret_number, f_strwidth},
{"submatch", 1, 2, FEARG_1, arg2_number_bool, {"submatch", 1, 2, FEARG_1, arg2_number_bool,
@@ -2785,6 +2787,8 @@ static funcentry_T global_functions[] =
ret_dict_any, f_undotree}, ret_dict_any, f_undotree},
{"uniq", 1, 3, FEARG_1, arg13_sortuniq, {"uniq", 1, 3, FEARG_1, arg13_sortuniq,
ret_first_arg, f_uniq}, ret_first_arg, f_uniq},
{"utf16idx", 2, 4, FEARG_1, arg3_string_number_bool,
ret_number, f_utf16idx},
{"values", 1, 1, FEARG_1, arg1_dict_any, {"values", 1, 1, FEARG_1, arg1_dict_any,
ret_list_member, f_values}, ret_list_member, f_values},
{"virtcol", 1, 2, FEARG_1, arg2_string_or_list_bool, {"virtcol", 1, 2, FEARG_1, arg2_string_or_list_bool,

View File

@@ -36,12 +36,14 @@ void f_string(typval_T *argvars, typval_T *rettv);
void f_strlen(typval_T *argvars, typval_T *rettv); void f_strlen(typval_T *argvars, typval_T *rettv);
void f_strcharlen(typval_T *argvars, typval_T *rettv); void f_strcharlen(typval_T *argvars, typval_T *rettv);
void f_strchars(typval_T *argvars, typval_T *rettv); void f_strchars(typval_T *argvars, typval_T *rettv);
void f_strutf16len(typval_T *argvars, typval_T *rettv);
void f_strdisplaywidth(typval_T *argvars, typval_T *rettv); void f_strdisplaywidth(typval_T *argvars, typval_T *rettv);
void f_strwidth(typval_T *argvars, typval_T *rettv); void f_strwidth(typval_T *argvars, typval_T *rettv);
void f_strcharpart(typval_T *argvars, typval_T *rettv); void f_strcharpart(typval_T *argvars, typval_T *rettv);
void f_strpart(typval_T *argvars, typval_T *rettv); void f_strpart(typval_T *argvars, typval_T *rettv);
void f_strridx(typval_T *argvars, typval_T *rettv); void f_strridx(typval_T *argvars, typval_T *rettv);
void f_strtrans(typval_T *argvars, typval_T *rettv); void f_strtrans(typval_T *argvars, typval_T *rettv);
void f_utf16idx(typval_T *argvars, typval_T *rettv);
void f_tolower(typval_T *argvars, typval_T *rettv); void f_tolower(typval_T *argvars, typval_T *rettv);
void f_toupper(typval_T *argvars, typval_T *rettv); void f_toupper(typval_T *argvars, typval_T *rettv);
void f_tr(typval_T *argvars, typval_T *rettv); void f_tr(typval_T *argvars, typval_T *rettv);

View File

@@ -1006,10 +1006,6 @@ string_reduce(
static void static void
byteidx(typval_T *argvars, typval_T *rettv, int comp UNUSED) byteidx(typval_T *argvars, typval_T *rettv, int comp UNUSED)
{ {
char_u *t;
char_u *str;
varnumber_T idx;
rettv->vval.v_number = -1; rettv->vval.v_number = -1;
if (in_vim9script() if (in_vim9script()
@@ -1017,20 +1013,42 @@ byteidx(typval_T *argvars, typval_T *rettv, int comp UNUSED)
|| check_for_number_arg(argvars, 1) == FAIL)) || check_for_number_arg(argvars, 1) == FAIL))
return; return;
str = tv_get_string_chk(&argvars[0]); char_u *str = tv_get_string_chk(&argvars[0]);
idx = tv_get_number_chk(&argvars[1], NULL); varnumber_T idx = tv_get_number_chk(&argvars[1], NULL);
if (str == NULL || idx < 0) if (str == NULL || idx < 0)
return; return;
t = str; varnumber_T utf16idx = FALSE;
if (argvars[2].v_type != VAR_UNKNOWN)
{
utf16idx = tv_get_bool(&argvars[2]);
if (utf16idx < 0 || utf16idx > 1)
{
semsg(_(e_using_number_as_bool_nr), utf16idx);
return;
}
}
int (*ptr2len)(char_u *);
if (enc_utf8 && comp)
ptr2len = utf_ptr2len;
else
ptr2len = mb_ptr2len;
char_u *t = str;
for ( ; idx > 0; idx--) for ( ; idx > 0; idx--)
{ {
if (*t == NUL) // EOL reached if (*t == NUL) // EOL reached
return; return;
if (enc_utf8 && comp) if (utf16idx)
t += utf_ptr2len(t); {
else int clen = ptr2len(t);
t += (*mb_ptr2len)(t); int c = (clen > 1) ? utf_ptr2char(t) : *t;
if (c > 0xFFFF)
idx--;
}
if (idx > 0)
t += ptr2len(t);
} }
rettv->vval.v_number = (varnumber_T)(t - str); rettv->vval.v_number = (varnumber_T)(t - str);
} }
@@ -1059,42 +1077,49 @@ f_byteidxcomp(typval_T *argvars, typval_T *rettv)
void void
f_charidx(typval_T *argvars, typval_T *rettv) f_charidx(typval_T *argvars, typval_T *rettv)
{ {
char_u *str;
varnumber_T idx;
varnumber_T countcc = FALSE;
char_u *p;
int len;
int (*ptr2len)(char_u *);
rettv->vval.v_number = -1; rettv->vval.v_number = -1;
if ((check_for_string_arg(argvars, 0) == FAIL if (check_for_string_arg(argvars, 0) == FAIL
|| check_for_number_arg(argvars, 1) == FAIL || check_for_number_arg(argvars, 1) == FAIL
|| check_for_opt_bool_arg(argvars, 2) == FAIL)) || check_for_opt_bool_arg(argvars, 2) == FAIL
|| (argvars[2].v_type != VAR_UNKNOWN
&& check_for_opt_bool_arg(argvars, 3) == FAIL))
return; return;
str = tv_get_string_chk(&argvars[0]); char_u *str = tv_get_string_chk(&argvars[0]);
idx = tv_get_number_chk(&argvars[1], NULL); varnumber_T idx = tv_get_number_chk(&argvars[1], NULL);
if (str == NULL || idx < 0) if (str == NULL || idx < 0)
return; return;
varnumber_T countcc = FALSE;
varnumber_T utf16idx = FALSE;
if (argvars[2].v_type != VAR_UNKNOWN) if (argvars[2].v_type != VAR_UNKNOWN)
countcc = tv_get_bool(&argvars[2]);
if (countcc < 0 || countcc > 1)
{ {
semsg(_(e_using_number_as_bool_nr), countcc); countcc = tv_get_bool(&argvars[2]);
return; if (argvars[3].v_type != VAR_UNKNOWN)
utf16idx = tv_get_bool(&argvars[3]);
} }
int (*ptr2len)(char_u *);
if (enc_utf8 && countcc) if (enc_utf8 && countcc)
ptr2len = utf_ptr2len; ptr2len = utf_ptr2len;
else else
ptr2len = mb_ptr2len; ptr2len = mb_ptr2len;
for (p = str, len = 0; p <= str + idx; len++) char_u *p;
int len;
for (p = str, len = 0; utf16idx ? idx >= 0 : p <= str + idx; len++)
{ {
if (*p == NUL) if (*p == NUL)
return; return;
if (utf16idx)
{
idx--;
int clen = ptr2len(p);
int c = (clen > 1) ? utf_ptr2char(p) : *p;
if (c > 0xFFFF)
idx--;
}
p += ptr2len(p); p += ptr2len(p);
} }
@@ -1358,6 +1383,38 @@ f_strchars(typval_T *argvars, typval_T *rettv)
strchar_common(argvars, rettv, skipcc); strchar_common(argvars, rettv, skipcc);
} }
/*
 * "strutf16len()" function
 *
 * Stores in "rettv" the number of UTF-16 code units needed for the String in
 * argvars[0]: every character obtained from the stepping function adds one
 * unit, and a character value above 0xFFFF adds a second one.
 * The optional bool in argvars[1] ({countcc}) selects mb_cptr2char_adv() for
 * stepping through the string instead of mb_ptr2char_adv().
 * The result is left at -1 when an argument check fails.
 */
void
f_strutf16len(typval_T *argvars, typval_T *rettv)
{
    char_u	*p;
    varnumber_T	units = 0;
    int		count_composing = FALSE;
    int		(*next_char)(char_u **pp);

    rettv->vval.v_number = -1;

    if (check_for_string_arg(argvars, 0) == FAIL)
	return;
    if (check_for_opt_bool_arg(argvars, 1) == FAIL)
	return;

    if (argvars[1].v_type != VAR_UNKNOWN && tv_get_bool(&argvars[1]))
	count_composing = TRUE;

    if (count_composing)
	next_char = mb_cptr2char_adv;
    else
	next_char = mb_ptr2char_adv;

    for (p = tv_get_string(&argvars[0]); *p != NUL; )
    {
	int c = next_char(&p);

	// A value above 0xFFFF takes two 16-bit units, anything else one.
	units += (c > 0xFFFF) ? 2 : 1;
    }

    rettv->vval.v_number = units;
}
/* /*
* "strdisplaywidth()" function * "strdisplaywidth()" function
*/ */
@@ -1619,6 +1676,61 @@ f_strtrans(typval_T *argvars, typval_T *rettv)
rettv->vval.v_string = transstr(tv_get_string(&argvars[0])); rettv->vval.v_string = transstr(tv_get_string(&argvars[0]));
} }
/*
 * "utf16idx()" function
 *
 * argvars[0] is the String, argvars[1] the index {idx}, argvars[2] the
 * optional {countcc} bool and argvars[3] the optional {charidx} bool.
 * Walks the string one character at a time, counting one UTF-16 code unit per
 * character and two for a character value above 0xFFFF.  When {charidx} is
 * set {idx} counts characters, otherwise it is a byte offset into the string.
 * The result is -1 when an argument is invalid, {idx} is negative or the end
 * of the string is reached before {idx} has been consumed.
 */
void
f_utf16idx(typval_T *argvars, typval_T *rettv)
{
    rettv->vval.v_number = -1;

    // {idx} is a mandatory argument, check it the same way f_charidx() does.
    if (check_for_string_arg(argvars, 0) == FAIL
	    || check_for_number_arg(argvars, 1) == FAIL
	    || check_for_opt_bool_arg(argvars, 2) == FAIL
	    || (argvars[2].v_type != VAR_UNKNOWN
		&& check_for_opt_bool_arg(argvars, 3) == FAIL))
	return;

    char_u	*str = tv_get_string_chk(&argvars[0]);
    varnumber_T	idx = tv_get_number_chk(&argvars[1], NULL);
    if (str == NULL || idx < 0)
	return;

    varnumber_T	countcc = FALSE;
    varnumber_T	charidx = FALSE;
    if (argvars[2].v_type != VAR_UNKNOWN)
    {
	countcc = tv_get_bool(&argvars[2]);
	if (argvars[3].v_type != VAR_UNKNOWN)
	    charidx = tv_get_bool(&argvars[3]);
    }

    // With {countcc} set (and UTF-8 encoding) step with utf_ptr2len(),
    // otherwise with mb_ptr2len().
    int (*ptr2len)(char_u *);
    if (enc_utf8 && countcc)
	ptr2len = utf_ptr2len;
    else
	ptr2len = mb_ptr2len;

    char_u	*p;
    int		len;
    for (p = str, len = 0; charidx ? idx >= 0 : p <= str + idx; len++)
    {
	if (*p == NUL)
	    return;		// string ended before {idx}: result stays -1

	// Compute the length once and use it both for decoding and advancing.
	int clen = ptr2len(p);
	int c = (clen > 1) ? utf_ptr2char(p) : *p;
	if (c > 0xFFFF)
	    len++;		// needs a second 16-bit code unit
	p += clen;
	if (charidx)
	    idx--;
    }

    // The loop counted one unit past the requested position.
    rettv->vval.v_number = len > 0 ? len - 1 : 0;
}
/* /*
* "tolower(string)" function * "tolower(string)" function
*/ */

View File

@@ -1192,19 +1192,14 @@ func Test_byte2line_line2byte()
bw! bw!
endfunc endfunc
" Test for byteidx() and byteidxcomp() functions " Test for byteidx() using a character index
func Test_byteidx() func Test_byteidx()
let a = '.é.' " one char of two bytes let a = '.é.' " one char of two bytes
call assert_equal(0, byteidx(a, 0)) call assert_equal(0, byteidx(a, 0))
call assert_equal(0, byteidxcomp(a, 0))
call assert_equal(1, byteidx(a, 1)) call assert_equal(1, byteidx(a, 1))
call assert_equal(1, byteidxcomp(a, 1))
call assert_equal(3, byteidx(a, 2)) call assert_equal(3, byteidx(a, 2))
call assert_equal(3, byteidxcomp(a, 2))
call assert_equal(4, byteidx(a, 3)) call assert_equal(4, byteidx(a, 3))
call assert_equal(4, byteidxcomp(a, 3))
call assert_equal(-1, byteidx(a, 4)) call assert_equal(-1, byteidx(a, 4))
call assert_equal(-1, byteidxcomp(a, 4))
let b = '.é.' " normal e with composing char let b = '.é.' " normal e with composing char
call assert_equal(0, b->byteidx(0)) call assert_equal(0, b->byteidx(0))
@@ -1212,18 +1207,184 @@ func Test_byteidx()
call assert_equal(4, b->byteidx(2)) call assert_equal(4, b->byteidx(2))
call assert_equal(5, b->byteidx(3)) call assert_equal(5, b->byteidx(3))
call assert_equal(-1, b->byteidx(4)) call assert_equal(-1, b->byteidx(4))
call assert_fails("call byteidx([], 0)", 'E730:')
" string with multiple composing characters
let str = '-ą́-ą́'
call assert_equal(0, byteidx(str, 0))
call assert_equal(1, byteidx(str, 1))
call assert_equal(6, byteidx(str, 2))
call assert_equal(7, byteidx(str, 3))
call assert_equal(12, byteidx(str, 4))
call assert_equal(-1, byteidx(str, 5))
" empty string
call assert_equal(0, byteidx('', 0))
call assert_equal(-1, byteidx('', 1))
" error cases
call assert_fails("call byteidx([], 0)", 'E730:')
call assert_fails("call byteidx('abc', [])", 'E745:')
endfunc
" Test for byteidxcomp() using a character index
func Test_byteidxcomp()
let a = '.é.' " one char of two bytes
call assert_equal(0, byteidxcomp(a, 0))
call assert_equal(1, byteidxcomp(a, 1))
call assert_equal(3, byteidxcomp(a, 2))
call assert_equal(4, byteidxcomp(a, 3))
call assert_equal(-1, byteidxcomp(a, 4))
let b = '.é.' " normal e with composing char
call assert_equal(0, b->byteidxcomp(0)) call assert_equal(0, b->byteidxcomp(0))
call assert_equal(1, b->byteidxcomp(1)) call assert_equal(1, b->byteidxcomp(1))
call assert_equal(2, b->byteidxcomp(2)) call assert_equal(2, b->byteidxcomp(2))
call assert_equal(4, b->byteidxcomp(3)) call assert_equal(4, b->byteidxcomp(3))
call assert_equal(5, b->byteidxcomp(4)) call assert_equal(5, b->byteidxcomp(4))
call assert_equal(-1, b->byteidxcomp(5)) call assert_equal(-1, b->byteidxcomp(5))
" string with multiple composing characters
let str = '-ą́-ą́'
call assert_equal(0, byteidxcomp(str, 0))
call assert_equal(1, byteidxcomp(str, 1))
call assert_equal(2, byteidxcomp(str, 2))
call assert_equal(4, byteidxcomp(str, 3))
call assert_equal(6, byteidxcomp(str, 4))
call assert_equal(7, byteidxcomp(str, 5))
call assert_equal(8, byteidxcomp(str, 6))
call assert_equal(10, byteidxcomp(str, 7))
call assert_equal(12, byteidxcomp(str, 8))
call assert_equal(-1, byteidxcomp(str, 9))
" empty string
call assert_equal(0, byteidxcomp('', 0))
call assert_equal(-1, byteidxcomp('', 1))
" error cases
call assert_fails("call byteidxcomp([], 0)", 'E730:') call assert_fails("call byteidxcomp([], 0)", 'E730:')
call assert_fails("call byteidxcomp('abc', [])", 'E745:')
endfunc endfunc
" Test for charidx() " Test for byteidx() using a UTF-16 index
func Test_byteidx_from_utf16_index()
" Every call passes v:true as the third argument, so the second argument is
" a UTF-16 index instead of a character index.  An index equal to the number
" of UTF-16 code units is expected to give the byte length of the string and
" a larger index is expected to give -1.

" string with single byte characters
let str = "abc"
for i in range(3)
call assert_equal(i, byteidx(str, i, v:true))
endfor
call assert_equal(3, byteidx(str, 3, v:true))
call assert_equal(-1, byteidx(str, 4, v:true))
" string with two byte characters
let str = "a©©b"
call assert_equal(0, byteidx(str, 0, v:true))
call assert_equal(1, byteidx(str, 1, v:true))
call assert_equal(3, byteidx(str, 2, v:true))
call assert_equal(5, byteidx(str, 3, v:true))
call assert_equal(6, byteidx(str, 4, v:true))
call assert_equal(-1, byteidx(str, 5, v:true))
" string with four byte characters: an index in the middle of such a
" character gives the byte index of the start of that character
let str = "a😊😊b"
call assert_equal(0, byteidx(str, 0, v:true))
call assert_equal(1, byteidx(str, 1, v:true))
call assert_equal(1, byteidx(str, 2, v:true))
call assert_equal(5, byteidx(str, 3, v:true))
call assert_equal(5, byteidx(str, 4, v:true))
call assert_equal(9, byteidx(str, 5, v:true))
call assert_equal(10, byteidx(str, 6, v:true))
call assert_equal(-1, byteidx(str, 7, v:true))
" string with composing characters
let str = '-á-b́'
call assert_equal(0, byteidx(str, 0, v:true))
call assert_equal(1, byteidx(str, 1, v:true))
call assert_equal(4, byteidx(str, 2, v:true))
call assert_equal(5, byteidx(str, 3, v:true))
call assert_equal(8, byteidx(str, 4, v:true))
call assert_equal(-1, byteidx(str, 5, v:true))
" string with multiple composing characters
let str = '-ą́-ą́'
call assert_equal(0, byteidx(str, 0, v:true))
call assert_equal(1, byteidx(str, 1, v:true))
call assert_equal(6, byteidx(str, 2, v:true))
call assert_equal(7, byteidx(str, 3, v:true))
call assert_equal(12, byteidx(str, 4, v:true))
call assert_equal(-1, byteidx(str, 5, v:true))
" empty string
call assert_equal(0, byteidx('', 0, v:true))
call assert_equal(-1, byteidx('', 1, v:true))
" error cases: a List as the third argument must fail with E745
call assert_fails('call byteidx(str, 0, [])', 'E745:')
endfunc
" Test for byteidxcomp() using a UTF-16 index
" Every call passes v:true as the third argument, so the second argument is
" a UTF-16 index instead of a character index.
func Test_byteidxcomp_from_utf16_index()
" string with single byte characters
let str = "abc"
for i in range(3)
call assert_equal(i, byteidxcomp(str, i, v:true))
endfor
call assert_equal(3, byteidxcomp(str, 3, v:true))
call assert_equal(-1, byteidxcomp(str, 4, v:true))
" string with two byte characters
let str = "a©©b"
call assert_equal(0, byteidxcomp(str, 0, v:true))
call assert_equal(1, byteidxcomp(str, 1, v:true))
call assert_equal(3, byteidxcomp(str, 2, v:true))
call assert_equal(5, byteidxcomp(str, 3, v:true))
call assert_equal(6, byteidxcomp(str, 4, v:true))
call assert_equal(-1, byteidxcomp(str, 5, v:true))
" string with four byte characters: an index in the middle of such a
" character gives the byte index of the start of that character
let str = "a😊😊b"
call assert_equal(0, byteidxcomp(str, 0, v:true))
call assert_equal(1, byteidxcomp(str, 1, v:true))
call assert_equal(1, byteidxcomp(str, 2, v:true))
call assert_equal(5, byteidxcomp(str, 3, v:true))
call assert_equal(5, byteidxcomp(str, 4, v:true))
call assert_equal(9, byteidxcomp(str, 5, v:true))
call assert_equal(10, byteidxcomp(str, 6, v:true))
call assert_equal(-1, byteidxcomp(str, 7, v:true))
" string with composing characters: each composing character is expected to
" get its own index
let str = '-á-b́'
call assert_equal(0, byteidxcomp(str, 0, v:true))
call assert_equal(1, byteidxcomp(str, 1, v:true))
call assert_equal(2, byteidxcomp(str, 2, v:true))
call assert_equal(4, byteidxcomp(str, 3, v:true))
call assert_equal(5, byteidxcomp(str, 4, v:true))
call assert_equal(6, byteidxcomp(str, 5, v:true))
call assert_equal(8, byteidxcomp(str, 6, v:true))
call assert_equal(-1, byteidxcomp(str, 7, v:true))
" NOTE(review): the same check is repeated under "error cases" at the end of
" this function
call assert_fails('call byteidxcomp(str, 0, [])', 'E745:')
" string with multiple composing characters
let str = '-ą́-ą́'
call assert_equal(0, byteidxcomp(str, 0, v:true))
call assert_equal(1, byteidxcomp(str, 1, v:true))
call assert_equal(2, byteidxcomp(str, 2, v:true))
call assert_equal(4, byteidxcomp(str, 3, v:true))
call assert_equal(6, byteidxcomp(str, 4, v:true))
call assert_equal(7, byteidxcomp(str, 5, v:true))
call assert_equal(8, byteidxcomp(str, 6, v:true))
call assert_equal(10, byteidxcomp(str, 7, v:true))
call assert_equal(12, byteidxcomp(str, 8, v:true))
call assert_equal(-1, byteidxcomp(str, 9, v:true))
" empty string
call assert_equal(0, byteidxcomp('', 0, v:true))
call assert_equal(-1, byteidxcomp('', 1, v:true))
" error cases: a List as the third argument must fail with E745
call assert_fails('call byteidxcomp(str, 0, [])', 'E745:')
endfunc
" Test for charidx() using a byte index
func Test_charidx() func Test_charidx()
let a = 'xáb́y' let a = 'xáb́y'
call assert_equal(0, charidx(a, 0)) call assert_equal(0, charidx(a, 0))
@@ -1232,17 +1393,20 @@ func Test_charidx()
call assert_equal(3, charidx(a, 7)) call assert_equal(3, charidx(a, 7))
call assert_equal(-1, charidx(a, 8)) call assert_equal(-1, charidx(a, 8))
call assert_equal(-1, charidx(a, -1)) call assert_equal(-1, charidx(a, -1))
call assert_equal(-1, charidx('', 0))
call assert_equal(-1, charidx(test_null_string(), 0))
" count composing characters " count composing characters
call assert_equal(0, charidx(a, 0, 1)) call assert_equal(0, a->charidx(0, 1))
call assert_equal(2, charidx(a, 2, 1)) call assert_equal(2, a->charidx(2, 1))
call assert_equal(3, charidx(a, 4, 1)) call assert_equal(3, a->charidx(4, 1))
call assert_equal(5, charidx(a, 7, 1)) call assert_equal(5, a->charidx(7, 1))
call assert_equal(-1, charidx(a, 8, 1)) call assert_equal(-1, a->charidx(8, 1))
" empty string
call assert_equal(-1, charidx('', 0))
call assert_equal(-1, charidx('', 0, 1)) call assert_equal(-1, charidx('', 0, 1))
" error cases
call assert_equal(-1, charidx(test_null_string(), 0))
call assert_fails('let x = charidx([], 1)', 'E1174:') call assert_fails('let x = charidx([], 1)', 'E1174:')
call assert_fails('let x = charidx("abc", [])', 'E1210:') call assert_fails('let x = charidx("abc", [])', 'E1210:')
call assert_fails('let x = charidx("abc", 1, [])', 'E1212:') call assert_fails('let x = charidx("abc", 1, [])', 'E1212:')
@@ -1250,6 +1414,237 @@ func Test_charidx()
call assert_fails('let x = charidx("abc", 1, 2)', 'E1212:') call assert_fails('let x = charidx("abc", 1, 2)', 'E1212:')
endfunc endfunc
" Test for charidx() using a UTF-16 index
func Test_charidx_from_utf16_index()
  " Checks charidx() when its fourth argument is v:true, i.e. when the index
  " passed in is a UTF-16 code unit index instead of a byte index.  The third
  " argument selects whether composing characters are counted separately.
  " In every case an index at or past the end of the string must give -1.

  " string with single byte characters
  let str = "abc"
  for i in range(3)
    call assert_equal(i, charidx(str, i, v:false, v:true))
  endfor
  call assert_equal(-1, charidx(str, 3, v:false, v:true))

  " string with two byte characters
  let str = "a©©b"
  call assert_equal(0, charidx(str, 0, v:false, v:true))
  call assert_equal(1, charidx(str, 1, v:false, v:true))
  call assert_equal(2, charidx(str, 2, v:false, v:true))
  call assert_equal(3, charidx(str, 3, v:false, v:true))
  call assert_equal(-1, charidx(str, 4, v:false, v:true))

  " string with four byte characters: two consecutive UTF-16 indexes map to
  " the same character
  let str = "a😊😊b"
  call assert_equal(0, charidx(str, 0, v:false, v:true))
  call assert_equal(1, charidx(str, 1, v:false, v:true))
  call assert_equal(1, charidx(str, 2, v:false, v:true))
  call assert_equal(2, charidx(str, 3, v:false, v:true))
  call assert_equal(2, charidx(str, 4, v:false, v:true))
  call assert_equal(3, charidx(str, 5, v:false, v:true))
  call assert_equal(-1, charidx(str, 6, v:false, v:true))

  " string with composing characters
  let str = '-á-b́'
  for i in str->strcharlen()->range()
    call assert_equal(i, charidx(str, i, v:false, v:true))
  endfor
  call assert_equal(-1, charidx(str, 4, v:false, v:true))
  for i in str->strchars()->range()
    call assert_equal(i, charidx(str, i, v:true, v:true))
  endfor
  call assert_equal(-1, charidx(str, 6, v:true, v:true))

  " string with multiple composing characters
  let str = '-ą́-ą́'
  for i in str->strcharlen()->range()
    call assert_equal(i, charidx(str, i, v:false, v:true))
  endfor
  call assert_equal(-1, charidx(str, 4, v:false, v:true))
  for i in str->strchars()->range()
    call assert_equal(i, charidx(str, i, v:true, v:true))
  endfor
  call assert_equal(-1, charidx(str, 8, v:true, v:true))

  " empty string
  call assert_equal(-1, charidx('', 0, v:false, v:true))
  call assert_equal(-1, charidx('', 0, v:true, v:true))

  " error cases: a null string, and a non-bool value for the fourth argument
  call assert_equal(-1, charidx(test_null_string(), 0, v:false, v:true))
  call assert_fails('let x = charidx("abc", 1, v:false, [])', 'E1212:')
  call assert_fails('let x = charidx("abc", 1, v:true, [])', 'E1212:')
endfunc
" Test for utf16idx() using a byte index
func Test_utf16idx_from_byteidx()
  " Checks utf16idx() when the index passed in is a byte index.  Each table
  " below lists the expected result for byte index 0, 1, 2, ... in order; the
  " final -1 is the result for the first index past the end of the string.
  " Comparing whole lists makes a failure show the complete mapping at once.

  " UTF-16 index of a string with single byte characters
  let str = "abc"
  let want = [0, 1, 2, -1]
  call assert_equal(want, range(len(want))->map({_, idx -> utf16idx(str, idx)}))

  " UTF-16 index of a string with two byte characters (method call syntax)
  let str = 'a©©b'
  let want = [0, 1, 1, 2, 2, 3, -1]
  call assert_equal(want, range(len(want))->map({_, idx -> str->utf16idx(idx)}))

  " UTF-16 index of a string with four byte characters
  let str = 'a😊😊b'
  let want = [0, 2, 2, 2, 2, 4, 4, 4, 4, 5, -1]
  call assert_equal(want, range(len(want))->map({_, idx -> utf16idx(str, idx)}))

  " UTF-16 index of a string with composing characters
  let str = '-á-b́'
  let want = [0, 1, 1, 1, 2, 3, 3, 3, -1]
  call assert_equal(want, range(len(want))->map({_, idx -> utf16idx(str, idx)}))
  " with the third argument set, composing characters are counted separately
  let want = [0, 1, 2, 2, 3, 4, 5, 5, -1]
  call assert_equal(want, range(len(want))->map({_, idx -> utf16idx(str, idx, v:true)}))

  " string with multiple composing characters
  let str = '-ą́-ą́'
  let want = [0, 1, 1, 1, 1, 1, 2, 3, 3, 3, 3, 3, -1]
  call assert_equal(want, range(len(want))->map({_, idx -> utf16idx(str, idx)}))
  let want = [0, 1, 2, 2, 3, 3, 4, 5, 6, 6, 7, 7, -1]
  call assert_equal(want, range(len(want))->map({_, idx -> utf16idx(str, idx, v:true)}))

  " empty string
  call assert_equal(-1, utf16idx('', 0))
  call assert_equal(-1, utf16idx('', 0, v:true))

  " error cases: negative index, null string and wrong argument types
  call assert_equal(-1, utf16idx("abc", -1))
  call assert_equal(-1, utf16idx(test_null_string(), 0))
  call assert_fails('let l = utf16idx([], 0)', 'E1174:')
  call assert_fails('let l = utf16idx("ab", [])', 'E1210:')
  call assert_fails('let l = utf16idx("ab", 0, [])', 'E1212:')
endfunc
" Test for utf16idx() using a character index
func Test_utf16idx_from_charidx()
  " Checks utf16idx() when its fourth argument is v:true, i.e. when the index
  " passed in is a character index.  The third argument selects whether
  " composing characters are counted as separate characters.

  " UTF-16 index of a string with single byte characters: identity mapping
  let text = "abc"
  let nchars = strcharlen(text)
  call assert_equal(range(nchars), range(nchars)->map({_, n -> utf16idx(text, n, v:false, v:true)}))
  call assert_equal(-1, utf16idx(text, 3, v:false, v:true))

  " UTF-16 index of a string with two byte characters: identity mapping
  let text = "a©©b"
  let nchars = strcharlen(text)
  call assert_equal(range(nchars), range(nchars)->map({_, n -> utf16idx(text, n, v:false, v:true)}))
  call assert_equal(-1, utf16idx(text, 4, v:false, v:true))

  " UTF-16 index of a string with four byte characters: results for character
  " index 0 to 4, where 4 is past the end
  let text = "a😊😊b"
  let want = [0, 2, 4, 5, -1]
  call assert_equal(want, range(len(want))->map({_, n -> utf16idx(text, n, v:false, v:true)}))

  " UTF-16 index of a string with composing characters
  let text = '-á-b́'
  let nchars = strcharlen(text)
  call assert_equal(range(nchars), range(nchars)->map({_, n -> utf16idx(text, n, v:false, v:true)}))
  call assert_equal(-1, utf16idx(text, 4, v:false, v:true))
  let nchars = strchars(text)
  call assert_equal(range(nchars), range(nchars)->map({_, n -> utf16idx(text, n, v:true, v:true)}))
  call assert_equal(-1, utf16idx(text, 6, v:true, v:true))

  " string with multiple composing characters
  let text = '-ą́-ą́'
  let nchars = strcharlen(text)
  call assert_equal(range(nchars), range(nchars)->map({_, n -> utf16idx(text, n, v:false, v:true)}))
  call assert_equal(-1, utf16idx(text, 4, v:false, v:true))
  let nchars = strchars(text)
  call assert_equal(range(nchars), range(nchars)->map({_, n -> utf16idx(text, n, v:true, v:true)}))
  call assert_equal(-1, utf16idx(text, 8, v:true, v:true))

  " empty string
  call assert_equal(-1, utf16idx('', 0, v:false, v:true))
  call assert_equal(-1, utf16idx('', 0, v:true, v:true))

  " error cases: null string and a non-bool fourth argument
  call assert_equal(-1, utf16idx(test_null_string(), 0, v:true, v:true))
  call assert_fails('let l = utf16idx("ab", 0, v:false, [])', 'E1212:')
endfunc
" Test for strutf16len()
func Test_strutf16len()
  " Each entry is [string, expected length with the default arguments,
  " expected length when the second argument is v:true].  The two values
  " differ only for the strings that contain composing characters.
  let cases = []
  let cases += [['abc', 3, 3]]
  let cases += [['a©©b', 4, 4]]
  let cases += [['a😊😊b', 6, 6]]
  let cases += [['-á-b́', 4, 6]]
  let cases += [['-ą́-ą́', 4, 8]]
  for [text, plain, with_cc] in cases
    call assert_equal(plain, strutf16len(text))
    " method call syntax
    call assert_equal(with_cc, text->strutf16len(v:true))
  endfor
  call assert_equal(0, strutf16len(''))

  " error cases
  call assert_fails('let l = strutf16len([])', 'E1174:')
  call assert_fails('let l = strutf16len("a", [])', 'E1212:')
  call assert_equal(0, strutf16len(test_null_string()))
endfunc
func Test_count() func Test_count()
let l = ['a', 'a', 'A', 'b'] let l = ['a', 'a', 'A', 'b']
call assert_equal(2, count(l, 'a')) call assert_equal(2, count(l, 'a'))
@@ -3074,5 +3469,4 @@ func Test_delfunc_while_listing()
call StopVimInTerminal(buf) call StopVimInTerminal(buf)
endfunc endfunc
" vim: shiftwidth=2 sts=2 expandtab " vim: shiftwidth=2 sts=2 expandtab

View File

@@ -695,6 +695,8 @@ static char *(features[]) =
static int included_patches[] = static int included_patches[] =
{ /* Add new patch number below this line */ { /* Add new patch number below this line */
/**/
1485,
/**/ /**/
1484, 1484,
/**/ /**/