mirror of
https://github.com/vim/vim.git
synced 2025-09-23 03:43:49 -04:00
patch 8.2.1665: cannot do fuzzy string matching
Problem: Cannot do fuzzy string matching. Solution: Add matchfuzzy(). (Yegappan Lakshmanan, closes #6932)
This commit is contained in:
@@ -2641,6 +2641,7 @@ matcharg({nr}) List arguments of |:match|
|
||||
matchdelete({id} [, {win}]) Number delete match identified by {id}
|
||||
matchend({expr}, {pat} [, {start} [, {count}]])
|
||||
Number position where {pat} ends in {expr}
|
||||
matchfuzzy({list}, {str}) List fuzzy match {str} in {list}
|
||||
matchlist({expr}, {pat} [, {start} [, {count}]])
|
||||
List match and submatches of {pat} in {expr}
|
||||
matchstr({expr}, {pat} [, {start} [, {count}]])
|
||||
@@ -7307,6 +7308,29 @@ matchend({expr}, {pat} [, {start} [, {count}]]) *matchend()*
|
||||
Can also be used as a |method|: >
|
||||
GetText()->matchend('word')
|
||||
|
||||
|
||||
matchfuzzy({list}, {str}) *matchfuzzy()*
|
||||
Returns a list with all the strings in {list} that fuzzy
|
||||
match {str}. The strings in the returned list are sorted
|
||||
based on the matching score. {str} is treated as a literal
|
||||
string and regular expression matching is NOT supported.
|
||||
The maximum supported {str} length is 256.
|
||||
|
||||
If there are no matching strings or there is an error, then an
|
||||
empty list is returned. If length of {str} is greater than
|
||||
256, then returns an empty list.
|
||||
|
||||
Example: >
|
||||
:echo matchfuzzy(["clay", "crow"], "cay")
|
||||
< results in ["clay"]. >
|
||||
:echo getbufinfo()->map({_, v -> v.name})->matchfuzzy("ndl")
|
||||
< results in a list of buffer names fuzzy matching "ndl". >
|
||||
:echo v:oldfiles->matchfuzzy("test")
|
||||
< results in a list of file names fuzzy matching "test". >
|
||||
:let l = readfile("buffer.c")->matchfuzzy("str")
|
||||
< results in a list of lines in "buffer.c" fuzzy matching "str".
|
||||
|
||||
|
||||
matchlist({expr}, {pat} [, {start} [, {count}]]) *matchlist()*
|
||||
Same as |match()|, but return a |List|. The first item in the
|
||||
list is the matched string, same as what matchstr() would
|
||||
|
@@ -603,6 +603,7 @@ String manipulation: *string-functions*
|
||||
charclass() class of a character
|
||||
match() position where a pattern matches in a string
|
||||
matchend() position where a pattern match ends in a string
|
||||
matchfuzzy() fuzzy matches a string in a list of strings
|
||||
matchstr() match of a pattern in a string
|
||||
matchstrpos() match and positions of a pattern in a string
|
||||
matchlist() like matchstr() and also return submatches
|
||||
|
@@ -750,6 +750,7 @@ static funcentry_T global_functions[] =
|
||||
{"matcharg", 1, 1, FEARG_1, ret_list_string, f_matcharg},
|
||||
{"matchdelete", 1, 2, FEARG_1, ret_number, f_matchdelete},
|
||||
{"matchend", 2, 4, FEARG_1, ret_number, f_matchend},
|
||||
{"matchfuzzy", 2, 2, FEARG_1, ret_list_string, f_matchfuzzy},
|
||||
{"matchlist", 2, 4, FEARG_1, ret_list_string, f_matchlist},
|
||||
{"matchstr", 2, 4, FEARG_1, ret_string, f_matchstr},
|
||||
{"matchstrpos", 2, 4, FEARG_1, ret_list_any, f_matchstrpos},
|
||||
|
@@ -36,4 +36,5 @@ void find_pattern_in_path(char_u *ptr, int dir, int len, int whole, int skip_com
|
||||
spat_T *get_spat(int idx);
|
||||
int get_spat_last_idx(void);
|
||||
void f_searchcount(typval_T *argvars, typval_T *rettv);
|
||||
void f_matchfuzzy(typval_T *argvars, typval_T *rettv);
|
||||
/* vim: set ft=c : */
|
||||
|
340
src/search.c
340
src/search.c
@@ -4165,4 +4165,344 @@ f_searchcount(typval_T *argvars, typval_T *rettv)
|
||||
the_end:
|
||||
restore_last_search_pattern();
|
||||
}
|
||||
|
||||
/*
|
||||
* Fuzzy string matching
|
||||
*
|
||||
* Ported from the lib_fts library authored by Forrest Smith.
|
||||
* https://github.com/forrestthewoods/lib_fts/tree/master/code
|
||||
*
|
||||
* Blog describing the algorithm:
|
||||
* https://www.forrestthewoods.com/blog/reverse_engineering_sublime_texts_fuzzy_match/
|
||||
*
|
||||
* Each matching string is assigned a score. The following factors are checked:
|
||||
* Matched letter
|
||||
* Unmatched letter
|
||||
* Consecutively matched letters
|
||||
* Proximity to start
|
||||
* Letter following a separator (space, underscore)
|
||||
* Uppercase letter following lowercase (aka CamelCase)
|
||||
*
|
||||
* Matched letters are good. Unmatched letters are bad. Matching near the start
|
||||
* is good. Matching the first letter in the middle of a phrase is good.
|
||||
* Matching the uppercase letters in camel case entries is good.
|
||||
*
|
||||
* The score assigned for each factor is explained below.
|
||||
* File paths are different from file names. File extensions may be ignorable.
|
||||
* Single words care about consecutive matches but not separators or camel
|
||||
* case.
|
||||
* Score starts at 0
|
||||
* Matched letter: +0 points
|
||||
* Unmatched letter: -1 point
|
||||
* Consecutive match bonus: +5 points
|
||||
* Separator bonus: +10 points
|
||||
* Camel case bonus: +10 points
|
||||
* Unmatched leading letter: -3 points (max: -9)
|
||||
*
|
||||
* There is some nuance to this. Scores don’t have an intrinsic meaning. The
|
||||
* score range isn’t 0 to 100. It’s roughly [-50, 50]. Longer words have a
|
||||
* lower minimum score due to unmatched letter penalty. Longer search patterns
|
||||
* have a higher maximum score due to match bonuses.
|
||||
*
|
||||
* Separator and camel case bonus is worth a LOT. Consecutive matches are worth
|
||||
* quite a bit.
|
||||
*
|
||||
* There is a penalty if you DON’T match the first three letters. Which
|
||||
* effectively rewards matching near the start. However there’s no difference
|
||||
* in matching between the middle and end.
|
||||
*
|
||||
* There is not an explicit bonus for an exact match. Unmatched letters receive
|
||||
* a penalty. So shorter strings and closer matches are worth more.
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
listitem_T *item;
|
||||
int score;
|
||||
} fuzzyItem_T;
|
||||
|
||||
static int
|
||||
fuzzy_match_recursive(
|
||||
char_u *fuzpat,
|
||||
char_u *str,
|
||||
int *outScore,
|
||||
char_u *strBegin,
|
||||
char_u *srcMatches,
|
||||
char_u *matches,
|
||||
int maxMatches,
|
||||
int nextMatch,
|
||||
int *recursionCount,
|
||||
int recursionLimit)
|
||||
{
|
||||
// Recursion params
|
||||
int recursiveMatch = FALSE;
|
||||
char_u bestRecursiveMatches[256];
|
||||
int bestRecursiveScore = 0;
|
||||
int first_match;
|
||||
int matched;
|
||||
|
||||
// Count recursions
|
||||
++*recursionCount;
|
||||
if (*recursionCount >= recursionLimit)
|
||||
return FALSE;
|
||||
|
||||
// Detect end of strings
|
||||
if (*fuzpat == '\0' || *str == '\0')
|
||||
return FALSE;
|
||||
|
||||
// Loop through fuzpat and str looking for a match
|
||||
first_match = TRUE;
|
||||
while (*fuzpat != '\0' && *str != '\0')
|
||||
{
|
||||
// Found match
|
||||
if (vim_tolower(*fuzpat) == vim_tolower(*str))
|
||||
{
|
||||
char_u recursiveMatches[256];
|
||||
int recursiveScore = 0;
|
||||
|
||||
// Supplied matches buffer was too short
|
||||
if (nextMatch >= maxMatches)
|
||||
return FALSE;
|
||||
|
||||
// "Copy-on-Write" srcMatches into matches
|
||||
if (first_match && srcMatches)
|
||||
{
|
||||
memcpy(matches, srcMatches, nextMatch);
|
||||
first_match = FALSE;
|
||||
}
|
||||
|
||||
// Recursive call that "skips" this match
|
||||
if (fuzzy_match_recursive(fuzpat, str + 1, &recursiveScore,
|
||||
strBegin, matches, recursiveMatches,
|
||||
sizeof(recursiveMatches), nextMatch, recursionCount,
|
||||
recursionLimit))
|
||||
{
|
||||
// Pick best recursive score
|
||||
if (!recursiveMatch || recursiveScore > bestRecursiveScore)
|
||||
{
|
||||
memcpy(bestRecursiveMatches, recursiveMatches, 256);
|
||||
bestRecursiveScore = recursiveScore;
|
||||
}
|
||||
recursiveMatch = TRUE;
|
||||
}
|
||||
|
||||
// Advance
|
||||
matches[nextMatch++] = (char_u)(str - strBegin);
|
||||
++fuzpat;
|
||||
}
|
||||
++str;
|
||||
}
|
||||
|
||||
// Determine if full fuzpat was matched
|
||||
matched = *fuzpat == '\0' ? TRUE : FALSE;
|
||||
|
||||
// Calculate score
|
||||
if (matched)
|
||||
{
|
||||
// bonus for adjacent matches
|
||||
int sequential_bonus = 15;
|
||||
// bonus if match occurs after a separator
|
||||
int separator_bonus = 30;
|
||||
// bonus if match is uppercase and prev is lower
|
||||
int camel_bonus = 30;
|
||||
// bonus if the first letter is matched
|
||||
int first_letter_bonus = 15;
|
||||
// penalty applied for every letter in str before the first match
|
||||
int leading_letter_penalty = -5;
|
||||
// maximum penalty for leading letters
|
||||
int max_leading_letter_penalty = -15;
|
||||
// penalty for every letter that doesn't matter
|
||||
int unmatched_letter_penalty = -1;
|
||||
int penalty;
|
||||
int unmatched;
|
||||
int i;
|
||||
|
||||
// Iterate str to end
|
||||
while (*str != '\0')
|
||||
++str;
|
||||
|
||||
// Initialize score
|
||||
*outScore = 100;
|
||||
|
||||
// Apply leading letter penalty
|
||||
penalty = leading_letter_penalty * matches[0];
|
||||
if (penalty < max_leading_letter_penalty)
|
||||
penalty = max_leading_letter_penalty;
|
||||
*outScore += penalty;
|
||||
|
||||
// Apply unmatched penalty
|
||||
unmatched = (int)(str - strBegin) - nextMatch;
|
||||
*outScore += unmatched_letter_penalty * unmatched;
|
||||
|
||||
// Apply ordering bonuses
|
||||
for (i = 0; i < nextMatch; ++i)
|
||||
{
|
||||
char_u currIdx = matches[i];
|
||||
|
||||
if (i > 0)
|
||||
{
|
||||
char_u prevIdx = matches[i - 1];
|
||||
|
||||
// Sequential
|
||||
if (currIdx == (prevIdx + 1))
|
||||
*outScore += sequential_bonus;
|
||||
}
|
||||
|
||||
// Check for bonuses based on neighbor character value
|
||||
if (currIdx > 0)
|
||||
{
|
||||
// Camel case
|
||||
char_u neighbor = strBegin[currIdx - 1];
|
||||
char_u curr = strBegin[currIdx];
|
||||
int neighborSeparator;
|
||||
|
||||
if (islower(neighbor) && isupper(curr))
|
||||
*outScore += camel_bonus;
|
||||
|
||||
// Separator
|
||||
neighborSeparator = neighbor == '_' || neighbor == ' ';
|
||||
if (neighborSeparator)
|
||||
*outScore += separator_bonus;
|
||||
}
|
||||
else
|
||||
{
|
||||
// First letter
|
||||
*outScore += first_letter_bonus;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Return best result
|
||||
if (recursiveMatch && (!matched || bestRecursiveScore > *outScore))
|
||||
{
|
||||
// Recursive score is better than "this"
|
||||
memcpy(matches, bestRecursiveMatches, maxMatches);
|
||||
*outScore = bestRecursiveScore;
|
||||
return TRUE;
|
||||
}
|
||||
else if (matched)
|
||||
return TRUE; // "this" score is better than recursive
|
||||
|
||||
return FALSE; // no match
|
||||
}
|
||||
|
||||
/*
|
||||
* fuzzy_match()
|
||||
*
|
||||
* Performs exhaustive search via recursion to find all possible matches and
|
||||
* match with highest score.
|
||||
* Scores values have no intrinsic meaning. Possible score range is not
|
||||
* normalized and varies with pattern.
|
||||
* Recursion is limited internally (default=10) to prevent degenerate cases
|
||||
* (fuzpat="aaaaaa" str="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa").
|
||||
* Uses char_u for match indices. Therefore patterns are limited to 256
|
||||
* characters.
|
||||
*
|
||||
* Returns TRUE if fuzpat is found AND calculates a score.
|
||||
*/
|
||||
static int
|
||||
fuzzy_match(char_u *str, char_u *fuzpat, int *outScore)
|
||||
{
|
||||
char_u matches[256];
|
||||
int recursionCount = 0;
|
||||
int recursionLimit = 10;
|
||||
|
||||
*outScore = 0;
|
||||
|
||||
return fuzzy_match_recursive(fuzpat, str, outScore, str, NULL, matches,
|
||||
sizeof(matches), 0, &recursionCount, recursionLimit);
|
||||
}
|
||||
|
||||
/*
|
||||
* Sort the fuzzy matches in the descending order of the match score.
|
||||
*/
|
||||
static int
|
||||
fuzzy_item_compare(const void *s1, const void *s2)
|
||||
{
|
||||
int v1 = ((fuzzyItem_T *)s1)->score;
|
||||
int v2 = ((fuzzyItem_T *)s2)->score;
|
||||
|
||||
return v1 == v2 ? 0 : v1 > v2 ? -1 : 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* Fuzzy search the string 'str' in 'strlist' and return the matching strings
|
||||
* in 'fmatchlist'.
|
||||
*/
|
||||
static void
|
||||
match_fuzzy(list_T *strlist, char_u *str, list_T *fmatchlist)
|
||||
{
|
||||
long len;
|
||||
fuzzyItem_T *ptrs;
|
||||
listitem_T *li;
|
||||
long i = 0;
|
||||
int found_match = FALSE;
|
||||
|
||||
len = list_len(strlist);
|
||||
if (len == 0)
|
||||
return;
|
||||
|
||||
ptrs = ALLOC_MULT(fuzzyItem_T, len);
|
||||
if (ptrs == NULL)
|
||||
return;
|
||||
|
||||
// For all the string items in strlist, get the fuzzy matching score
|
||||
FOR_ALL_LIST_ITEMS(strlist, li)
|
||||
{
|
||||
int score;
|
||||
|
||||
ptrs[i].item = li;
|
||||
ptrs[i].score = -9999;
|
||||
// ignore non-string items in the list
|
||||
if (li->li_tv.v_type == VAR_STRING && li->li_tv.vval.v_string != NULL)
|
||||
if (fuzzy_match(li->li_tv.vval.v_string, str, &score))
|
||||
{
|
||||
ptrs[i].score = score;
|
||||
found_match = TRUE;
|
||||
}
|
||||
++i;
|
||||
}
|
||||
|
||||
if (found_match)
|
||||
{
|
||||
// Sort the list by the descending order of the match score
|
||||
qsort((void *)ptrs, (size_t)len, sizeof(fuzzyItem_T),
|
||||
fuzzy_item_compare);
|
||||
|
||||
// Copy the matching strings with 'score != -9999' to the return list
|
||||
for (i = 0; i < len; i++)
|
||||
{
|
||||
if (ptrs[i].score == -9999)
|
||||
break;
|
||||
list_append_string(fmatchlist, ptrs[i].item->li_tv.vval.v_string,
|
||||
-1);
|
||||
}
|
||||
}
|
||||
|
||||
vim_free(ptrs);
|
||||
}
|
||||
|
||||
/*
|
||||
* "matchfuzzy()" function
|
||||
*/
|
||||
void
|
||||
f_matchfuzzy(typval_T *argvars, typval_T *rettv)
|
||||
{
|
||||
if (argvars[0].v_type != VAR_LIST)
|
||||
{
|
||||
emsg(_(e_listreq));
|
||||
return;
|
||||
}
|
||||
if (argvars[0].vval.v_list == NULL)
|
||||
return;
|
||||
if (argvars[1].v_type != VAR_STRING
|
||||
|| argvars[1].vval.v_string == NULL)
|
||||
{
|
||||
semsg(_(e_invarg2), tv_get_string(&argvars[1]));
|
||||
return;
|
||||
}
|
||||
if (rettv_list_alloc(rettv) == OK)
|
||||
match_fuzzy(argvars[0].vval.v_list, tv_get_string(&argvars[1]),
|
||||
rettv->vval.v_list);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@@ -2554,4 +2554,28 @@ func Test_browsedir()
|
||||
call assert_fails('call browsedir("open", [])', 'E730:')
|
||||
endfunc
|
||||
|
||||
" Test for matchfuzzy()
|
||||
func Test_matchfuzzy()
|
||||
call assert_fails('call matchfuzzy(10, "abc")', 'E714:')
|
||||
call assert_fails('call matchfuzzy(["abc"], [])', 'E730:')
|
||||
call assert_equal([], matchfuzzy([], 'abc'))
|
||||
call assert_equal([], matchfuzzy(['abc'], ''))
|
||||
call assert_equal(['abc'], matchfuzzy(['abc', 10], 'ac'))
|
||||
call assert_equal([], matchfuzzy([10, 20], 'ac'))
|
||||
call assert_equal(['abc'], matchfuzzy(['abc'], 'abc'))
|
||||
call assert_equal(['crayon', 'camera'], matchfuzzy(['camera', 'crayon'], 'cra'))
|
||||
call assert_equal(['aabbaa', 'aaabbbaaa', 'aaaabbbbaaaa', 'aba'], matchfuzzy(['aba', 'aabbaa', 'aaabbbaaa', 'aaaabbbbaaaa'], 'aa'))
|
||||
call assert_equal(['one'], matchfuzzy(['one', 'two'], 'one'))
|
||||
call assert_equal(['oneTwo', 'onetwo'], matchfuzzy(['onetwo', 'oneTwo'], 'oneTwo'))
|
||||
call assert_equal(['one_two', 'onetwo'], matchfuzzy(['onetwo', 'one_two'], 'oneTwo'))
|
||||
call assert_equal(['aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'], matchfuzzy(['aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'], 'aa'))
|
||||
call assert_equal([], matchfuzzy([repeat('a', 300)], repeat('a', 257)))
|
||||
|
||||
%bw!
|
||||
eval ['somebuf', 'anotherone', 'needle', 'yetanotherone']->map({_, v -> bufadd(v) + bufload(v)})
|
||||
let l = getbufinfo()->map({_, v -> v.name})->matchfuzzy('ndl')
|
||||
call assert_equal(1, len(l))
|
||||
call assert_match('needle', l[0])
|
||||
endfunc
|
||||
|
||||
" vim: shiftwidth=2 sts=2 expandtab
|
||||
|
@@ -750,6 +750,8 @@ static char *(features[]) =
|
||||
|
||||
static int included_patches[] =
|
||||
{ /* Add new patch number below this line */
|
||||
/**/
|
||||
1665,
|
||||
/**/
|
||||
1664,
|
||||
/**/
|
||||
|
Reference in New Issue
Block a user