0
0
mirror of https://github.com/vim/vim.git synced 2025-09-25 03:54:15 -04:00

patch 8.2.0901: formatting CJK text isn't optimal

Problem:    Formatting CJK text isn't optimal.
Solution:   Properly break CJK lines. (closes #3875)
This commit is contained in:
Bram Moolenaar
2020-06-04 18:22:13 +02:00
parent 9155825b24
commit e52702f003
9 changed files with 329 additions and 9 deletions

View File

@@ -1688,6 +1688,10 @@ B When joining lines, don't insert a space between two multi-byte
characters. Overruled by the 'M' flag. characters. Overruled by the 'M' flag.
1 Don't break a line after a one-letter word. It's broken before it 1 Don't break a line after a one-letter word. It's broken before it
instead (if possible). instead (if possible).
] Respect textwidth rigorously. With this flag set, no line can be
longer than textwidth, unless line-break-prohibition rules make this
impossible. Mainly for CJK scripts and works only if 'encoding' is
"utf-8".
j Where it makes sense, remove a comment leader when joining lines. For j Where it makes sense, remove a comment leader when joining lines. For
example, joining: example, joining:
int i; // the index ~ int i; // the index ~

View File

@@ -3842,6 +3842,158 @@ utf_head_off(char_u *base, char_u *p)
return (int)(p - q); return (int)(p - q);
} }
/*
 * Return non-zero when no space should be put before or after character
 * "cc", which is the case when "cc" falls inside one of the punctuation
 * ranges listed below.  Return zero otherwise.
 */
int
utf_eat_space(int cc)
{
    // Inclusive {first, last} code point ranges.
    static const int punct_ranges[][2] =
    {
        {0x2000, 0x206f},   // General punctuations
        {0x2e00, 0x2e7f},   // Supplemental punctuations
        {0x3000, 0x303f},   // CJK symbols and punctuations
        {0xff01, 0xff0f},   // Full width ASCII punctuations
        {0xff1a, 0xff20},   // ..
        {0xff3b, 0xff40},   // ..
        {0xff5b, 0xff65},   // ..
    };
    int range_count = (int)(sizeof(punct_ranges) / sizeof(punct_ranges[0]));
    int idx;

    for (idx = 0; idx < range_count; ++idx)
    {
        if (cc >= punct_ranges[idx][0] && cc <= punct_ranges[idx][1])
            return 1;
    }
    return 0;
}
/*
 * Return zero when "cc" is one of the characters that must not start a
 * line, which means no line break is allowed before it.  Return non-zero
 * for every other character.
 */
int
utf_allow_break_before(int cc)
{
    // Characters prohibited at the beginning of a line.
    // The entries must stay in ascending order: the table is searched with
    // a binary search.
    static const int no_line_start[] =
    {
        '!',
        '%',
        ')',
        ',',
        ':',
        ';',
        '>',
        '?',
        ']',
        '}',
        0x2019, // ’ right single quotation mark
        0x201d, // ” right double quotation mark
        0x2020, // † dagger
        0x2021, // ‡ double dagger
        0x2026, // … horizontal ellipsis
        0x2030, // ‰ per mille sign
        0x2031, // ‱ per ten thousand sign
        0x203c, // ‼ double exclamation mark
        0x2047, // ⁇ double question mark
        0x2048, // ⁈ question exclamation mark
        0x2049, // ⁉ exclamation question mark
        0x2103, // ℃ degree celsius
        0x2109, // ℉ degree fahrenheit
        0x3001, // 、 ideographic comma
        0x3002, // 。 ideographic full stop
        0x3009, // 〉 right angle bracket
        0x300b, // 》 right double angle bracket
        0x300d, // 」 right corner bracket
        0x300f, // 』 right white corner bracket
        0x3011, // 】 right black lenticular bracket
        0x3015, // 〕 right tortoise shell bracket
        0x3017, // 〗 right white lenticular bracket
        0x3019, // 〙 right white tortoise shell bracket
        0x301b, // 〛 right white square bracket
        0xff01, // ！ fullwidth exclamation mark
        0xff09, // ） fullwidth right parenthesis
        0xff0c, // ， fullwidth comma
        0xff0e, // ． fullwidth full stop
        0xff1a, // ： fullwidth colon
        0xff1b, // ； fullwidth semicolon
        0xff1f, // ？ fullwidth question mark
        0xff3d, // ］ fullwidth right square bracket
        0xff5d, // ｝ fullwidth right curly bracket
    };
    int count = (int)(sizeof(no_line_start) / sizeof(no_line_start[0]));
    int lo = 0;
    int hi = count;

    // Narrow [lo, hi) down to the first entry that is not below "cc".
    while (lo < hi)
    {
        int probe = lo + (hi - lo) / 2;

        if (no_line_start[probe] < cc)
            lo = probe + 1;
        else
            hi = probe;
    }

    // Breaking is allowed unless that entry is "cc" itself.
    return lo == count || no_line_start[lo] != cc;
}
/*
 * Return zero when "cc" is one of the characters that must not end a line,
 * which means no line break is allowed after it.  Return non-zero for every
 * other character.
 */
static int
utf_allow_break_after(int cc)
{
    // Characters prohibited at the end of a line.
    // The entries must stay in ascending order: the table is searched with
    // a binary search.
    static const int no_line_end[] =
    {
        '(',
        '<',
        '[',
        '`',
        '{',
        //0x2014, // — em dash
        0x2018, // ‘ left single quotation mark
        0x201c, // “ left double quotation mark
        //0x2053, // ⁓ swung dash
        0x3008, // 〈 left angle bracket
        0x300a, // 《 left double angle bracket
        0x300c, // 「 left corner bracket
        0x300e, // 『 left white corner bracket
        0x3010, // 【 left black lenticular bracket
        0x3014, // 〔 left tortoise shell bracket
        0x3016, // 〖 left white lenticular bracket
        0x3018, // 〘 left white tortoise shell bracket
        0x301a, // 〚 left white square bracket
        0xff08, // （ fullwidth left parenthesis
        0xff3b, // ［ fullwidth left square bracket
        0xff5b, // ｛ fullwidth left curly bracket
    };
    int count = (int)(sizeof(no_line_end) / sizeof(no_line_end[0]));
    int lo = 0;
    int hi = count;

    // Narrow [lo, hi) down to the first entry that is not below "cc".
    while (lo < hi)
    {
        int probe = lo + (hi - lo) / 2;

        if (no_line_end[probe] < cc)
            lo = probe + 1;
        else
            hi = probe;
    }

    // Breaking is allowed unless that entry is "cc" itself.
    return lo == count || no_line_end[lo] != cc;
}
/*
 * Return non-zero when a line break may be placed between character "cc"
 * and the character that follows it, "ncc".  Return zero when the two
 * characters have to stay on the same line.
 */
int
utf_allow_break(int cc, int ncc)
{
    // An em dash (0x2014) or a horizontal ellipsis (0x2026) written twice
    // in a row is one two-letter punctuation: never split the pair.
    int is_doubled_punct = (cc == 0x2014 || cc == 0x2026) && ncc == cc;

    if (is_doubled_punct)
        return FALSE;

    // "cc" must be allowed to end a line...
    if (!utf_allow_break_after(cc))
        return 0;

    // ...and "ncc" must be allowed to start the next one.
    return utf_allow_break_before(ncc) != 0;
}
/* /*
* Copy a character from "*fp" to "*tp" and advance the pointers. * Copy a character from "*fp" to "*tp" and advance the pointers.
*/ */

View File

@@ -1967,7 +1967,10 @@ do_join(
&& (!has_format_option(FO_MBYTE_JOIN) && (!has_format_option(FO_MBYTE_JOIN)
|| (mb_ptr2char(curr) < 0x100 && endcurr1 < 0x100)) || (mb_ptr2char(curr) < 0x100 && endcurr1 < 0x100))
&& (!has_format_option(FO_MBYTE_JOIN2) && (!has_format_option(FO_MBYTE_JOIN2)
|| mb_ptr2char(curr) < 0x100 || endcurr1 < 0x100) || (mb_ptr2char(curr) < 0x100
&& !(enc_utf8 && utf_eat_space(endcurr1)))
|| (endcurr1 < 0x100
&& !(enc_utf8 && utf_eat_space(mb_ptr2char(curr)))))
) )
{ {
// don't add a space if the line is ending in a space // don't add a space if the line is ending in a space

View File

@@ -141,12 +141,13 @@
#define FO_ONE_LETTER '1' #define FO_ONE_LETTER '1'
#define FO_WHITE_PAR 'w' // trailing white space continues paragr. #define FO_WHITE_PAR 'w' // trailing white space continues paragr.
#define FO_AUTO 'a' // automatic formatting #define FO_AUTO 'a' // automatic formatting
#define FO_RIGOROUS_TW ']' // respect textwidth rigorously
#define FO_REMOVE_COMS 'j' // remove comment leaders when joining lines #define FO_REMOVE_COMS 'j' // remove comment leaders when joining lines
#define FO_PERIOD_ABBR 'p' // don't break a single space after a period #define FO_PERIOD_ABBR 'p' // don't break a single space after a period
#define DFLT_FO_VI "vt" #define DFLT_FO_VI "vt"
#define DFLT_FO_VIM "tcq" #define DFLT_FO_VIM "tcq"
#define FO_ALL "tcroq2vlb1mMBn,awjp" // for do_set() #define FO_ALL "tcroq2vlb1mMBn,aw]jp" // for do_set()
// characters for the p_cpo option: // characters for the p_cpo option:
#define CPO_ALTREAD 'a' // ":read" sets alternate file name #define CPO_ALTREAD 'a' // ":read" sets alternate file name

View File

@@ -52,6 +52,9 @@ void show_utf8(void);
int latin_head_off(char_u *base, char_u *p); int latin_head_off(char_u *base, char_u *p);
int dbcs_screen_head_off(char_u *base, char_u *p); int dbcs_screen_head_off(char_u *base, char_u *p);
int utf_head_off(char_u *base, char_u *p); int utf_head_off(char_u *base, char_u *p);
int utf_eat_space(int cc);
int utf_allow_break_before(int cc);
int utf_allow_break(int cc, int ncc);
void mb_copy_char(char_u **fp, char_u **tp); void mb_copy_char(char_u **fp, char_u **tp);
int mb_off_next(char_u *base, char_u *p); int mb_off_next(char_u *base, char_u *p);
int mb_tail_off(char_u *base, char_u *p); int mb_tail_off(char_u *base, char_u *p);

View File

@@ -85,6 +85,7 @@ NEW_TESTS = \
test_charsearch_utf8 \ test_charsearch_utf8 \
test_checkpath \ test_checkpath \
test_cindent \ test_cindent \
test_cjk_linebreak \
test_clientserver \ test_clientserver \
test_close_count \ test_close_count \
test_cmdline \ test_cmdline \
@@ -333,6 +334,7 @@ NEW_TESTS_RES = \
test_charsearch.res \ test_charsearch.res \
test_checkpath.res \ test_checkpath.res \
test_cindent.res \ test_cindent.res \
test_cjk_linebreak.res \
test_clientserver.res \ test_clientserver.res \
test_close_count.res \ test_close_count.res \
test_cmdline.res \ test_cmdline.res \

View File

@@ -0,0 +1,91 @@
scriptencoding utf-8
" For every character that may not start a line (the same set as the
" BOL_prohibition_punct table in utf_allow_break_before()), format a line in
" which that character directly follows six CJK characters and check that it
" is kept at the end of the first line.
" 'formatoptions' is set by the caller; 'textwidth' is set to 12 here.
func Run_cjk_linebreak_after()
  set textwidth=12
  for punct in [
        \ '!', '%', ')', ',', ':', ';', '>', '?', ']', '}', '’', '”', '†', '‡',
        \ '…', '‰', '‱', '‼', '⁇', '⁈', '⁉', '℃', '℉', '、', '。', '〉', '》',
        \ '」', '』', '】', '〕', '〗', '〙', '〛', '！', '）', '，', '．', '：',
        \ '；', '？', '］', '｝']
    call setline('.', '这是一个测试'.punct.'试试 CJK 行禁则补丁。')
    normal gqq
    " The break must come after the punctuation, not before it.
    call assert_equal('这是一个测试'.punct, getline(1))
    " Empty the buffer for the next character.
    %d_
  endfor
endfunc
" Run the "break after punctuation" checks without the ']' flag in
" 'formatoptions'.
func Test_cjk_linebreak_after()
  let &formatoptions = 'croqn2mB1j'
  call Run_cjk_linebreak_after()
endfunc
" TODO: this test fails
"func Test_cjk_linebreak_after_rigorous()
" set formatoptions=croqn2mB1j]
" call Run_cjk_linebreak_after()
"endfunc
" For every character that may not end a line (the same set as the
" EOL_prohibition_punct table in utf_allow_break_after()), format a line in
" which that character directly follows five CJK characters and check that
" the first line ends before it.
" 'formatoptions' is set by the caller; 'textwidth' is set to 12 here.
func Run_cjk_linebreak_before()
  set textwidth=12
  for punct in [
        \ '(', '<', '[', '`', '{', '‘', '“', '〈', '《', '「', '『', '【', '〔',
        \ '〖', '〘', '〚', '（', '［', '｛']
    call setline('.', '这是个测试'.punct.'试试 CJK 行禁则补丁。')
    normal gqq
    " The break must come before the punctuation, not after it.
    call assert_equal('这是个测试', getline(1))
    " Empty the buffer for the next character.
    %d_
  endfor
endfunc
" Run the "break before punctuation" checks without the ']' flag in
" 'formatoptions'.
func Test_cjk_linebreak_before()
  let &formatoptions = 'croqn2mB1j'
  call Run_cjk_linebreak_before()
endfunc
" Run the "break before punctuation" checks with the ']' flag included in
" 'formatoptions'.
func Test_cjk_linebreak_before_rigorous()
  let &formatoptions = 'croqn2mB1j]'
  call Run_cjk_linebreak_before()
endfunc
" Check how lines are broken around the doubled punctuation marks "……" and
" "——".  'formatoptions' is set by the caller; 'textwidth' is set to 12 and
" 'ambiwidth' to "double" before each formatting command.
func Run_cjk_linebreak_nobetween()
  " …… must not start a line
  call setline('.', '这是个测试……试试 CJK 行禁则补丁。')
  let &textwidth = 12
  let &ambiwidth = 'double'
  normal gqq
  " TODO: this fails
  " call assert_equal('这是个测试……', getline(1))
  %d_

  " With one more leading character the first line ends one character
  " before the "……".
  call setline('.', '这是一个测试……试试 CJK 行禁则补丁。')
  let &textwidth = 12
  let &ambiwidth = 'double'
  normal gqq
  call assert_equal('这是一个测', getline(1))
  %d_

  " but —— can
  call setline('.', '这是个测试——试试 CJK 行禁则补丁。')
  let &textwidth = 12
  let &ambiwidth = 'double'
  normal gqq
  call assert_equal('这是个测试', getline(1))
endfunc
" Run the doubled-punctuation checks without the ']' flag in
" 'formatoptions'.
func Test_cjk_linebreak_nobetween()
  let &formatoptions = 'croqn2mB1j'
  call Run_cjk_linebreak_nobetween()
endfunc
" Run the doubled-punctuation checks with the ']' flag included in
" 'formatoptions'.
func Test_cjk_linebreak_nobetween_rigorous()
  let &formatoptions = 'croqn2mB1j]'
  call Run_cjk_linebreak_nobetween()
endfunc
" Join a line that ends in CJK punctuation with a line of ASCII text and
" check that no space is inserted between the punctuation and the text.
func Test_cjk_linebreak_join_punct()
  for punct in ['——', '〗', '，', '。', '……']
    call setline(1, '文本文本'.punct)
    call setline(2, 'English')
    set formatoptions=croqn2mB1j
    normal ggJ
    call assert_equal('文本文本'.punct.'English', getline(1))
    " Empty the buffer for the next punctuation.
    %d_
  endfor
endfunc

View File

@@ -45,10 +45,12 @@ internal_format(
int c) // character to be inserted (can be NUL) int c) // character to be inserted (can be NUL)
{ {
int cc; int cc;
int skip_pos;
int save_char = NUL; int save_char = NUL;
int haveto_redraw = FALSE; int haveto_redraw = FALSE;
int fo_ins_blank = has_format_option(FO_INS_BLANK); int fo_ins_blank = has_format_option(FO_INS_BLANK);
int fo_multibyte = has_format_option(FO_MBYTE_BREAK); int fo_multibyte = has_format_option(FO_MBYTE_BREAK);
int fo_rigor_tw = has_format_option(FO_RIGOROUS_TW);
int fo_white_par = has_format_option(FO_WHITE_PAR); int fo_white_par = has_format_option(FO_WHITE_PAR);
int first_line = TRUE; int first_line = TRUE;
colnr_T leader_len; colnr_T leader_len;
@@ -125,6 +127,7 @@ internal_format(
curwin->w_cursor.col = startcol; curwin->w_cursor.col = startcol;
foundcol = 0; foundcol = 0;
skip_pos = 0;
// Find position to break at. // Find position to break at.
// Stop at first entered white when 'formatoptions' has 'v' // Stop at first entered white when 'formatoptions' has 'v'
@@ -189,8 +192,11 @@ internal_format(
if (curwin->w_cursor.col <= (colnr_T)wantcol) if (curwin->w_cursor.col <= (colnr_T)wantcol)
break; break;
} }
else if (cc >= 0x100 && fo_multibyte) else if ((cc >= 0x100 || !utf_allow_break_before(cc)) && fo_multibyte)
{ {
int ncc;
int allow_break;
// Break after or before a multi-byte character. // Break after or before a multi-byte character.
if (curwin->w_cursor.col != startcol) if (curwin->w_cursor.col != startcol)
{ {
@@ -199,8 +205,14 @@ internal_format(
break; break;
col = curwin->w_cursor.col; col = curwin->w_cursor.col;
inc_cursor(); inc_cursor();
// Don't change end_foundcol if already set. ncc = gchar_cursor();
if (foundcol != curwin->w_cursor.col)
allow_break =
(enc_utf8 && utf_allow_break(cc, ncc))
|| enc_dbcs;
// If we have already checked this position, skip!
if (curwin->w_cursor.col != skip_pos && allow_break)
{ {
foundcol = curwin->w_cursor.col; foundcol = curwin->w_cursor.col;
end_foundcol = foundcol; end_foundcol = foundcol;
@@ -213,6 +225,7 @@ internal_format(
if (curwin->w_cursor.col == 0) if (curwin->w_cursor.col == 0)
break; break;
ncc = cc;
col = curwin->w_cursor.col; col = curwin->w_cursor.col;
dec_cursor(); dec_cursor();
@@ -220,16 +233,65 @@ internal_format(
if (WHITECHAR(cc)) if (WHITECHAR(cc))
continue; // break with space continue; // break with space
// Don't break until after the comment leader // Don't break until after the comment leader.
if (curwin->w_cursor.col < leader_len) if (curwin->w_cursor.col < leader_len)
break; break;
curwin->w_cursor.col = col; curwin->w_cursor.col = col;
skip_pos = curwin->w_cursor.col;
allow_break =
(enc_utf8 && utf_allow_break(cc, ncc))
|| enc_dbcs;
// Must handle this to respect line break prohibition.
if (allow_break)
{
foundcol = curwin->w_cursor.col; foundcol = curwin->w_cursor.col;
end_foundcol = foundcol; end_foundcol = foundcol;
}
if (curwin->w_cursor.col <= (colnr_T)wantcol) if (curwin->w_cursor.col <= (colnr_T)wantcol)
{
int ncc_allow_break =
(enc_utf8 && utf_allow_break_before(ncc)) || enc_dbcs;
if (allow_break)
break; break;
if (!ncc_allow_break && !fo_rigor_tw)
{
// Enable at most 1 punct hang outside of textwidth.
if (curwin->w_cursor.col == startcol)
{
// We are inserting a non-breakable char, postpone
// line break check to next insert.
end_foundcol = foundcol = 0;
break;
}
// Neither cc nor ncc is NUL if we are here, so
// it's safe to inc_cursor.
col = curwin->w_cursor.col;
inc_cursor();
cc = ncc;
ncc = gchar_cursor();
// handle insert
ncc = (ncc != NUL) ? ncc : c;
allow_break =
(enc_utf8 && utf_allow_break(cc, ncc))
|| enc_dbcs;
if (allow_break)
{
// Break only when we are not at end of line.
end_foundcol = foundcol =
ncc == NUL? 0 : curwin->w_cursor.col;
break;
}
curwin->w_cursor.col = col;
}
}
} }
if (curwin->w_cursor.col == 0) if (curwin->w_cursor.col == 0)
break; break;

View File

@@ -746,6 +746,8 @@ static char *(features[]) =
static int included_patches[] = static int included_patches[] =
{ /* Add new patch number below this line */ { /* Add new patch number below this line */
/**/
901,
/**/ /**/
900, 900,
/**/ /**/