To: vim_dev@googlegroups.com Subject: Patch 8.2.0901 Fcc: outbox From: Bram Moolenaar Mime-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ------------ Patch 8.2.0901 Problem: Formatting CJK text isn't optimal. Solution: Properly break CJK lines. (closes #3875) Files: runtime/doc/change.txt, src/mbyte.c, src/ops.c, src/option.h, src/proto/mbyte.pro, src/testdir/Make_all.mak, src/textformat.c, src/testdir/test_cjk_linebreak.vim *** ../vim-8.2.0900/runtime/doc/change.txt 2019-12-17 21:27:14.686319918 +0100 --- runtime/doc/change.txt 2020-06-04 17:56:02.629097404 +0200 *************** *** 1686,1691 **** --- 1688,1697 ---- characters. Overruled by the 'M' flag. 1 Don't break a line after a one-letter word. It's broken before it instead (if possible). + ] Respect textwidth rigorously. With this flag set, no line can be + longer than textwidth, unless line-break-prohibition rules make this + impossible. Mainly for CJK scripts and works only if 'encoding' is + "utf-8". j Where it makes sense, remove a comment leader when joining lines. For example, joining: int i; // the index ~ *** ../vim-8.2.0900/src/mbyte.c 2020-06-01 14:34:22.027462262 +0200 --- src/mbyte.c 2020-06-04 18:19:20.338981249 +0200 *************** *** 3843,3848 **** --- 3843,4000 ---- } /* + * Whether space is NOT allowed before/after "cc". + */ + int + utf_eat_space(int cc) + { + return ((cc >= 0x2000 && cc <= 0x206F) // General punctuations + || (cc >= 0x2e00 && cc <= 0x2e7f) // Supplemental punctuations + || (cc >= 0x3000 && cc <= 0x303f) // CJK symbols and punctuations + || (cc >= 0xff01 && cc <= 0xff0f) // Full width ASCII punctuations + || (cc >= 0xff1a && cc <= 0xff20) // .. + || (cc >= 0xff3b && cc <= 0xff40) // .. + || (cc >= 0xff5b && cc <= 0xff65)); // .. + } + + /* + * Whether line break is allowed before "cc". 
+ */ + int + utf_allow_break_before(int cc) + { + static const int BOL_prohibition_punct[] = + { + '!', + '%', + ')', + ',', + ':', + ';', + '>', + '?', + ']', + '}', + 0x2019, // ’ right single quotation mark + 0x201d, // ” right double quotation mark + 0x2020, // † dagger + 0x2021, // ‡ double dagger + 0x2026, // … horizontal ellipsis + 0x2030, // ‰ per mille sign + 0x2031, // ‱ per ten thousand sign + 0x203c, // ‼ double exclamation mark + 0x2047, // ⁇ double question mark + 0x2048, // ⁈ question exclamation mark + 0x2049, // ⁉ exclamation question mark + 0x2103, // ℃ degree celsius + 0x2109, // ℉ degree fahrenheit + 0x3001, // 、 ideographic comma + 0x3002, // 。 ideographic full stop + 0x3009, // 〉 right angle bracket + 0x300b, // 》 right double angle bracket + 0x300d, // 」 right corner bracket + 0x300f, // 』 right white corner bracket + 0x3011, // 】 right black lenticular bracket + 0x3015, // 〕 right tortoise shell bracket + 0x3017, // 〗 right white lenticular bracket + 0x3019, // 〙 right white tortoise shell bracket + 0x301b, // 〛 right white square bracket + 0xff01, // ! fullwidth exclamation mark + 0xff09, // ) fullwidth right parenthesis + 0xff0c, // , fullwidth comma + 0xff0e, // . fullwidth full stop + 0xff1a, // : fullwidth colon + 0xff1b, // ; fullwidth semicolon + 0xff1f, // ? fullwidth question mark + 0xff3d, // ] fullwidth right square bracket + 0xff5d, // } fullwidth right curly bracket + }; + + int first = 0; + int last = sizeof(BOL_prohibition_punct)/sizeof(int) - 1; + int mid = 0; + + while (first < last) + { + mid = (first + last)/2; + + if (cc == BOL_prohibition_punct[mid]) + return FALSE; + else if (cc > BOL_prohibition_punct[mid]) + first = mid + 1; + else + last = mid - 1; + } + + return cc != BOL_prohibition_punct[first]; + } + + /* + * Whether line break is allowed after "cc". 
+ */ + static int + utf_allow_break_after(int cc) + { + static const int EOL_prohibition_punct[] = + { + '(', + '<', + '[', + '`', + '{', + //0x2014, // — em dash + 0x2018, // ‘ left single quotation mark + 0x201c, // “ left double quotation mark + //0x2053, // ~ swung dash + 0x3008, // 〈 left angle bracket + 0x300a, // 《 left double angle bracket + 0x300c, // 「 left corner bracket + 0x300e, // 『 left white corner bracket + 0x3010, // 【 left black lenticular bracket + 0x3014, // 〔 left tortoise shell bracket + 0x3016, // 〖 left white lenticular bracket + 0x3018, // 〘 left white tortoise shell bracket + 0x301a, // 〚 left white square bracket + 0xff08, // ( fullwidth left parenthesis + 0xff3b, // [ fullwidth left square bracket + 0xff5b, // { fullwidth left curly bracket + }; + + int first = 0; + int last = sizeof(EOL_prohibition_punct)/sizeof(int) - 1; + int mid = 0; + + while (first < last) + { + mid = (first + last)/2; + + if (cc == EOL_prohibition_punct[mid]) + return FALSE; + else if (cc > EOL_prohibition_punct[mid]) + first = mid + 1; + else + last = mid - 1; + } + + return cc != EOL_prohibition_punct[first]; + } + + /* + * Whether line break is allowed between "cc" and "ncc". + */ + int + utf_allow_break(int cc, int ncc) + { + // don't break between two-letter punctuations + if (cc == ncc + && (cc == 0x2014 // em dash + || cc == 0x2026)) // horizontal ellipsis + return FALSE; + + return utf_allow_break_after(cc) && utf_allow_break_before(ncc); + } + + /* * Copy a character from "*fp" to "*tp" and advance the pointers. */ void *** ../vim-8.2.0900/src/ops.c 2020-06-01 19:14:09.050505748 +0200 --- src/ops.c 2020-06-04 17:38:21.854286486 +0200 *************** *** 1967,1973 **** && (!has_format_option(FO_MBYTE_JOIN) || (mb_ptr2char(curr) < 0x100 && endcurr1 < 0x100)) && (!has_format_option(FO_MBYTE_JOIN2) ! 
|| mb_ptr2char(curr) < 0x100 || endcurr1 < 0x100) ) { // don't add a space if the line is ending in a space --- 1967,1976 ---- && (!has_format_option(FO_MBYTE_JOIN) || (mb_ptr2char(curr) < 0x100 && endcurr1 < 0x100)) && (!has_format_option(FO_MBYTE_JOIN2) ! || (mb_ptr2char(curr) < 0x100 ! && !(enc_utf8 && utf_eat_space(endcurr1))) ! || (endcurr1 < 0x100 ! && !(enc_utf8 && utf_eat_space(mb_ptr2char(curr))))) ) { // don't add a space if the line is ending in a space *** ../vim-8.2.0900/src/option.h 2020-05-31 23:11:02.082515688 +0200 --- src/option.h 2020-06-04 17:38:21.854286486 +0200 *************** *** 141,152 **** #define FO_ONE_LETTER '1' #define FO_WHITE_PAR 'w' // trailing white space continues paragr. #define FO_AUTO 'a' // automatic formatting #define FO_REMOVE_COMS 'j' // remove comment leaders when joining lines #define FO_PERIOD_ABBR 'p' // don't break a single space after a period #define DFLT_FO_VI "vt" #define DFLT_FO_VIM "tcq" ! #define FO_ALL "tcroq2vlb1mMBn,awjp" // for do_set() // characters for the p_cpo option: #define CPO_ALTREAD 'a' // ":read" sets alternate file name --- 141,153 ---- #define FO_ONE_LETTER '1' #define FO_WHITE_PAR 'w' // trailing white space continues paragr. #define FO_AUTO 'a' // automatic formatting + #define FO_RIGOROUS_TW ']' // respect textwidth rigorously #define FO_REMOVE_COMS 'j' // remove comment leaders when joining lines #define FO_PERIOD_ABBR 'p' // don't break a single space after a period #define DFLT_FO_VI "vt" #define DFLT_FO_VIM "tcq" ! 
#define FO_ALL "tcroq2vlb1mMBn,aw]jp" // for do_set() // characters for the p_cpo option: #define CPO_ALTREAD 'a' // ":read" sets alternate file name *** ../vim-8.2.0900/src/proto/mbyte.pro 2020-06-01 14:34:22.027462262 +0200 --- src/proto/mbyte.pro 2020-06-04 18:00:03.751806728 +0200 *************** *** 52,57 **** --- 52,60 ---- int latin_head_off(char_u *base, char_u *p); int dbcs_screen_head_off(char_u *base, char_u *p); int utf_head_off(char_u *base, char_u *p); + int utf_eat_space(int cc); + int utf_allow_break_before(int cc); + int utf_allow_break(int cc, int ncc); void mb_copy_char(char_u **fp, char_u **tp); int mb_off_next(char_u *base, char_u *p); int mb_tail_off(char_u *base, char_u *p); *** ../vim-8.2.0900/src/testdir/Make_all.mak 2020-06-04 15:52:06.095922759 +0200 --- src/testdir/Make_all.mak 2020-06-04 17:38:21.854286486 +0200 *************** *** 85,90 **** --- 85,91 ---- test_charsearch_utf8 \ test_checkpath \ test_cindent \ + test_cjk_linebreak \ test_clientserver \ test_close_count \ test_cmdline \ *************** *** 333,338 **** --- 334,340 ---- test_charsearch.res \ test_checkpath.res \ test_cindent.res \ + test_cjk_linebreak.res \ test_clientserver.res \ test_close_count.res \ test_cmdline.res \ *** ../vim-8.2.0900/src/textformat.c 2020-05-01 14:26:17.132949262 +0200 --- src/textformat.c 2020-06-04 18:16:11.963699002 +0200 *************** *** 45,54 **** --- 45,56 ---- int c) // character to be inserted (can be NUL) { int cc; + int skip_pos; int save_char = NUL; int haveto_redraw = FALSE; int fo_ins_blank = has_format_option(FO_INS_BLANK); int fo_multibyte = has_format_option(FO_MBYTE_BREAK); + int fo_rigor_tw = has_format_option(FO_RIGOROUS_TW); int fo_white_par = has_format_option(FO_WHITE_PAR); int first_line = TRUE; colnr_T leader_len; *************** *** 125,130 **** --- 127,133 ---- curwin->w_cursor.col = startcol; foundcol = 0; + skip_pos = 0; // Find position to break at. 
// Stop at first entered white when 'formatoptions' has 'v' *************** *** 189,196 **** if (curwin->w_cursor.col <= (colnr_T)wantcol) break; } ! else if (cc >= 0x100 && fo_multibyte) { // Break after or before a multi-byte character. if (curwin->w_cursor.col != startcol) { --- 192,202 ---- if (curwin->w_cursor.col <= (colnr_T)wantcol) break; } ! else if ((cc >= 0x100 || !utf_allow_break_before(cc)) && fo_multibyte) { + int ncc; + int allow_break; + // Break after or before a multi-byte character. if (curwin->w_cursor.col != startcol) { *************** *** 199,206 **** break; col = curwin->w_cursor.col; inc_cursor(); ! // Don't change end_foundcol if already set. ! if (foundcol != curwin->w_cursor.col) { foundcol = curwin->w_cursor.col; end_foundcol = foundcol; --- 205,218 ---- break; col = curwin->w_cursor.col; inc_cursor(); ! ncc = gchar_cursor(); ! ! allow_break = ! (enc_utf8 && utf_allow_break(cc, ncc)) ! || enc_dbcs; ! ! // If we have already checked this position, skip! ! if (curwin->w_cursor.col != skip_pos && allow_break) { foundcol = curwin->w_cursor.col; end_foundcol = foundcol; *************** *** 213,218 **** --- 225,231 ---- if (curwin->w_cursor.col == 0) break; + ncc = cc; col = curwin->w_cursor.col; dec_cursor(); *************** *** 220,235 **** if (WHITECHAR(cc)) continue; // break with space ! // Don't break until after the comment leader if (curwin->w_cursor.col < leader_len) break; curwin->w_cursor.col = col; ! foundcol = curwin->w_cursor.col; ! end_foundcol = foundcol; if (curwin->w_cursor.col <= (colnr_T)wantcol) ! break; } if (curwin->w_cursor.col == 0) break; --- 233,297 ---- if (WHITECHAR(cc)) continue; // break with space ! // Don't break until after the comment leader. if (curwin->w_cursor.col < leader_len) break; curwin->w_cursor.col = col; + skip_pos = curwin->w_cursor.col; ! allow_break = ! (enc_utf8 && utf_allow_break(cc, ncc)) ! || enc_dbcs; ! ! // Must handle this to respect line break prohibition. ! if (allow_break) ! { ! 
foundcol = curwin->w_cursor.col; ! end_foundcol = foundcol; ! } if (curwin->w_cursor.col <= (colnr_T)wantcol) ! { ! int ncc_allow_break = ! (enc_utf8 && utf_allow_break_before(ncc)) || enc_dbcs; ! ! if (allow_break) ! break; ! if (!ncc_allow_break && !fo_rigor_tw) ! { ! // Enable at most 1 punct hang outside of textwidth. ! if (curwin->w_cursor.col == startcol) ! { ! // We are inserting a non-breakable char, postpone ! // line break check to next insert. ! end_foundcol = foundcol = 0; ! break; ! } ! ! // Neither cc nor ncc is NUL if we are here, so ! // it's safe to inc_cursor. ! col = curwin->w_cursor.col; ! ! inc_cursor(); ! cc = ncc; ! ncc = gchar_cursor(); ! // handle insert ! ncc = (ncc != NUL) ? ncc : c; ! ! allow_break = ! (enc_utf8 && utf_allow_break(cc, ncc)) ! || enc_dbcs; ! ! if (allow_break) ! { ! // Break only when we are not at end of line. ! end_foundcol = foundcol = ! ncc == NUL? 0 : curwin->w_cursor.col; ! break; ! } ! curwin->w_cursor.col = col; ! } ! } } if (curwin->w_cursor.col == 0) break; *** ../vim-8.2.0900/src/testdir/test_cjk_linebreak.vim 2020-06-04 18:21:24.394514113 +0200 --- src/testdir/test_cjk_linebreak.vim 2020-06-04 18:10:04.553145403 +0200 *************** *** 0 **** --- 1,91 ---- + scriptencoding utf-8 + + func Run_cjk_linebreak_after() + set textwidth=12 + for punct in [ + \ '!', '%', ')', ',', ':', ';', '>', '?', ']', '}', '’', '”', '†', '‡', + \ '…', '‰', '‱', '‼', '⁇', '⁈', '⁉', '℃', '℉', '、', '。', '〉', '》', + \ '」', '』', '】', '〕', '〗', '〙', '〛', '!', ')', ',', '.', ':', + \ ';', '?', ']', '}'] + call setline('.', '这是一个测试'.punct.'试试 CJK 行禁则补丁。') + normal gqq + call assert_equal('这是一个测试'.punct, getline(1)) + %d_ + endfor + endfunc + + func Test_cjk_linebreak_after() + set formatoptions=croqn2mB1j + call Run_cjk_linebreak_after() + endfunc + + " TODO: this test fails + "func Test_cjk_linebreak_after_rigorous() + " set formatoptions=croqn2mB1j] + " call Run_cjk_linebreak_after() + "endfunc + + func Run_cjk_linebreak_before() + set 
textwidth=12 + for punct in [ + \ '(', '<', '[', '`', '{', '‘', '“', '〈', '《', '「', '『', '【', '〔', + \ '〖', '〘', '〚', '(', '[', '{'] + call setline('.', '这是个测试'.punct.'试试 CJK 行禁则补丁。') + normal gqq + call assert_equal('这是个测试', getline(1)) + %d_ + endfor + endfunc + + func Test_cjk_linebreak_before() + set formatoptions=croqn2mB1j + call Run_cjk_linebreak_before() + endfunc + + func Test_cjk_linebreak_before_rigorous() + set formatoptions=croqn2mB1j] + call Run_cjk_linebreak_before() + endfunc + + func Run_cjk_linebreak_nobetween() + " …… must not start a line + call setline('.', '这是个测试……试试 CJK 行禁则补丁。') + set textwidth=12 ambiwidth=double + normal gqq + " TODO: this fails + " call assert_equal('这是个测试……', getline(1)) + %d_ + + call setline('.', '这是一个测试……试试 CJK 行禁则补丁。') + set textwidth=12 ambiwidth=double + normal gqq + call assert_equal('这是一个测', getline(1)) + %d_ + + " but —— can + call setline('.', '这是个测试——试试 CJK 行禁则补丁。') + set textwidth=12 ambiwidth=double + normal gqq + call assert_equal('这是个测试', getline(1)) + endfunc + + func Test_cjk_linebreak_nobetween() + set formatoptions=croqn2mB1j + call Run_cjk_linebreak_nobetween() + endfunc + + func Test_cjk_linebreak_nobetween_rigorous() + set formatoptions=croqn2mB1j] + call Run_cjk_linebreak_nobetween() + endfunc + + func Test_cjk_linebreak_join_punct() + for punct in ['——', '〗', ',', '。', '……'] + call setline(1, '文本文本'.punct) + call setline(2, 'English') + set formatoptions=croqn2mB1j + normal ggJ + call assert_equal('文本文本'.punct.'English', getline(1)) + %d_ + endfor + endfunc *** ../vim-8.2.0900/src/version.c 2020-06-04 17:19:01.581522349 +0200 --- src/version.c 2020-06-04 17:39:49.046032743 +0200 *************** *** 748,749 **** --- 748,751 ---- { /* Add new patch number below this line */ + /**/ + 901, /**/ -- MAN: You don't frighten us, English pig-dog! Go and boil your bottoms, son of a silly person. I blow my nose on you, so-called Arthur-king, you and your silly English K...kaniggets. 
He puts hands to his ears and blows a raspberry. "Monty Python and the Holy Grail" PYTHON (MONTY) PICTURES LTD /// Bram Moolenaar -- Bram@Moolenaar.net -- http://www.Moolenaar.net \\\ /// sponsor Vim, vote for features -- http://www.Vim.org/sponsor/ \\\ \\\ an exciting new programming language -- http://www.Zimbu.org /// \\\ help me help AIDS victims -- http://ICCF-Holland.org ///