To: vim_dev@googlegroups.com
Subject: Patch 8.2.0901
Fcc: outbox
From: Bram Moolenaar <Bram@moolenaar.net>
Mime-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
------------

Patch 8.2.0901
Problem:    Formatting CJK text isn't optimal.
Solution:   Properly break CJK lines. (closes #3875)
Files:      runtime/doc/change.txt, src/mbyte.c, src/ops.c, src/option.h,
            src/proto/mbyte.pro, src/testdir/Make_all.mak, src/textformat.c,
            src/testdir/test_cjk_linebreak.vim


*** ../vim-8.2.0900/runtime/doc/change.txt	2019-12-17 21:27:14.686319918 +0100
--- runtime/doc/change.txt	2020-06-04 17:56:02.629097404 +0200
***************
*** 1686,1691 ****
--- 1688,1697 ----
  	characters.  Overruled by the 'M' flag.
  1	Don't break a line after a one-letter word.  It's broken before it
  	instead (if possible).
+ ]	Respect 'textwidth' rigorously. With this flag set, no line can be
+ 	longer than 'textwidth', unless line-break-prohibition rules make this
+ 	impossible.  Mainly for CJK scripts and works only if 'encoding' is
+ 	"utf-8".
  j	Where it makes sense, remove a comment leader when joining lines.  For
  	example, joining:
  		int i;   // the index ~
*** ../vim-8.2.0900/src/mbyte.c	2020-06-01 14:34:22.027462262 +0200
--- src/mbyte.c	2020-06-04 18:19:20.338981249 +0200
***************
*** 3843,3848 ****
--- 3843,4000 ----
  }
  
  /*
+  * Whether space is NOT allowed before/after "cc".
+  */
+     int
+ utf_eat_space(int cc)
+ {
+     return ((cc >= 0x2000 && cc <= 0x206F)	// General Punctuation
+ 	 || (cc >= 0x2e00 && cc <= 0x2e7f)	// Supplemental Punctuation
+ 	 || (cc >= 0x3000 && cc <= 0x303f)	// CJK Symbols and Punctuation
+ 	 || (cc >= 0xff01 && cc <= 0xff0f)	// Fullwidth ASCII punctuation
+ 	 || (cc >= 0xff1a && cc <= 0xff20)	// ..
+ 	 || (cc >= 0xff3b && cc <= 0xff40)	// ..
+ 	 || (cc >= 0xff5b && cc <= 0xff65));	// ..
+ }
+ 
+ /*
+  * Whether a line break is allowed before "cc" (table must stay sorted).
+  */
+     int
+ utf_allow_break_before(int cc)
+ {
+     static const int BOL_prohibition_punct[] =
+     {
+ 	'!',
+ 	'%',
+ 	')',
+ 	',',
+ 	':',
+ 	';',
+ 	'>',
+ 	'?',
+ 	']',
+ 	'}',
+ 	0x2019, // ’ right single quotation mark
+ 	0x201d, // ” right double quotation mark
+ 	0x2020, // † dagger
+ 	0x2021, // ‡ double dagger
+ 	0x2026, // … horizontal ellipsis
+ 	0x2030, // ‰ per mille sign
+ 	0x2031, // ‱ per ten thousand sign
+ 	0x203c, // ‼ double exclamation mark
+ 	0x2047, // ⁇ double question mark
+ 	0x2048, // ⁈ question exclamation mark
+ 	0x2049, // ⁉ exclamation question mark
+ 	0x2103, // ℃ degree celsius
+ 	0x2109, // ℉ degree fahrenheit
+ 	0x3001, // 、 ideographic comma
+ 	0x3002, // 。 ideographic full stop
+ 	0x3009, // 〉 right angle bracket
+ 	0x300b, // 》 right double angle bracket
+ 	0x300d, // 」 right corner bracket
+ 	0x300f, // 』 right white corner bracket
+ 	0x3011, // 】 right black lenticular bracket
+ 	0x3015, // 〕 right tortoise shell bracket
+ 	0x3017, // 〗 right white lenticular bracket
+ 	0x3019, // 〙 right white tortoise shell bracket
+ 	0x301b, // 〛 right white square bracket
+ 	0xff01, // ！ fullwidth exclamation mark
+ 	0xff09, // ） fullwidth right parenthesis
+ 	0xff0c, // ， fullwidth comma
+ 	0xff0e, // ． fullwidth full stop
+ 	0xff1a, // ： fullwidth colon
+ 	0xff1b, // ； fullwidth semicolon
+ 	0xff1f, // ？ fullwidth question mark
+ 	0xff3d, // ］ fullwidth right square bracket
+ 	0xff5d, // ｝ fullwidth right curly bracket
+     };
+ 
+     int first = 0;
+     int last  = sizeof(BOL_prohibition_punct)/sizeof(int) - 1;
+     int mid   = 0;
+ 
+     while (first < last)
+     {
+ 	mid = (first + last)/2;
+ 
+ 	if (cc == BOL_prohibition_punct[mid])
+ 	    return FALSE;
+ 	else if (cc > BOL_prohibition_punct[mid])
+ 	    first = mid + 1;
+ 	else
+ 	    last = mid - 1;
+     }
+ 
+     return cc != BOL_prohibition_punct[first];
+ }
+ 
+ /*
+  * Whether a line break is allowed after "cc" (table must stay sorted).
+  */
+     static int
+ utf_allow_break_after(int cc)
+ {
+     static const int EOL_prohibition_punct[] =
+     {
+ 	'(',
+ 	'<',
+ 	'[',
+ 	'`',
+ 	'{',
+ 	//0x2014, // — em dash
+ 	0x2018, // ‘ left single quotation mark
+ 	0x201c, // “ left double quotation mark
+ 	//0x2053, // ～ swung dash
+ 	0x3008, // 〈 left angle bracket
+ 	0x300a, // 《 left double angle bracket
+ 	0x300c, // 「 left corner bracket
+ 	0x300e, // 『 left white corner bracket
+ 	0x3010, // 【 left black lenticular bracket
+ 	0x3014, // 〔 left tortoise shell bracket
+ 	0x3016, // 〖 left white lenticular bracket
+ 	0x3018, // 〘 left white tortoise shell bracket
+ 	0x301a, // 〚 left white square bracket
+ 	0xff08, // （ fullwidth left parenthesis
+ 	0xff3b, // ［ fullwidth left square bracket
+ 	0xff5b, // ｛ fullwidth left curly bracket
+     };
+ 
+     int first = 0;
+     int last  = sizeof(EOL_prohibition_punct)/sizeof(int) - 1;
+     int mid   = 0;
+ 
+     while (first < last)
+     {
+ 	mid = (first + last)/2;
+ 
+ 	if (cc == EOL_prohibition_punct[mid])
+ 	    return FALSE;
+ 	else if (cc > EOL_prohibition_punct[mid])
+ 	    first = mid + 1;
+ 	else
+ 	    last = mid - 1;
+     }
+ 
+     return cc != EOL_prohibition_punct[first];
+ }
+ 
+ /*
+  * Whether a line break is allowed between "cc" and "ncc".
+  */
+     int
+ utf_allow_break(int cc, int ncc)
+ {
+     // don't break inside a doubled punctuation mark (—— or ……)
+     if (cc == ncc
+ 	    && (cc == 0x2014 // em dash
+ 		|| cc == 0x2026)) // horizontal ellipsis
+ 	return FALSE;
+ 
+     return utf_allow_break_after(cc) && utf_allow_break_before(ncc);
+ }
+ 
+ /*
   * Copy a character from "*fp" to "*tp" and advance the pointers.
   */
      void
*** ../vim-8.2.0900/src/ops.c	2020-06-01 19:14:09.050505748 +0200
--- src/ops.c	2020-06-04 17:38:21.854286486 +0200
***************
*** 1967,1973 ****
  		    && (!has_format_option(FO_MBYTE_JOIN)
  			|| (mb_ptr2char(curr) < 0x100 && endcurr1 < 0x100))
  		    && (!has_format_option(FO_MBYTE_JOIN2)
! 			|| mb_ptr2char(curr) < 0x100 || endcurr1 < 0x100)
  	       )
  	    {
  		// don't add a space if the line is ending in a space
--- 1967,1976 ----
  		    && (!has_format_option(FO_MBYTE_JOIN)
  			|| (mb_ptr2char(curr) < 0x100 && endcurr1 < 0x100))
  		    && (!has_format_option(FO_MBYTE_JOIN2)
! 			|| (mb_ptr2char(curr) < 0x100
! 			    && !(enc_utf8 && utf_eat_space(endcurr1)))
! 			|| (endcurr1 < 0x100
! 			    && !(enc_utf8 && utf_eat_space(mb_ptr2char(curr)))))
  	       )
  	    {
  		// don't add a space if the line is ending in a space
*** ../vim-8.2.0900/src/option.h	2020-05-31 23:11:02.082515688 +0200
--- src/option.h	2020-06-04 17:38:21.854286486 +0200
***************
*** 141,152 ****
  #define FO_ONE_LETTER	'1'
  #define FO_WHITE_PAR	'w'	// trailing white space continues paragr.
  #define FO_AUTO		'a'	// automatic formatting
  #define FO_REMOVE_COMS	'j'	// remove comment leaders when joining lines
  #define FO_PERIOD_ABBR	'p'	// don't break a single space after a period
  
  #define DFLT_FO_VI	"vt"
  #define DFLT_FO_VIM	"tcq"
! #define FO_ALL		"tcroq2vlb1mMBn,awjp"	// for do_set()
  
  // characters for the p_cpo option:
  #define CPO_ALTREAD	'a'	// ":read" sets alternate file name
--- 141,153 ----
  #define FO_ONE_LETTER	'1'
  #define FO_WHITE_PAR	'w'	// trailing white space continues paragr.
  #define FO_AUTO		'a'	// automatic formatting
+ #define FO_RIGOROUS_TW	']'     // respect textwidth rigorously
  #define FO_REMOVE_COMS	'j'	// remove comment leaders when joining lines
  #define FO_PERIOD_ABBR	'p'	// don't break a single space after a period
  
  #define DFLT_FO_VI	"vt"
  #define DFLT_FO_VIM	"tcq"
! #define FO_ALL		"tcroq2vlb1mMBn,aw]jp"	// for do_set()
  
  // characters for the p_cpo option:
  #define CPO_ALTREAD	'a'	// ":read" sets alternate file name
*** ../vim-8.2.0900/src/proto/mbyte.pro	2020-06-01 14:34:22.027462262 +0200
--- src/proto/mbyte.pro	2020-06-04 18:00:03.751806728 +0200
***************
*** 52,57 ****
--- 52,60 ----
  int latin_head_off(char_u *base, char_u *p);
  int dbcs_screen_head_off(char_u *base, char_u *p);
  int utf_head_off(char_u *base, char_u *p);
+ int utf_eat_space(int cc);
+ int utf_allow_break_before(int cc);
+ int utf_allow_break(int cc, int ncc);
  void mb_copy_char(char_u **fp, char_u **tp);
  int mb_off_next(char_u *base, char_u *p);
  int mb_tail_off(char_u *base, char_u *p);
*** ../vim-8.2.0900/src/testdir/Make_all.mak	2020-06-04 15:52:06.095922759 +0200
--- src/testdir/Make_all.mak	2020-06-04 17:38:21.854286486 +0200
***************
*** 85,90 ****
--- 85,91 ----
  	test_charsearch_utf8 \
  	test_checkpath \
  	test_cindent \
+ 	test_cjk_linebreak \
  	test_clientserver \
  	test_close_count \
  	test_cmdline \
***************
*** 333,338 ****
--- 334,340 ----
  	test_charsearch.res \
  	test_checkpath.res \
  	test_cindent.res \
+ 	test_cjk_linebreak.res \
  	test_clientserver.res \
  	test_close_count.res \
  	test_cmdline.res \
*** ../vim-8.2.0900/src/textformat.c	2020-05-01 14:26:17.132949262 +0200
--- src/textformat.c	2020-06-04 18:16:11.963699002 +0200
***************
*** 45,54 ****
--- 45,56 ----
      int		c) // character to be inserted (can be NUL)
  {
      int		cc;
+     int		skip_pos;
      int		save_char = NUL;
      int		haveto_redraw = FALSE;
      int		fo_ins_blank = has_format_option(FO_INS_BLANK);
      int		fo_multibyte = has_format_option(FO_MBYTE_BREAK);
+     int		fo_rigor_tw  = has_format_option(FO_RIGOROUS_TW);
      int		fo_white_par = has_format_option(FO_WHITE_PAR);
      int		first_line = TRUE;
      colnr_T	leader_len;
***************
*** 125,130 ****
--- 127,133 ----
  
  	curwin->w_cursor.col = startcol;
  	foundcol = 0;
+ 	skip_pos = 0;
  
  	// Find position to break at.
  	// Stop at first entered white when 'formatoptions' has 'v'
***************
*** 189,196 ****
  		if (curwin->w_cursor.col <= (colnr_T)wantcol)
  		    break;
  	    }
! 	    else if (cc >= 0x100 && fo_multibyte)
  	    {
  		// Break after or before a multi-byte character.
  		if (curwin->w_cursor.col != startcol)
  		{
--- 192,202 ----
  		if (curwin->w_cursor.col <= (colnr_T)wantcol)
  		    break;
  	    }
! 	    else if ((cc >= 0x100 || !utf_allow_break_before(cc)) && fo_multibyte)
  	    {
+ 		int ncc;
+ 		int allow_break;
+ 
  		// Break after or before a multi-byte character.
  		if (curwin->w_cursor.col != startcol)
  		{
***************
*** 199,206 ****
  			break;
  		    col = curwin->w_cursor.col;
  		    inc_cursor();
! 		    // Don't change end_foundcol if already set.
! 		    if (foundcol != curwin->w_cursor.col)
  		    {
  			foundcol = curwin->w_cursor.col;
  			end_foundcol = foundcol;
--- 205,218 ----
  			break;
  		    col = curwin->w_cursor.col;
  		    inc_cursor();
! 		    ncc = gchar_cursor();
! 
! 		    allow_break =
! 			(enc_utf8 && utf_allow_break(cc, ncc))
! 			|| enc_dbcs;
! 
! 		    // If we have already checked this position, skip!
! 		    if (curwin->w_cursor.col != skip_pos && allow_break)
  		    {
  			foundcol = curwin->w_cursor.col;
  			end_foundcol = foundcol;
***************
*** 213,218 ****
--- 225,231 ----
  		if (curwin->w_cursor.col == 0)
  		    break;
  
+ 		ncc = cc;
  		col = curwin->w_cursor.col;
  
  		dec_cursor();
***************
*** 220,235 ****
  
  		if (WHITECHAR(cc))
  		    continue;		// break with space
! 		// Don't break until after the comment leader
  		if (curwin->w_cursor.col < leader_len)
  		    break;
  
  		curwin->w_cursor.col = col;
  
! 		foundcol = curwin->w_cursor.col;
! 		end_foundcol = foundcol;
  		if (curwin->w_cursor.col <= (colnr_T)wantcol)
! 		    break;
  	    }
  	    if (curwin->w_cursor.col == 0)
  		break;
--- 233,297 ----
  
  		if (WHITECHAR(cc))
  		    continue;		// break with space
! 		// Don't break until after the comment leader.
  		if (curwin->w_cursor.col < leader_len)
  		    break;
  
  		curwin->w_cursor.col = col;
+ 		skip_pos = curwin->w_cursor.col;
  
! 		allow_break =
! 		    (enc_utf8 && utf_allow_break(cc, ncc))
! 		    || enc_dbcs;
! 
! 		// Must handle this to respect line break prohibition.
! 		if (allow_break)
! 		{
! 		    foundcol = curwin->w_cursor.col;
! 		    end_foundcol = foundcol;
! 		}
  		if (curwin->w_cursor.col <= (colnr_T)wantcol)
! 		{
! 		    int ncc_allow_break =
! 			 (enc_utf8 && utf_allow_break_before(ncc)) || enc_dbcs;
! 
! 		    if (allow_break)
! 			break;
! 		    if (!ncc_allow_break && !fo_rigor_tw)
! 		    {
+ 			// Allow at most one punct to hang outside of textwidth.
! 			if (curwin->w_cursor.col == startcol)
! 			{
! 			    // We are inserting a non-breakable char, postpone
! 			    // line break check to next insert.
! 			    end_foundcol = foundcol = 0;
! 			    break;
! 			}
! 
! 			// Neither cc nor ncc is NUL if we are here, so
! 			// it's safe to inc_cursor.
! 			col = curwin->w_cursor.col;
! 
! 			inc_cursor();
! 			cc  = ncc;
! 			ncc = gchar_cursor();
! 			// handle insert
! 			ncc = (ncc != NUL) ? ncc : c;
! 
! 			allow_break =
! 				(enc_utf8 && utf_allow_break(cc, ncc))
! 				|| enc_dbcs;
! 
! 			if (allow_break)
! 			{
! 			    // Break only when we are not at end of line.
! 			    end_foundcol = foundcol =
! 				      ncc == NUL? 0 : curwin->w_cursor.col;
! 			    break;
! 			}
! 			curwin->w_cursor.col = col;
! 		    }
! 		}
  	    }
  	    if (curwin->w_cursor.col == 0)
  		break;
*** ../vim-8.2.0900/src/testdir/test_cjk_linebreak.vim	2020-06-04 18:21:24.394514113 +0200
--- src/testdir/test_cjk_linebreak.vim	2020-06-04 18:10:04.553145403 +0200
***************
*** 0 ****
--- 1,91 ----
+ scriptencoding utf-8
+ 
+ func Run_cjk_linebreak_after()
+   set textwidth=12
+   for punct in [
+         \ '!', '%', ')', ',', ':', ';', '>', '?', ']', '}', '’', '”', '†', '‡',
+         \ '…', '‰', '‱', '‼', '⁇', '⁈', '⁉', '℃', '℉', '、', '。', '〉', '》',
+         \ '」', '』', '】', '〕', '〗', '〙', '〛', '！', '）', '，', '．', '：',
+         \ '；', '？', '］', '｝']
+     call setline('.', '这是一个测试'.punct.'试试 CJK 行禁则补丁。')
+     normal gqq
+     call assert_equal('这是一个测试'.punct, getline(1))
+     %d_
+   endfor
+ endfunc
+ 
+ func Test_cjk_linebreak_after()
+   set formatoptions=croqn2mB1j
+   call Run_cjk_linebreak_after()
+ endfunc
+ 
+ " TODO: this test fails
+ "func Test_cjk_linebreak_after_rigorous()
+ "  set formatoptions=croqn2mB1j]
+ "  call Run_cjk_linebreak_after()
+ "endfunc
+ 
+ func Run_cjk_linebreak_before()
+   set textwidth=12
+   for punct in [
+         \ '(', '<', '[', '`', '{', '‘', '“', '〈', '《', '「', '『', '【', '〔',
+         \ '〖', '〘', '〚', '（', '［', '｛']
+     call setline('.', '这是个测试'.punct.'试试 CJK 行禁则补丁。')
+     normal gqq
+     call assert_equal('这是个测试', getline(1))
+     %d_
+   endfor
+ endfunc
+ 
+ func Test_cjk_linebreak_before()
+   set formatoptions=croqn2mB1j
+   call Run_cjk_linebreak_before()
+ endfunc
+ 
+ func Test_cjk_linebreak_before_rigorous()
+   set formatoptions=croqn2mB1j]
+   call Run_cjk_linebreak_before()
+ endfunc
+ 
+ func Run_cjk_linebreak_nobetween()
+   " …… must not start a line
+   call setline('.', '这是个测试……试试 CJK 行禁则补丁。')
+   set textwidth=12 ambiwidth=double
+   normal gqq
+   " TODO: this fails
+   " call assert_equal('这是个测试……', getline(1))
+   %d_
+ 
+   call setline('.', '这是一个测试……试试 CJK 行禁则补丁。')
+   set textwidth=12 ambiwidth=double
+   normal gqq
+   call assert_equal('这是一个测', getline(1))
+   %d_
+ 
+   " but —— can
+   call setline('.', '这是个测试——试试 CJK 行禁则补丁。')
+   set textwidth=12 ambiwidth=double
+   normal gqq
+   call assert_equal('这是个测试', getline(1))
+ endfunc
+ 
+ func Test_cjk_linebreak_nobetween()
+   set formatoptions=croqn2mB1j
+   call Run_cjk_linebreak_nobetween()
+ endfunc
+ 
+ func Test_cjk_linebreak_nobetween_rigorous()
+   set formatoptions=croqn2mB1j]
+   call Run_cjk_linebreak_nobetween()
+ endfunc
+ 
+ func Test_cjk_linebreak_join_punct()
+   for punct in ['——', '〗', '，', '。', '……']
+     call setline(1, '文本文本'.punct)
+     call setline(2, 'English')
+     set formatoptions=croqn2mB1j
+     normal ggJ
+     call assert_equal('文本文本'.punct.'English', getline(1))
+     %d_
+   endfor
+ endfunc
*** ../vim-8.2.0900/src/version.c	2020-06-04 17:19:01.581522349 +0200
--- src/version.c	2020-06-04 17:39:49.046032743 +0200
***************
*** 748,749 ****
--- 748,751 ----
  {   /* Add new patch number below this line */
+ /**/
+     901,
  /**/

-- 
MAN:     You don't frighten us, English pig-dog!  Go and boil your bottoms,
         son of a silly person.  I blow my nose on you, so-called Arthur-king,
         you and your silly English K...kaniggets.
   He puts hands to his ears and blows a raspberry.
                 "Monty Python and the Holy Grail" PYTHON (MONTY) PICTURES LTD

 /// Bram Moolenaar -- Bram@Moolenaar.net -- http://www.Moolenaar.net   \\\
///        sponsor Vim, vote for features -- http://www.Vim.org/sponsor/ \\\
\\\  an exciting new programming language -- http://www.Zimbu.org        ///
 \\\            help me help AIDS victims -- http://ICCF-Holland.org    ///