--- diffutils-2.8.7-cvs/configure.ac +++ diffutils-2.8.7-cvs/configure.ac @@ -57,6 +57,7 @@ if test $ac_cv_func_sigprocmask = no; th fi AC_FUNC_CLOSEDIR_VOID AC_FUNC_FORK +AC_FUNC_MBRTOWC AM_GNU_GETTEXT([external], [need-formatstring-macros]) AM_GNU_GETTEXT_VERSION([0.15]) --- diffutils-2.8.7-cvs/src/diff.c +++ diffutils-2.8.7-cvs/src/diff.c @@ -276,6 +276,13 @@ main (int argc, char **argv) re_set_syntax (RE_SYNTAX_GREP | RE_NO_POSIX_BACKTRACKING); excluded = new_exclude (); +#ifdef HANDLE_MULTIBYTE + if (MB_CUR_MAX > 1) + lines_differ = lines_differ_multibyte; + else +#endif + lines_differ = lines_differ_singlebyte; + /* Decode the options. */ while ((c = getopt_long (argc, argv, shortopts, longopts, NULL)) != -1) --- diffutils-2.8.7-cvs/src/diff.h +++ diffutils-2.8.7-cvs/src/diff.h @@ -23,6 +23,17 @@ #include #include +/* For platforms that support the ISO C Amendment 1 functionality we + support user-defined character classes. */ +#if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H +/* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */ +# include +# include +# if defined (HAVE_MBRTOWC) +# define HANDLE_MULTIBYTE 1 +# endif +#endif + /* What kind of changes a hunk contains. */ enum changes { @@ -353,7 +364,13 @@ extern char const change_letter[4]; extern char const pr_program[]; char *concat (char const *, char const *, char const *); char *dir_file_pathname (char const *, char const *); -bool lines_differ (char const *, char const *); + +bool (*lines_differ) (char const *, char const *); +bool lines_differ_singlebyte (char const *, char const *); +#ifdef HANDLE_MULTIBYTE +bool lines_differ_multibyte (char const *, char const *); +#endif + lin translate_line_number (struct file_data const *, lin); struct change *find_change (struct change *); struct change *find_reverse_change (struct change *); --- diffutils-2.8.7-cvs/src/io.c +++ diffutils-2.8.7-cvs/src/io.c @@ -22,6 +22,7 @@ #include #include #include +#include /* Rotate an unsigned value to the left. 
*/ #define ROL(v, n) ((v) << (n) | (v) >> (sizeof (v) * CHAR_BIT - (n))) @@ -194,6 +195,28 @@ slurp (struct file_data *current) /* Split the file into lines, simultaneously computing the equivalence class for each line. */ +#ifdef HANDLE_MULTIBYTE +# define MBC2WC(P, END, MBLENGTH, WC, STATE, CONVFAIL) \ +do \ +{ \ + mbstate_t state_bak = STATE; \ + \ + CONVFAIL = 0; \ + MBLENGTH = mbrtowc (&WC, P, END - (char const *) P, &STATE); \ + \ + switch (MBLENGTH) \ + { \ + case (size_t) -2: \ + case (size_t) -1: \ + STATE = state_bak; \ + ++CONVFAIL; \ + /* Fall through. */ \ + case 0: \ + MBLENGTH = 1; \ + } \ +} \ +while (0) +#endif static void find_and_hash_each_line (struct file_data *current) @@ -220,11 +243,294 @@ find_and_hash_each_line (struct file_dat bool same_length_diff_contents_compare_anyway = diff_length_compare_anyway | ignore_case; +#ifdef HANDLE_MULTIBYTE + wchar_t wc; + size_t mblength; + mbstate_t state; + int convfail; + + memset (&state, '\0', sizeof (mbstate_t)); +#endif + while (p < suffix_begin) { char const *ip = p; h = 0; +#ifdef HANDLE_MULTIBYTE + if (MB_CUR_MAX > 1) + { + wchar_t lo_wc; + char mbc[MB_LEN_MAX]; + mbstate_t state_wc; + + /* Hash this line until we find a newline. */ + switch (ignore_white_space) + { + case IGNORE_ALL_SPACE: + while (1) + { + if (*p == '\n') + { + ++p; + break; + } + + MBC2WC (p, suffix_begin, mblength, wc, state, convfail); + + if (convfail) + mbc[0] = *p++; + else if (!iswspace (wc)) + { + bool flag = 0; + + if (ignore_case) + { + lo_wc = towlower (wc); + if (lo_wc != wc) + { + flag = 1; + + p += mblength; + memset (&state_wc, '\0', sizeof (mbstate_t)); + mblength = wcrtomb (mbc, lo_wc, &state_wc); + + assert (mblength != (size_t) -1 && + mblength != (size_t) -2); + + mblength = mblength < 1 ? 
1 : mblength; + } + } + + if (!flag) + { + for (i = 0; i < mblength; i++) + mbc[i] = *p++; + } + } + else + { + p += mblength; + continue; + } + + for (i = 0; i < mblength; i++) + { + c = mbc[i]; + h = HASH (h, c); + } + } + break; + + case IGNORE_SPACE_CHANGE: + while (1) + { + if (*p == '\n') + { + ++p; + break; + } + + MBC2WC (p, suffix_begin, mblength, wc, state, convfail); + + if (!convfail && iswspace (wc)) + { + while (1) + { + if (*p == '\n') + { + ++p; + goto hashing_done; + } + + p += mblength; + MBC2WC (p, suffix_begin, mblength, wc, state, convfail); + if (convfail || (!convfail && !iswspace (wc))) + break; + } + h = HASH (h, ' '); + } + + /* WC is now the first non-space. */ + if (convfail) + mbc[0] = *p++; + else + { + bool flag = 0; + + if (ignore_case) + { + lo_wc = towlower (wc); + if (lo_wc != wc) + { + flag = 1; + + p += mblength; + memset (&state_wc, '\0', sizeof (mbstate_t)); + mblength = wcrtomb (mbc, lo_wc, &state_wc); + + assert (mblength != (size_t) -1 && + mblength != (size_t) -2); + + mblength = mblength < 1 ? 1 : mblength; + } + } + + if (!flag) + { + for (i = 0; i < mblength; i++) + mbc[i] = *p++; + } + } + + for (i = 0; i < mblength; i++) + { + c = mbc[i]; + h = HASH (h, c); + } + } + break; + + case IGNORE_TAB_EXPANSION: + { + size_t column = 0; + + while (1) + { + if (*p == '\n') + { + ++p; + break; + } + + MBC2WC (p, suffix_begin, mblength, wc, state, convfail); + + if (convfail) + { + c = *p++; + h = HASH (h, c); + ++column; + } + else + { + bool flag; + + switch (wc) + { + case L'\b': + column -= 0 < column; + h = HASH (h, '\b'); + ++p; + break; + + case L'\t': + { + size_t repetitions; + repetitions = tabsize - column % tabsize; + column = (column + repetitions < column + ? 
0 + : column + repetitions); + do + h = HASH (h, ' '); + while (--repetitions != 0); + ++p; + } + break; + + case L'\r': + column = 0; + h = HASH (h, '\r'); + ++p; + break; + + default: + flag = 0; + column += wcwidth (wc); + if (ignore_case) + { + lo_wc = towlower (wc); + if (lo_wc != wc) + { + flag = 1; + p += mblength; + memset (&state_wc, '\0', sizeof (mbstate_t)); + mblength = wcrtomb (mbc, lo_wc, &state_wc); + + assert (mblength != (size_t) -1 && + mblength != (size_t) -2); + + mblength = mblength < 1 ? 1 : mblength; + } + } + + if (!flag) + { + for (i = 0; i < mblength; i++) + mbc[i] = *p++; + } + + for (i = 0; i < mblength; i++) + { + c = mbc[i]; + h = HASH (h, c); + } + } + } + } + } + break; + + default: + while (1) + { + if (*p == '\n') + { + ++p; + break; + } + + MBC2WC (p, suffix_begin, mblength, wc, state, convfail); + + if (convfail) + mbc[0] = *p++; + else + { + int flag = 0; + + if (ignore_case) + { + lo_wc = towlower (wc); + if (lo_wc != wc) + { + flag = 1; + p += mblength; + memset (&state_wc, '\0', sizeof (mbstate_t)); + mblength = wcrtomb (mbc, lo_wc, &state_wc); + + assert (mblength != (size_t) -1 && + mblength != (size_t) -2); + + mblength = mblength < 1 ? 1 : mblength; + } + } + + if (!flag) + { + for (i = 0; i < mblength; i++) + mbc[i] = *p++; + } + } + + for (i = 0; i < mblength; i++) + { + c = mbc[i]; + h = HASH (h, c); + } + } + } + goto hashing_done; + } +#endif /* Hash this line until we find a newline. 
*/ if (ignore_case) --- diffutils-2.8.7-cvs/src/side.c +++ diffutils-2.8.7-cvs/src/side.c @@ -73,11 +73,72 @@ print_half_line (char const *const *line register size_t out_position = 0; register char const *text_pointer = line[0]; register char const *text_limit = line[1]; +#if defined HAVE_WCHAR_H && defined HAVE_WCTYPE_H + unsigned char mbc[MB_LEN_MAX]; + wchar_t wc; + mbstate_t state, state_bak; + size_t mbc_pos, mblength; + int mbc_loading_flag = 0; + int wc_width; + + memset (&state, '\0', sizeof (mbstate_t)); +#endif while (text_pointer < text_limit) { register unsigned char c = *text_pointer++; +#if defined HAVE_WCHAR_H && defined HAVE_WCTYPE_H + if (MB_CUR_MAX > 1 && mbc_loading_flag) + { + mbc_loading_flag = 0; + state_bak = state; + mbc[mbc_pos++] = c; + +process_mbc: + mblength = mbrtowc (&wc, mbc, mbc_pos, &state); + + switch (mblength) + { + case (size_t)-2: /* Incomplete multibyte character. */ + mbc_loading_flag = 1; + state = state_bak; + break; + + case (size_t)-1: /* Invalid as a multibyte character. */ + if (in_position++ < out_bound) + { + out_position = in_position; + putc (mbc[0], out); + } + memmove (mbc, mbc + 1, --mbc_pos); + if (mbc_pos > 0) + { + mbc[mbc_pos] = '\0'; + goto process_mbc; + } + break; + + default: + wc_width = wcwidth (wc); + if (wc_width < 1) /* Unprintable multibyte character. */ + { + if (in_position <= out_bound) + fprintf (out, "%lc", (wint_t)wc); + } + else /* Printable multibyte character. */ + { + in_position += wc_width; + if (in_position <= out_bound) + { + out_position = in_position; + fprintf (out, "%lc", (wint_t)wc); + } + } + } + continue; + } +#endif switch (c) { case '\t': @@ -135,8 +196,39 @@ print_half_line (char const *const *line break; default: - if (! 
isprint (c)) - goto control_char; +#if defined HAVE_WCHAR_H && defined HAVE_WCTYPE_H + if (MB_CUR_MAX > 1) + { + memset (mbc, '\0', MB_LEN_MAX); + mbc_pos = 0; + mbc[mbc_pos++] = c; + state_bak = state; + + mblength = mbrtowc (&wc, mbc, mbc_pos, &state); + + /* The value of mblength is always less than 2 here. */ + switch (mblength) + { + case (size_t)-2: /* Incomplete multibyte character. */ + state = state_bak; + mbc_loading_flag = 1; + continue; + + case (size_t)-1: /* Invalid as a multibyte character. */ + state = state_bak; + break; + + default: + if (! iswprint (wc)) + goto control_char; + } + } + else +#endif + { + if (! isprint (c)) + goto control_char; + } /* falls through */ case ' ': if (in_position++ < out_bound) --- diffutils-2.8.7-cvs/src/util.c +++ diffutils-2.8.7-cvs/src/util.c @@ -317,7 +317,7 @@ finish_output (void) Return nonzero if the lines differ. */ bool -lines_differ (char const *s1, char const *s2) +lines_differ_singlebyte (char const *s1, char const *s2) { register char const *t1 = s1; register char const *t2 = s2; @@ -446,6 +446,293 @@ lines_differ (char const *s1, char const return true; } + +#ifdef HANDLE_MULTIBYTE +# define MBC2WC(T, END, MBLENGTH, WC, STATE, CONVFAIL) \ +do \ +{ \ + mbstate_t bak = STATE; \ + \ + CONVFAIL = 0; \ + MBLENGTH = mbrtowc (&WC, T, END - T, &STATE); \ + \ + switch (MBLENGTH) \ + { \ + case (size_t)-2: \ + case (size_t)-1: \ + STATE = bak; \ + ++CONVFAIL; \ + /* Fall through. 
*/ \ + case 0: \ + MBLENGTH = 1; \ + } \ +} \ +while (0) + +bool +lines_differ_multibyte (char const *s1, char const *s2) +{ + unsigned char const *end1, *end2; + unsigned char c1, c2; + wchar_t wc1, wc2, wc1_bak, wc2_bak; + size_t mblen1, mblen2; + mbstate_t state1, state2, state1_bak, state2_bak; + int convfail1, convfail2, convfail1_bak, convfail2_bak; + + unsigned char const *t1 = (unsigned char const *) s1; + unsigned char const *t2 = (unsigned char const *) s2; + unsigned char const *t1_bak, *t2_bak; + size_t column = 0; + + if (ignore_white_space == IGNORE_NO_WHITE_SPACE && !ignore_case) + { + while (*t1 != '\n') + if (*t1++ != * t2++) + return 1; + return 0; + } + + memset (&state1, '\0', sizeof (mbstate_t)); + memset (&state2, '\0', sizeof (mbstate_t)); + + end1 = s1 + strlen (s1); + end2 = s2 + strlen (s2); + + while (1) + { + c1 = *t1; + c2 = *t2; + MBC2WC (t1, end1, mblen1, wc1, state1, convfail1); + MBC2WC (t2, end2, mblen2, wc2, state2, convfail2); + + /* Test for exact char equality first, since it's a common case. */ + if (convfail1 ^ convfail2) + break; + else if (convfail1 && convfail2 && c1 != c2) + break; + else if (!convfail1 && !convfail2 && wc1 != wc2) + { + switch (ignore_white_space) + { + case IGNORE_ALL_SPACE: + /* For -w, just skip past any white space. */ + while (1) + { + if (convfail1) + break; + else if (wc1 == L'\n' || !iswspace (wc1)) + break; + + t1 += mblen1; + c1 = *t1; + MBC2WC (t1, end1, mblen1, wc1, state1, convfail1); + } + + while (1) + { + if (convfail2) + break; + else if (wc2 == L'\n' || !iswspace (wc2)) + break; + + t2 += mblen2; + c2 = *t2; + MBC2WC (t2, end2, mblen2, wc2, state2, convfail2); + } + t1 += mblen1; + t2 += mblen2; + break; + + case IGNORE_SPACE_CHANGE: + /* For -b, advance past any sequence of white space in + line 1 and consider it just one space, or nothing at + all if it is at the end of the line. 
*/ + if (wc1 != L'\n' && iswspace (wc1)) + { + size_t mblen_bak; + mbstate_t state_bak; + + do + { + t1 += mblen1; + mblen_bak = mblen1; + state_bak = state1; + MBC2WC (t1, end1, mblen1, wc1, state1, convfail1); + } + while (!convfail1 && (wc1 != L'\n' && iswspace (wc1))); + + state1 = state_bak; + mblen1 = mblen_bak; + t1 -= mblen1; + convfail1 = 0; + wc1 = L' '; + } + + /* Likewise for line 2. */ + if (wc2 != L'\n' && iswspace (wc2)) + { + size_t mblen_bak; + mbstate_t state_bak; + + do + { + t2 += mblen2; + mblen_bak = mblen2; + state_bak = state2; + MBC2WC (t2, end2, mblen2, wc2, state2, convfail2); + } + while (!convfail2 && (wc2 != L'\n' && iswspace (wc2))); + + state2 = state_bak; + mblen2 = mblen_bak; + t2 -= mblen2; + convfail2 = 0; + wc2 = L' '; + } + + if (wc1 != wc2) + { + if (wc2 == L' ' && wc1 != L'\n' && + t1 > (unsigned char const *)s1 && + !convfail1_bak && iswspace (wc1_bak)) + { + t1 = t1_bak; + wc1 = wc1_bak; + state1 = state1_bak; + convfail1 = convfail1_bak; + continue; + } + if (wc1 == L' ' && wc2 != L'\n' + && t2 > (unsigned char const *)s2 + && !convfail2_bak && iswspace (wc2_bak)) + { + t2 = t2_bak; + wc2 = wc2_bak; + state2 = state2_bak; + convfail2 = convfail2_bak; + continue; + } + } + + t1_bak = t1; t2_bak = t2; + wc1_bak = wc1; wc2_bak = wc2; + state1_bak = state1; state2_bak = state2; + convfail1_bak = convfail1; convfail2_bak = convfail2; + + if (wc1 == L'\n') + wc1 = L' '; + else + t1 += mblen1; + + if (wc2 == L'\n') + wc2 = L' '; + else + t2 += mblen2; + + break; + + case IGNORE_TAB_EXPANSION: + if ((wc1 == L' ' && wc2 == L'\t') + || (wc1 == L'\t' && wc2 == L' ')) + { + size_t column2 = column; + + while (1) + { + if (convfail1) + { + ++t1; + break; + } + else if (wc1 == L' ') + column++; + else if (wc1 == L'\t') + column += tabsize - column % tabsize; + else + { + t1 += mblen1; + break; + } + + t1 += mblen1; + c1 = *t1; + MBC2WC (t1, end1, mblen1, wc1, state1, convfail1); + } + + while (1) + { + if (convfail2) + { + ++t2; + 
break; + } + else if (wc2 == L' ') + column2++; + else if (wc2 == L'\t') + column2 += tabsize - column2 % tabsize; + else + { + t2 += mblen2; + break; + } + + t2 += mblen2; + c2 = *t2; + MBC2WC (t2, end2, mblen2, wc2, state2, convfail2); + } + + if (column != column2) + return 1; + } + else + { + t1 += mblen1; + t2 += mblen2; + } + break; + + case IGNORE_NO_WHITE_SPACE: + t1 += mblen1; + t2 += mblen2; + break; + } + + /* Lowercase all letters if -i is specified. */ + if (ignore_case) + { + if (!convfail1) + wc1 = towlower (wc1); + if (!convfail2) + wc2 = towlower (wc2); + } + + if (convfail1 ^ convfail2) + break; + else if (convfail1 && convfail2 && c1 != c2) + break; + else if (!convfail1 && !convfail2 && wc1 != wc2) + break; + } + else + { + t1_bak = t1; t2_bak = t2; + wc1_bak = wc1; wc2_bak = wc2; + state1_bak = state1; state2_bak = state2; + convfail1_bak = convfail1; convfail2_bak = convfail2; + + t1 += mblen1; t2 += mblen2; + } + + if (!convfail1 && wc1 == L'\n') + return 0; + + column += convfail1 ? 1 : + (wc1 == L'\t') ? tabsize - column % tabsize : wcwidth (wc1); + } + + return 1; +} +#endif /* Find the consecutive changes at the start of the script START. Return the last link before the first gap. */