Detect whether high-bit text is UTF-8 or single-byte Latin (ISO-8859).

mime.c classifies high-bit bytes in headers (test_enc) and in message
bodies (mime_isclean) and records the result as MIME_UTF8 or MIME_LATIN;
sendout.c uses these flags to pick the source charset handed to iconv.
Character set names are now compared case-insensitively, and backspace,
carriage return and escape no longer count as control characters in
mime_isclean.

---
 def.h     |    4 +
 mime.c    |  160 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
 sendout.c |   40 +++++++++++++----
 3 files changed, 188 insertions(+), 16 deletions(-)

--- def.h
+++ def.h	2016-04-08 14:58:42.729798789 +0000
@@ -142,7 +142,9 @@ enum mimeclean {
 	MIME_LONGLINES	= 002,		/* has lines too long for RFC 2822 */
 	MIME_CTRLCHAR	= 004,		/* contains control characters */
 	MIME_HASNUL	= 010,		/* contains \0 characters */
-	MIME_NOTERMNL	= 020		/* lacks a terminating newline */
+	MIME_NOTERMNL	= 020,		/* lacks a terminating newline */
+	MIME_UTF8	= 040,		/* UTF-8 high bit multi byte characters */
+	MIME_LATIN	= 0100		/* Latin high bit single byte characters */
 };
 
 enum tdflags {
--- mime.c
+++ mime.c	2016-04-08 15:00:05.808259514 +0000
@@ -302,13 +302,89 @@ gettcharset(void)
 	return t;
 }
 
+#define F 0   /* character never appears in mail text */
+#define T 1   /* character appears in plain ASCII text */
+#define I 2   /* character appears in ISO-8859 text */
+#define X 3   /* character appears in non-ISO extended ASCII (Mac, IBM PC) */
+
+static const char text_chars[256] = {
+	/* NUL              BEL BS HT LF    FF CR    */
+	F, F, F, F, F, F, F, F, T, T, T, F, T, T, F, F,  /* 0x0X */
+	/*                               ESC         */
+	F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,  /* 0x1X */
+	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x2X */
+	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x3X */
+	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x4X */
+	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x5X */
+	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x6X */
+	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,  /* 0x7X */
+	/*             NEL                           */
+	X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,  /* 0x8X */
+	X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,  /* 0x9X */
+	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xaX */
+	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xbX */
+	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xcX */
+	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xdX */
+	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xeX */
+	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I   /* 0xfX */
+};
+
+/*
+ * Encoding hint (MIME_UTF8 or MIME_LATIN) left behind by test_enc();
+ * callers clear it before scanning a header.
+ */
+static int encflags;
+
+/*
+ * Classify the high-bit byte at s.  Sets encflags to MIME_UTF8 if it is a
+ * UTF-8 lead byte followed by the expected number of continuation bytes,
+ * or to MIME_LATIN if the byte is in the ISO-8859 range of text_chars[].
+ * In all other cases encflags is left unchanged.
+ */
+static void
+test_enc(const char *s)
+{
+	int c = *s;
+	if (c & 0100) {
+		int n, follow;
+
+		if ((c & 040) == 0)		/* 110xxxxx */
+			follow = 1;
+		else if ((c & 020) == 0)	/* 1110xxxx */
+			follow = 2;
+		else if ((c & 010) == 0)	/* 11110xxx */
+			follow = 3;
+		else if ((c & 004) == 0)	/* 111110xx */
+			follow = 4;
+		else if ((c & 002) == 0)	/* 1111110x */
+			follow = 5;
+		else				/* 1111111x: not UTF-8 */
+			goto latin;
+
+		for (n = 1; n <= follow; n++) {
+			if ((c = *(s+n)) == '\0')
+				goto latin;
+			if ((c & 0200) == 0 || (c & 0100))
+				goto latin;
+		}
+		encflags = MIME_UTF8;
+		return;
+	}
+latin:
+	c = *s;
+	if (text_chars[c & 0377] == I)
+		encflags = MIME_LATIN;
+}
+
 static int
 has_highbit(const char *s)
 {
 	if (s) {
 		do
-			if (*s & 0200)
+			if (*s & 0200) {
+				test_enc(s);
 				return 1;
+			}
 		while (*s++ != '\0');
 	}
 	return 0;
@@ -328,6 +404,7 @@ name_highbit(struct name *np)
 char *
 need_hdrconv(struct header *hp, enum gfield w)
 {
+	encflags = 0;
 	if (w & GIDENT) {
 		if (hp->h_from && name_highbit(hp->h_from))
 			goto needs;
@@ -355,7 +432,7 @@ need_hdrconv(struct header *hp, enum gfi
 	if (w & GSUBJECT && has_highbit(hp->h_subject))
 		goto needs;
 	return NULL;
-needs:	return getcharset(MIME_HIGHBIT);
+needs:	return getcharset(MIME_HIGHBIT|encflags);
 }
 
 #ifdef	HAVE_ICONV
@@ -441,7 +518,7 @@ iconv_open_ft(const char *tocode, const
 		 * be used to check the validity of the input even with
 		 * identical encoding names.
 		 */
-		if (strcmp(t, f) == 0)
+		if (asccasecmp(t, f) == 0)
 			errno = 0;
 		return (iconv_t)-1;
 	}
@@ -665,7 +742,7 @@ mime_tline(char *x, char *l)
 			l++;
 		if (*l != '\0')
 			*l++ = '\0';
-		if (strcmp(x, n) == 0) {
+		if (asccasecmp(x, n) == 0) {
 			match = 1;
 			break;
 		}
@@ -748,14 +825,67 @@ mime_isclean(FILE *f)
 				maxlen = curlen;
 			curlen = 1;
 		} else if (c & 0200) {
+			int i = c;
 			isclean |= MIME_HIGHBIT;
+			if (c & 0100) {
+				int n, follow;
+
+				if ((c & 040) == 0)		/* 110xxxxx */
+					follow = 1;
+				else if ((c & 020) == 0)	/* 1110xxxx */
+					follow = 2;
+				else if ((c & 010) == 0)	/* 11110xxx */
+					follow = 3;
+				else if ((c & 004) == 0)	/* 111110xx */
+					follow = 4;
+				else if ((c & 002) == 0)	/* 1111110x */
+					follow = 5;
+				else				/* 1111111x: not UTF-8 */
+					goto latin;
+
+				for (n = 0; n < follow; n++) {
+					lastc = c;
+					c = getc(f);
+					curlen++;
+
+					if (c == '\0') {
+						isclean |= MIME_HASNUL;
+						goto latin;
+					}
+					/*
+					 * Check for newline and EOF before
+					 * the bit test, which they would
+					 * also satisfy.
+					 */
+					if ((c == '\n') || (c == EOF)) {
+						if (curlen > maxlen)
+							maxlen = curlen;
+						curlen = 1;
+						goto latin;
+					}
+					if ((c & 0200) == 0 || (c & 0100))
+						goto latin;
+				}
+				isclean |= MIME_UTF8;
+				continue;
+			}
+		    latin:
+			if (text_chars[i & 0377] == I)
+				isclean |= MIME_LATIN;
+			if (text_chars[i & 0377] == X) {
+				isclean |= MIME_CTRLCHAR;
+				break;
+			}
 		} else if (c == '\0') {
 			isclean |= MIME_HASNUL;
 			break;
-		} else if ((c < 040 && (c != '\t' && c != '\f')) || c == 0177) {
+		} else if (text_chars[c & 0377] == F) {
 			isclean |= MIME_CTRLCHAR;
+			break;
 		}
 	} while (c != EOF);
+	if (isclean & (MIME_CTRLCHAR|MIME_HASNUL))
+		isclean &= (MIME_CTRLCHAR|MIME_HASNUL);
 	if (lastc != '\n')
 		isclean |= MIME_NOTERMNL;
 	clearerr(f);
@@ -826,11 +956,16 @@ get_mime_convert(FILE *fp, char **conten
 			 * ^I or ^L in text/plain bodies. However, some
 			 * obscure character sets actually contain these
 			 * characters, so the content type can be set.
+			 * Besides ^I and ^L from RFC 2046, we also accept
+			 * backspace (^H), which is often used in enhanced text.
 			 */
 			if ((*contenttype = value("contenttype-cntrl")) == NULL)
 				*contenttype = "application/octet-stream";
 		} else if (*contenttype == NULL)
 			*contenttype = "text/plain";
+	} else if (ascncasecmp(*contenttype, "text/", 5) == 0)
+	{
+		*charset = getcharset(*isclean);
 	}
 	return convert;
 }
@@ -989,8 +1124,14 @@ mime_fromhdr(struct str *in, struct str
 #ifdef	HAVE_ICONV
 	iconv_t	fhicd = (iconv_t)-1;
 #endif
+	enum mimeclean isclean = 0;
 
 	tcs = gettcharset();
+
+	encflags = 0;
+	if (has_highbit(in->s))
+		isclean |= (MIME_HIGHBIT|encflags);
+
 	maxstor = in->l;
 	out->s = smalloc(maxstor + 1);
 	out->l = 0;
@@ -1010,7 +1151,7 @@ mime_fromhdr(struct str *in, struct str
 #ifdef	HAVE_ICONV
 				if (fhicd != (iconv_t)-1)
 					iconv_close(fhicd);
-				if (strcmp(cs, tcs))
+				if (asccasecmp(cs, tcs))
 					fhicd = iconv_open_ft(tcs, cs);
 				else
 					fhicd = (iconv_t)-1;
@@ -1105,12 +1246,17 @@ notmime:
 	}
 fromhdr_end:
 	*q = '\0';
-	if (flags & TD_ISPR) {
+	if ((flags & TD_ISPR) && (isclean & MIME_HIGHBIT)) {
 		struct str	new;
+		if ((isclean & MIME_UTF8) && asccasecmp("utf-8", tcs) == 0)
+			goto skip;
+		if ((isclean & MIME_LATIN) && ascncasecmp("iso-8859-", tcs, 9) == 0)
+			goto skip;
 		makeprint(out, &new);
 		free(out->s);
 		*out = new;
 	}
+skip:
 	if (flags & TD_DELCTRL)
 		out->l = delctrl(out->s, out->l);
 #ifdef	HAVE_ICONV
--- sendout.c
+++ sendout.c	2016-04-08 14:58:42.729798789 +0000
@@ -226,6 +226,23 @@ attach_file1(struct attachment *ap, FILE
 	charset = ap->a_charset;
 	convert = get_mime_convert(fi, &contenttype, &charset, &isclean,
 			dosign);
+#ifdef	HAVE_ICONV
+	tcs = gettcharset();
+	if (isclean & MIME_UTF8)
+	{
+		tcs = "utf-8";
+	}
+	if (isclean & MIME_LATIN) {
+		tcs = value("charset");
+		if (tcs == NULL && wantcharset && wantcharset != (char *)-1)
+			tcs = wantcharset;
+	}
+	if (tcs == NULL) {
+		contenttype = "application/octet-stream";
+		charset = NULL;
+		convert = CONV_TOB64;
+	}
+#endif
 	fprintf(fo,
 		"\n--%s\n"
 		"Content-Type: %s",
@@ -255,11 +272,10 @@ attach_file1(struct attachment *ap, FILE
 		iconv_close(iconvd);
 		iconvd = (iconv_t)-1;
 	}
-	tcs = gettcharset();
 	if ((isclean & (MIME_HASNUL|MIME_CTRLCHAR)) == 0 &&
 			ascncasecmp(contenttype, "text/", 5) == 0 &&
-			isclean & MIME_HIGHBIT &&
-			charset != NULL) {
+			(isclean & MIME_HIGHBIT) &&
+			charset != NULL && tcs != NULL) {
 		if ((iconvd = iconv_open_ft(charset, tcs)) == (iconv_t)-1 &&
 				errno != 0) {
 			if (errno == EINVAL)
@@ -467,11 +483,12 @@ infix(struct header *hp, FILE *fi, int d
 	}
 	rm(tempMail);
 	Ftfree(&tempMail);
-	convert = get_mime_convert(fi, &contenttype, &charset,
-			&isclean, dosign);
+	convert = get_mime_convert(fi, &contenttype, &charset, &isclean,
+			dosign);
 #ifdef	HAVE_ICONV
 	tcs = gettcharset();
-	if ((convhdr = need_hdrconv(hp, GTO|GSUBJECT|GCC|GBCC|GIDENT)) != 0) {
+	if ((convhdr = need_hdrconv(hp, GTO|GSUBJECT|GCC|GBCC|GIDENT)) != 0 &&
+	    tcs != NULL) {
 		if (iconvd != (iconv_t)-1)
 			iconv_close(iconvd);
 		if ((iconvd = iconv_open_ft(convhdr, tcs)) == (iconv_t)-1
@@ -505,10 +522,17 @@ infix(struct header *hp, FILE *fi, int d
 		iconv_close(iconvd);
 		iconvd = (iconv_t)-1;
 	}
+	if (isclean & MIME_UTF8)
+		tcs = "utf-8";
+	if (isclean & MIME_LATIN) {
+		tcs = value("charset");
+		if (tcs == NULL && wantcharset && wantcharset != (char *)-1)
+			tcs = wantcharset;
+	}
 	if ((isclean & (MIME_HASNUL|MIME_CTRLCHAR)) == 0 &&
 			ascncasecmp(contenttype, "text/", 5) == 0 &&
-			isclean & MIME_HIGHBIT &&
-			charset != NULL) {
+			(isclean & MIME_HIGHBIT) &&
+			charset != NULL && tcs != NULL) {
 		if (iconvd != (iconv_t)-1)
 			iconv_close(iconvd);
 		if ((iconvd = iconv_open_ft(charset, tcs)) == (iconv_t)-1