hcode/hdcode.c

#ifndef lint
static char rcsid[] = "$Id: hdcode.c,v 1.6 1997/11/19 04:16:52 news Exp news $";
#endif

/*
 * $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
 * CLEAN_QP Copyright Notice
 *
 * Most of the following CLEAN_QP codes are stealed from a package called
 * hmailer-beta2 written by Song Woo-Geel   cookie@venus.etri.re.kr.
 *
 * I am not sure if I can distribute it or not.
 * However, The following is the original copyright statement.
 * ***************************************************************
 *	Copyright (C) 1995  Song Woo-Geel
 *	Written by cookie@venus.etri.re.kr on May. 12 '95.
 * ***************************************************************
 *
 * Now, the person who stealed cookie's code and his Copyright is
 * ***************************************************************
 * Copyright (C) 1997  Sang-yong Suh <sysuh@kigam.re.kr>
 *
 * THIS CODE IS PROVIDED AS IS AND WITHOUT ANY WARRANTY.
 * USE IT AT YOUR OWN RISK AND DON'T COMAPLAIN ME OR TO COOKIE.
 *
 * $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
 */

/*
 * Revision history
 *
 * 1997/05/16  sysuh  Check charset="iso-2022-kr" (note the quotes)
 * 1997/06/14  sysuh  Replace charset=iso-8859-1 to EUC-KR
 * 1997/11/18  sysuh  Decode embedded QP texts within multipart.
 */

/*
 * hMailDecode : decode Korean "Q" or "B" encoded HANGUL news article
 *
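 * SYNOPSIS
 *  1. standalone usage:   compile with -DCLEAN_QP -D_MAIN
 *	hdcode [-n] [file]
 *
 *  2. as subroutine:
 *
 *      hMailDecode(char *NULL, char *NULL);	# initialize
 *	while (fgets(ibuf, sizeof(ibuf), stdin))
 *	    hMailDecode(char *ibuf, char *obuf); # pass one line at a time
 *
 * OPTIONS
 *
 * Decode the article and write the result to stdout.
 *      -n	Check the "Newsgroups:" header first; if it names no han.*
 *		group (han.test* does not count), the article is written
 *		out unchanged.
 *      [file]	If file arg is missing, read stdin.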
 *
 * NOTE:
 *      Header encoding	: 	RFC-1342 	( "Q" or "B" encoding )
 *	Content encoding :	Quoted-printable or ISO-2022-KR or Base64.
 */

#ifndef	CLEAN_QP
/*
 * Stub compiled when CLEAN_QP is not defined.  It takes the same two
 * arguments as the real decoder but its body is empty, so neither buffer
 * is touched.
 */
void
hMailDecode(ibuf, obuf)
char *ibuf;
char *obuf;
{ }
#else	/* CLEAN_QP */
#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <assert.h>

/*
 *	RFC-1342 header encoding start/end sequence.
 *	An encoded word looks like  =?charset?Q?text?=  or  =?charset?B?text?=
 */
#define PREFIX		"=?"
#define	POSTFIX	 	"?="
#define SUFFIXQP	"?Q?"
#define SUFFIXB64	"?B?"
/* prefix form accepted by decodeHeader() for compatibility with old mailers */
#define OLDPREFIX	"=?B?EUC-KR?"
/* header names (including the trailing blank) that hMailDecode() looks for */
#define HEADER_CTE	"Content-Transfer-Encoding: "
#define HEADER_CT	"Content-Type: "
#define	KOR_CHARSET	"EUC-KR"	/* KSC-5601	*/

/*	ISO-2022 encoding designator escape sequence	*/
#define	INTRO_ISO	"\033$)C"
#define	SHIFTOUT	'\016'		/* ASCII SO	*/
#define	SHIFTIN		'\017'		/* ASCII SI	*/
#define	DEL		'\177'		/* ASCII DEL	*/
/* value decodeISOLine() adds to a 7-bit byte seen between SO and SI */
#define OFFSET	( unsigned char ) 0200	/* or ISO encoding offset */

/* line buffer size in main(); also the allocation step in hNewsCleanQP() */
#define	LSIZ		4096
#define	TRUE		1
#define	FALSE		0

/* which part of the article (or MIME part) the decoder is currently in */
enum section_t { SEC_HEADER, SEC_BODY };
/* transfer encodings tracked for headers and bodies */
enum encode_t { ENC_UNKNOWN, ENC_NONE, ENC_QP, ENC_ISO, ENC_B64 };

/*
 * encodingInfo: recognize a Content-Transfer-Encoding value and convert it
 * to an encode_t.
 *
 *   arg     text following the "Content-Transfer-Encoding: " header name;
 *           leading white space is skipped.  May be NULL.
 *
 * Returns ENC_NONE for "7bit", "8bit" or "none", ENC_QP for
 * "quoted-printable", ENC_B64 for "base64" (all compared case-insensitively
 * as prefixes) and ENC_UNKNOWN for NULL or anything else.
 */
static enum encode_t encodingInfo(char *arg)
{
    if (arg == NULL)
        return ENC_UNKNOWN;

    /*
     * <ctype.h> functions take an unsigned char value (or EOF); a plain
     * char holding 8-bit text may be negative, so cast before the call.
     */
    while (isspace((unsigned char) *arg))
        arg++;

    if (strncasecmp(arg, "7bit", 4) == 0
            || strncasecmp(arg, "8bit", 4) == 0
            || strncasecmp(arg, "none", 4) == 0)
        return ENC_NONE;
    if (strncasecmp(arg, "quoted-printable", 16) == 0)
        return ENC_QP;
    if (strncasecmp(arg, "base64", 6) == 0)
        return ENC_B64;
    return ENC_UNKNOWN;
}

/*
 * uncanonize: strip carriage returns from a NUL-terminated buffer, in place,
 * so that CR+LF in the middle or at the tail becomes LF.
 *
 * A CR is dropped unless the character processed immediately before it was
 * also a CR.  The "previous character" lives in a static variable, so it
 * carries over from one call to the next.
 */
static void uncanonize(iptr)
char   *iptr ;
{
    static char prev = '\0';	/* last character examined, across calls */
    char *src;
    char *dst;

    dst = iptr;			/* output overwrites the input buffer */
    for (src = iptr; *src != '\0'; src++) {
	int drop = (*src == '\r' && prev != '\r');

	if (!drop)
	    *dst++ = *src;
	prev = *src;
    }
    *dst = '\0';
}

/*
 * decodeISOLine: decode one ISO-2022-KR coded line to KSC-5601 (8-bit).
 *
 *   iptr    NUL-terminated input line.
 *   optr    output buffer; the result is never longer than the input, and
 *           the two may be the same buffer.
 *
 * Every line starts in the unshifted state.  SO and SI toggle the state and
 * are not copied.  While shifted, each byte strictly between ' ' and DEL has
 * OFFSET added to it; every other byte is copied as is.
 *
 * A line that ends while still shifted (missing SI) is accepted: the
 * article text is untrusted input, and aborting the whole process through
 * assert() because of one malformed line is not acceptable.
 */
static void decodeISOLine(char *iptr, char *optr)
{
    int shifted = 0;    /* each line begins in unshifted state */

    assert(iptr != NULL && optr != NULL);

    for (; *iptr != '\0'; iptr++) {
        if (*iptr == SHIFTOUT)
            shifted = 1;
        else if (*iptr == SHIFTIN)
            shifted = 0;
        else if (shifted && *iptr > ' ' && *iptr < DEL)
            *optr++ = (char) ((unsigned char) *iptr + OFFSET);
        else
            *optr++ = *iptr;
    }
    *optr = '\0';
}

/*
 *	decodeB64Str()
 *
 *	Decode a base64 encoded string.
 *
 *	ibuf	encoded input (NUL-terminated; may be longer than limit).
 *	obuf	receives the decoded bytes followed by a '\0'.  It may be
 *		the same buffer as (or start before) ibuf: the output never
 *		overtakes the read position.
 *	limit	max number of input characters to decode; the actual input
 *		may be shorter.  A limit <= 0 yields an empty result.
 *
 *	Return value : actual decoded resulting string length.
 *	Input size need not be a multiple of 4; a short final group is
 *	treated as if it had been padded.
 *
 *	Characters outside the base64 alphabet (including PAD) occupy one
 *	slot of the current 4-character group and contribute zero bits.
 *
 *	Adapted from bq.c,   Copyright (C) 1992 Ienup Sung.
 *  @(#)bq.c: base64 encode/decode modules by is@ev.trigem.co.kr   1992.7.22
 */

#define LS6B			00077L	/* least significant 6 bits */
#define LS8B			00377L	/* least significant 8 bits */
#define PAD			'='

static char *b64_alphabet =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

static int decodeB64Str(char ibuf[], char obuf[], int limit)
{
    unsigned long bitbuf = 0;   /* up to four 6-bit groups, MSB first */
    char *iptr = ibuf;
    char *optr = obuf;
    char *offs;
    int valid = 0;              /* alphabet characters in the current group */
    int slots = 0;              /* characters consumed in the current group */

    assert(ibuf != NULL && obuf != NULL);

    /*
     * Test for the terminator BEFORE looking the character up:
     * strchr(alphabet, '\0') would match the alphabet's own terminator
     * and be decoded as the bogus value 64.
     */
    while (iptr - ibuf < limit && *iptr != '\0') {
        bitbuf <<= 6;
        if ((offs = strchr(b64_alphabet, *iptr)) != NULL) {
            bitbuf |= (unsigned long) (offs - b64_alphabet);
            valid++;
        }
        iptr++;
        slots++;

        /* end of input: left-align a short final group to 4 * 6 bits */
        if (iptr - ibuf >= limit || *iptr == '\0') {
            for (; slots < 4; slots++)
                bitbuf <<= 6;
        }

        if (slots == 4) {
            /* n alphabet characters carry n - 1 complete bytes */
            if (valid >= 2) *optr++ = (char) ((bitbuf >> 16) & LS8B);
            if (valid >= 3) *optr++ = (char) ((bitbuf >> 8) & LS8B);
            if (valid == 4) *optr++ = (char) (bitbuf & LS8B);
            bitbuf = 0;
            valid = 0;
            slots = 0;
        }
    }
    *optr = '\0';
    return (int) (optr - obuf);
}

/*
 * decodeB64Line: RFC1341 BASE64 BODY section decoding of one line.
 *
 *   ibuf    NUL-terminated input line, normally ending in '\n'.
 *   obuf    receives the decoded bytes plus a terminating '\0'.
 *
 * The trailing '\n', when present, is not fed to the decoder.  A line
 * without one (or an empty string) is handled as well; the former code
 * blindly used strlen() - 1, which dropped a real character in the first
 * case and produced a negative length in the second.
 */
static void decodeB64Line(char ibuf[], char obuf[])
{
    size_t len;

    assert(ibuf != NULL && obuf != NULL);

    len = strlen(ibuf);
    if (len > 0 && ibuf[len - 1] == '\n')
        len--;                      /* ignore '\n' */
    decodeB64Str(ibuf, obuf, (int) len);
}

/*
 * h2toi: combine two hexadecimal digit characters into one byte value.
 *
 *   ch1     high-order digit ('0'-'9', 'A'-'F' or 'a'-'f')
 *   ch2     low-order digit
 *
 *   ex)  h2toi('4', 'E') == 0x4e == 'N'
 *
 * Returns the value 0..255, or -1 if either character is not a
 * hexadecimal digit.
 */
static int h2toi( ch1, ch2 )
char ch1, ch2 ;
{
    char     digit[2];
    unsigned nibble[2];
    int      k;

    digit[0] = ch1;
    digit[1] = ch2;

    for (k = 0; k < 2; k++) {
	char c = digit[k];

	if (c >= '0' && c <= '9')
	    nibble[k] = c - '0';
	else if (c >= 'A' && c <= 'F')
	    nibble[k] = 10 + c - 'A';
	else if (c >= 'a' && c <= 'f')
	    nibble[k] = 10 + c - 'a';
	else
	    return(-1);
    }

    return (int) (((nibble[0] << 4) + nibble[1]) & LS8B);
}

/*
 * decodeQPStr: decode the text of an RFC1342 "Q" encoded word.
 *
 *   ibuf    encoded text (NUL-terminated; may continue beyond limit).
 *   obuf    receives the decoded text plus a terminating '\0'.  It may be
 *           the same buffer as (or start before) ibuf.
 *   limit   max number of input characters to consume; a limit <= 0
 *           yields an empty result.
 *
 * "=XX" (two hex digits) becomes the byte 0xXX and '_' becomes a blank;
 * everything else is copied unchanged.  An '=' that is not followed by two
 * hex digits inside the limit is copied literally.
 */
static void decodeQPStr(char ibuf[], char obuf[], int limit)
{
    char *iptr = ibuf;
    char *optr = obuf;
    int ctmp;           /* h2toi() returns -1 on non-hexa digit */

    assert(ibuf != NULL && obuf != NULL);

    while (iptr - ibuf < limit && *iptr != '\0') {
        /*
         * Look at iptr[1] and iptr[2] only when both lie inside the
         * limit and before the terminator; otherwise a trailing '='
         * would make us read past the end of the string.
         */
        if (*iptr == '='
                && limit - (iptr - ibuf) >= 3
                && iptr[1] != '\0'
                && (ctmp = h2toi(iptr[1], iptr[2])) >= 0) {
            *optr++ = (char) ctmp;
            iptr += 3;
        }
        else if (*iptr == '_') {        /* translate */
            *optr++ = ' ';
            iptr++;
        }
        else
            *optr++ = *iptr++;
    }
    *optr = '\0';
}

/*
 * decodeQPLine: do RFC1341 quoted-printable BODY section decoding of one
 * line.
 *
 *   iptr    NUL-terminated input line.
 *   obuf    receives the decoded line plus a terminating '\0'.  It may be
 *           the same buffer as the input.
 *
 * "=XX" (two hex digits) becomes the byte 0xXX, "=\n" (soft line break) is
 * removed, and everything else is copied unchanged.
 */
static void decodeQPLine(char *iptr, char obuf[])
{
    char *optr = obuf;
    int ctmp;           /* h2toi() returns -1 on non-hexa digit */

    assert(iptr != NULL && obuf != NULL);

    while (*iptr != '\0') {
        /*
         * iptr[2] exists only if iptr[1] is not the terminator; check
         * that first so a line ending in a bare '=' is not over-read.
         */
        if (*iptr == '=' && iptr[1] != '\0'
                && (ctmp = h2toi(iptr[1], iptr[2])) >= 0) {
            *optr++ = (char) ctmp;
            iptr += 3;
        }
        else if (*iptr == '=' && iptr[1] == '\n')
            iptr += 2;                  /* skip soft line break */
        else
            *optr++ = *iptr++;
    }
    *optr = '\0';
}

/*
 * fixTrailer: cut off trailing blanks and turn a trailing CR-LF into LF.
 *
 *   ibuf    NUL-terminated line, modified in place.
 *
 * One trailing '\n', then one trailing '\r', then any run of blanks and
 * tabs are removed from the end, and the line is terminated with a single
 * "\n".  A line that had no newline at all gets one appended, so the buffer
 * must have room for one extra byte.
 *
 * The scan works on an index that never goes below 0; the previous pointer
 * version dereferenced ibuf[-1] for empty or all-blank lines.
 */
static void fixTrailer(char ibuf[])
{
    size_t len;

    assert(ibuf != NULL);

    len = strlen(ibuf);
    if (len > 0 && ibuf[len - 1] == '\n')
        len--;
    if (len > 0 && ibuf[len - 1] == '\r')
        len--;
    while (len > 0 && (ibuf[len - 1] == ' ' || ibuf[len - 1] == '\t'))
        len--;

    /* ibuf[len] is the first removed character (or the terminator) */
    if (ibuf[len] != '\n') {
        ibuf[len] = '\n';
        ibuf[len + 1] = '\0';
    }
}

/*
 * strncpyz: like strncpy(), except that the destination is always
 * NUL-terminated and is not zero-padded.
 *
 *   to      destination; must have room for len + 1 bytes.
 *   from    NUL-terminated source.
 *   len     maximum number of characters to copy (>= 0).
 */
static void strncpyz(to, from, len)
char *to, *from ;
int len ;
{
    int k;

    assert( to != NULL && from != NULL && len >= 0 ) ;

    for (k = 0; k < len && from[k] != '\0'; k++)
	to[k] = from[k];
    to[k] = '\0';
}

/* Decode header section by RFC1342 MIME "Q" or "B" header encoding rule */
static enum encode_t decodeHeader(iptr, optr, HeadEncoding)
char *iptr, *optr ;
enum encode_t HeadEncoding;
{
    char *preptr, *sufptr, *txtptr, *postptr ;
    assert( iptr != NULL && optr != NULL );

    while(*iptr ) {
	if ( ( preptr = strstr( iptr, PREFIX )) == NULL
	    || ( sufptr = strchr( preptr+strlen(PREFIX), '?' )) == NULL
	    /*   misssing POSTFIX, do not decode   */
	    || ( postptr = strstr( sufptr+strlen(SUFFIXQP), POSTFIX ))==NULL ) {
	    strcpy(optr, iptr ) ;
	    return HeadEncoding;
	}
	txtptr = sufptr+strlen(SUFFIXQP) ;
	/* (header)  =?EUC-KR?Q?=89=AB=CD=EF?=       */
	/* (ptr's)   ^pre    ^sf^txt        ^post    */

	strncpy( optr, iptr, preptr-iptr ) ;
	optr += preptr - iptr ;    /* watch out order */
	iptr = preptr ;

	if ( strncasecmp( sufptr, SUFFIXQP, strlen(SUFFIXQP) ) == 0 ) {
	    decodeQPStr( txtptr , optr, postptr-txtptr ) ;
	    HeadEncoding = ENC_QP ;
	}
	else if ( strncasecmp( sufptr, SUFFIXB64, strlen(SUFFIXB64) ) == 0 ) {
	    decodeB64Str( txtptr , optr, postptr-txtptr ) ;
	    HeadEncoding = ENC_B64 ;
	}
	/* For compatibility with old (before Dec. 94) elm2.3h or hcode2.0  */
	else if ( strncasecmp( preptr, OLDPREFIX, strlen(OLDPREFIX)) == 0 ) {
	    txtptr = preptr+strlen(OLDPREFIX) ;
	    decodeB64Str( txtptr , optr, postptr-txtptr ) ;
	    HeadEncoding = ENC_B64 ;
	}
	else {   /* Unknown coding, do not decode */
	    strncpyz( optr, iptr, postptr+ strlen(POSTFIX)-iptr );
	}

	optr += strlen(optr) ;
	iptr = postptr + strlen(POSTFIX) ;
        *optr = '\0';
    }
    return HeadEncoding;
}

/*
 * replace_charset: look for a "charset=" parameter in a header line and,
 * if its value (optionally preceded by a double quote) starts with
 * iso-2022-kr or iso-8859-1, overwrite everything after "charset=" with
 * "EUC-KR\n".
 *
 * Returns 1 if "charset=" was found (whether or not it was replaced),
 * 0 otherwise.
 */
static int replace_charset(ibuf)
char *ibuf;
{
    char *param;	/* start of "charset=" */
    char *value;	/* start of the charset name */

    param = strstr(ibuf, "charset=");
    if (param == NULL)
	return 0;

    value = param + 8;
    if (*value == '"')
	value++;

    if (strncasecmp(value, "iso-2022-kr", 11) == 0
	    || strncasecmp(value, "iso-8859-1", 10) == 0)
	strcpy(param + 8, "EUC-KR\n");	/* replace with "euc-kr" */

    return 1;		/* charset detected */
}

/*static char *strcasestr(buf, str)
char *buf;
char *str;
{
    int i, n;
    int lenstr = strlen(str);

    n = strlen(buf) - lenstr + 1;
    for (i=0; i<n; i++,buf++)
	if (strncasecmp(buf, str, lenstr) == 0)
	    return buf;
    return NULL;
}*/

/*
 * get_mpb_string: extract a quoted multipart boundary from a header line.
 *
 *   ibuf    header text to search for  boundary="  (case-insensitive).
 *
 * Returns a malloc()ed copy of the boundary string, which the caller must
 * free(), or NULL when no boundary is found, when fewer than 11 characters
 * follow the opening quote (minimum length kept from the original code),
 * when the boundary is empty, or when memory is exhausted.
 *
 * The boundary ends at the closing quote, so parameters that follow it on
 * the same line are not included.  If the closing quote is missing,
 * trailing '\n', '"' and ';' characters are trimmed instead.
 *
 * The search is done with strncasecmp() rather than the non-standard
 * strcasestr(), which is not declared by the headers this file includes
 * unless extension macros are enabled.
 */
static char *get_mpb_string(char *ibuf)
{
    static const char key[] = "boundary=\"";
    const size_t keylen = sizeof key - 1;
    char *p, *endq, *bstr;
    size_t len;

    for (p = ibuf; *p != '\0'; p++)
        if (strncasecmp(p, key, keylen) == 0)
            break;
    if (*p == '\0')
        return NULL;

    p += keylen;
    len = strlen(p);
    if (len <= 10)              /* minimum length of the boundary string */
        return NULL;

    endq = strchr(p, '"');
    if (endq != NULL)
        len = (size_t) (endq - p);
    else
        while (len > 0 && (p[len - 1] == '\n' || p[len - 1] == '"'
                           || p[len - 1] == ';'))
            len--;

    /* an empty boundary would match every body line */
    if (len == 0)
        return NULL;

    bstr = malloc(len + 1);
    if (bstr == NULL)
        return NULL;
    memcpy(bstr, p, len);
    bstr[len] = '\0';
    return bstr;
}

/*
 * isContentTypeText: return 1 if the Content-Type value contains the word
 * "text" anywhere (compared case-insensitively), 0 otherwise.
 *
 * The scan uses strncasecmp() instead of the non-standard strcasestr(),
 * which is not declared by the headers this file includes unless extension
 * macros are enabled.
 */
static int isContentTypeText(char *cstr)
{
    for (; *cstr != '\0'; cstr++)
        if (strncasecmp(cstr, "text", 4) == 0)
            return 1;
    return 0;
}

/*
 * isUueHeader: return 1 if the line looks like a uuencode header
 * ("begin <3 digits> <name>"), 0 otherwise.
 *
 * Leading white space is skipped and "begin " is matched
 * case-insensitively.  At least five characters must follow it; the first
 * three must be digits, and something other than white space must come
 * after them.  The blank between mode and name is skipped if present but
 * is not required.
 */
static int isUueHeader(char *str)
{
    int i;

    /* <ctype.h> functions need unsigned char values, hence the casts */
    while (isspace((unsigned char) *str))
        str++;
    if (strncasecmp(str, "begin ", 6) != 0)
        return 0;

    /* check three digits and what follows */
    str += 6;
    if (strlen(str) < 5)        /* 3 digits, 1 blank, and 1 filename */
        return 0;

    for (i = 0; i < 3; i++)
        if (!isdigit((unsigned char) *str++))
            return 0;
    while (isspace((unsigned char) *str))       /* skip blanks */
        str++;
    return *str != '\0';
}

/*
 * Copy the NUL-terminated line src to dst.  Safe when dst == src (the
 * caller decodes in place) or when the two buffers overlap.
 */
static void hmdCopyLine(char *dst, char *src)
{
    if (dst != src)
        memmove(dst, src, strlen(src) + 1);
}

/*
 * hMailDecode: decode one line of an article.
 *
 *   ibuf    the input line (modified: trailing blanks / CR are removed).
 *           NULL or an empty string resets all state for a new article.
 *   obuf    receives the decoded line; may be the same buffer as ibuf.
 *
 * Returns the section (SEC_HEADER or SEC_BODY) the decoder is in after the
 * line has been processed.
 *
 * State kept between calls: current section, header and body encodings,
 * whether an ISO-2022-KR introducer has been seen, whether a Content-Type
 * header is still waiting for its charset/boundary on a continuation line,
 * and the multipart boundary string (heap-allocated, owned here).
 */
static enum section_t
hMailDecode(char *ibuf, char *obuf)
{
    char *p, *q;
    static enum section_t section = SEC_HEADER;
    static enum encode_t HeadEncoding = ENC_UNKNOWN;
    static enum encode_t Encoding = ENC_UNKNOWN;
    static int  isoMode = 0;
    static int  isPendingCT = 0;        /* previous CT header is incomplete */
    static char *mpBoundary = NULL;     /* multipart boundary text holder */

    /* initialize */
    if (!ibuf || !*ibuf) {
        section = SEC_HEADER;
        HeadEncoding = ENC_UNKNOWN;
        Encoding = ENC_UNKNOWN;
        isoMode = 0;
        isPendingCT = 0;
        free(mpBoundary);
        mpBoundary = NULL;
        return section;
    }

    fixTrailer(ibuf);
    if (section == SEC_HEADER && *ibuf == '\n') {
        /* blank line: end of headers */
        section = SEC_BODY;
        strcpy(obuf, "\n");
        return section;
    }

    if (section == SEC_HEADER) {

        if (isPendingCT) {
            if (!isspace((unsigned char) *ibuf))
                isPendingCT = 0;        /* not a continuation line */
            else if (!replace_charset(ibuf) && !mpBoundary
                        && (mpBoundary = get_mpb_string(ibuf))) {
                hmdCopyLine(obuf, ibuf);
                uncanonize(obuf);
                return section;
            }
        }

        /* Content-Type must be checked before Content-Transfer-Encoding */
        if (strncasecmp(ibuf, HEADER_CT, strlen(HEADER_CT)) == 0) {
            p = ibuf + strlen(HEADER_CT);

            /* case-insensitive search for "multipart" (portable) */
            for (q = p; *q != '\0'; q++)
                if (strncasecmp(q, "multipart", 9) == 0)
                    break;

            if (*q != '\0') {
                /* drop any earlier boundary instead of leaking it */
                free(mpBoundary);
                mpBoundary = get_mpb_string(q + 9);
                if (mpBoundary == NULL)
                    isPendingCT = 1;            /* continued header */
            } else if (!isContentTypeText(p)) {
                Encoding = ENC_NONE;
            } else if (!replace_charset(p))
                isPendingCT = 1;
            hmdCopyLine(obuf, ibuf);
            /* Encoding = ENC_ISO and iso-8859-1 */
        }
        else if (Encoding != ENC_NONE &&
                strncasecmp(ibuf, HEADER_CTE, strlen(HEADER_CTE)) == 0) {
            Encoding = encodingInfo(ibuf + strlen(HEADER_CTE));
            /* the body will be decoded, so advertise it as 8bit */
            if (Encoding == ENC_QP || Encoding == ENC_B64)
                sprintf(obuf, "%s%s\n", HEADER_CTE, "8bit");
            else
                hmdCopyLine(obuf, ibuf);
        }
        else if (strstr(ibuf, PREFIX))
            HeadEncoding = decodeHeader(ibuf, obuf, HeadEncoding);
        else
            hmdCopyLine(obuf, ibuf);
    }
    else if (section == SEC_BODY) {

        if (mpBoundary && strstr(ibuf, mpBoundary)) {
            /* boundary line: the headers of the next part follow */
            section = SEC_HEADER;
            HeadEncoding = ENC_UNKNOWN;
            Encoding = ENC_UNKNOWN;
            isoMode = 0;
            hmdCopyLine(obuf, ibuf);
        }

        else if (Encoding == ENC_NONE)
            hmdCopyLine(obuf, ibuf);
        else if (isUueHeader(ibuf)) {
            /* uuencoded data follows: leave the rest alone */
            Encoding = ENC_NONE;
            hmdCopyLine(obuf, ibuf);
        }

        else if (Encoding == ENC_QP)
            decodeQPLine(ibuf, obuf);
        else if (Encoding == ENC_B64)
            decodeB64Line(ibuf, obuf);
        else if ((p = strstr(ibuf, INTRO_ISO))) {
            /*
             * Remove the ISO intro sequence from ibuf.  Source and
             * destination overlap, so memmove() is required; strcpy()
             * on overlapping strings is undefined.
             */
            q = p + strlen(INTRO_ISO);
            memmove(p, q, strlen(q) + 1);
            decodeISOLine(ibuf, obuf);
            isoMode = 1;
        }
    /*
     * If headers are "B" encoded AND content line has SO char without prior
     * ISO introducer, we assume missing introducer. It's a feature.
     */
        else if ((isoMode || HeadEncoding == ENC_B64)
                                && strchr(ibuf, SHIFTOUT) != NULL)
            decodeISOLine(ibuf, obuf);
        else
            hmdCopyLine(obuf, ibuf);
    }
    uncanonize(obuf);
    return section;
}

/*
** Is it posted on a Han newsgroups?
**
** line is the value of a Newsgroups header: a comma separated list of
** group names, each optionally preceded by white space.  Returns 1 (true)
** if any name starts with "han." but not with "han.test"; returns 0
** (false) otherwise, including when line is NULL.
*/
int
IsHanNewsgroups(char *line)
{
    char *p = line;

    while (p != NULL) {
        /* <ctype.h> functions need unsigned char values */
        while (isspace((unsigned char) *p))
            p++;
        if (strncmp(p, "han.", 4) == 0 &&
                        strncmp(p, "han.test", 8) != 0)
            return 1;

        /* move to the name after the next comma, if there is one */
        p = strchr(p, ',');
        if (p != NULL)
            p++;
    }
    return 0;
}


/*
** Return CleanQP filtered article.  NULL indicates ERROR.
**
** article          the complete article as one NUL-terminated string.  It
**                  is modified temporarily (each line is terminated in
**                  turn) but is restored before returning.
** checkNewsgroups  if non-zero, the first "Newsgroups:" line is checked
**                  with IsHanNewsgroups() and NULL is returned when it
**                  names no Han group.
**
** Returns a malloc()ed string holding the filtered article; the caller
** frees it.  NULL is returned on allocation failure or when the
** Newsgroups check fails.
**
** Each line is copied to the output and decoded there in place.  When
** decoding changes a header, the original header (including continuation
** lines) is appended after it with an "X-Orig-" prefix.
*/
char *
hNewsCleanQP(char *article, int checkNewsgroups)
{
    char	*newart, *grown;
    size_t	newlen, used, linelen, need;
    char	*p, *q, *next;
    char	hold = '\0';
    char	*orig = NULL;	/* start of a header that decoding changed */
    enum	section_t section;

    newlen = strlen(article) + LSIZ;
    newart = malloc(newlen);
    if (newart == NULL) {
	fprintf(stderr, "hNewsDecode: can't malloc %lu bytes\n",
		(unsigned long) newlen);
	return NULL;
    }

    section = hMailDecode((char *)NULL, (char *)NULL);
    used = 0;
    p = article;
    for (;;) {

	/*
	** Make the input line.  Remember the char of start of next line.
	** Without a next line, hold is '\0' so that a pending X-Orig
	** header is flushed.
	*/
	next = strchr(p, '\n');
	if (next) {
	    hold = *++next;
	    *next = '\0';
	} else
	    hold = '\0';

	/*
	** Check Newsgroups line.
	*/
	if (checkNewsgroups && strncasecmp(p, "newsgroups:", 11) == 0) {
	    checkNewsgroups = FALSE;
	    if (!IsHanNewsgroups(p+11)) {
		free(newart);
		if (next)
		    *next = hold;
		return NULL;
	    }
	}

	/*
	** Allocate output space.  Room is needed for the line itself and
	** for a possible "X-Orig-" copy of the whole original header,
	** which starts at orig (or at p) and runs to the end of this
	** line; a fixed margin alone is not enough for long lines.
	*/
	linelen = strlen(p);
	need = used + 2 * linelen + (orig ? (size_t) (p - orig) : 0)
		+ LSIZ + LSIZ;
	if (newlen < need) {
	    need += LSIZ + LSIZ;	/* grow in steps, not line by line */
	    grown = realloc(newart, need);
	    if (grown == NULL) {
		fprintf(stderr, "hNewsDecode: can't realloc %lu bytes\n",
			(unsigned long) need);
		free(newart);
		if (next)
		    *next = hold;		/* recover old char */
		return NULL;
	    }
	    newart = grown;
	    newlen = need;
	}

	/*
	** Apply filter, and set up next output pointer.
	*/
	q = newart + used;
	strcpy(q, p);
	section = hMailDecode(q, q);
	if (section == SEC_HEADER) {
	    /* a header (not a continuation line) that decoding changed */
	    if (orig == NULL && !isspace((unsigned char) *p)
		    && strcmp(p, q) != 0)
		orig = p;
	    /* header complete: next line is no continuation line */
	    if (orig && (!isspace((unsigned char) hold) || hold == '\n')) {
		used += strlen(q);
		sprintf(newart+used, "X-Orig-%s", orig);
		orig = NULL;
	    }
	}
	used += strlen(newart+used);

	if (next == NULL)
	    break;
	*next = hold;			/* recover old char */
	p = next;
    }
    return newart;
}

#ifdef _MAIN

/*
 * Standalone filter:  hdcode [-n] [file]
 *
 * Reads the whole article from file (or stdin), runs it through
 * hNewsCleanQP() and writes the result to stdout.  If hNewsCleanQP()
 * returns NULL the article is written out unchanged.  With -n the
 * Newsgroups check of hNewsCleanQP() is enabled.
 *
 * Exits with status 1 on a usage error, when the input cannot be opened,
 * or when memory is exhausted.
 */
int main(int argc, char **argv)
{
    char	*article, *newart, *grown;
    char	line[LSIZ];
    size_t	len;
    size_t	used = 0;
    size_t	artlen = 0;
    int		checkNewsgroups = FALSE;
    int		i;
    char	*infile = NULL;

    for (i = 1; i < argc; i++) {
	if (strcmp(argv[i], "-n") == 0)
	    checkNewsgroups = TRUE;
	else if (*argv[i] != '-' && infile == NULL)
	    infile = argv[i];
	else {
	    fprintf(stderr, "usage: %s [-n] [file]\n", argv[0]);
	    exit(1);
	}
    }

    if (infile && !freopen(infile, "r", stdin)) {
	fprintf(stderr, "can't open input %s\n", infile);
	exit(1);
    }

    artlen = 100 * LSIZ;
    article = malloc(artlen);
    if (article == NULL) {
	fprintf(stderr, "can't malloc %lu bytes\n", (unsigned long) artlen);
	exit(1);
    }
    while (fgets(line, sizeof(line), stdin)) {
	len = strlen(line);
	/* keep room for this chunk and the final terminator */
	if (artlen <= used + len) {
	    artlen += LSIZ * 10;
	    grown = realloc(article, artlen);
	    if (grown == NULL) {
		fprintf(stderr, "can't realloc %lu bytes\n",
			(unsigned long) artlen);
		free(article);
		exit(1);
	    }
	    article = grown;
	}
	memcpy(article + used, line, len);
	used += len;
    }
    article[used] = '\0';

    newart = hNewsCleanQP(article, checkNewsgroups);
    if (newart) {
	free(article);
	article = newart;
    }
    fputs(article, stdout);
    free(article);
    return 0;
}
#endif	/* _MAIN */
#endif	/* CLEAN_QP */