commit b9aae1e43eb2c8e989510187cff0ba3e996f9a4c Author: Gisle Aas Date: Thu Oct 22 21:45:54 2009 +0200 decode_entities confused by trailing incomplete entity Mark Martinec reported crashes when running SpamAssassin, given a particular HTML junk mail to parse. The problem was caused by HTML::Parser's decode_entities function confusing itself when it encountered strings with incomplete entities at the end of the string. diff --git a/t/entities.t b/t/entities.t index 7f6a29a..e96501c 100644 --- a/t/entities.t +++ b/t/entities.t @@ -1,6 +1,6 @@ use HTML::Entities qw(decode_entities encode_entities encode_entities_numeric); -use Test::More tests => 12; +use Test::More tests => 13; $a = "Våre norske tegn bør æres"; @@ -71,6 +71,8 @@ is(decode_entities("abc&def&ghi&abc;&def;"), "abc&def&ghi&abc;&def;"); is(decode_entities("'"), "'"); is(encode_entities("'", "'"), "'"); +is(decode_entities("Attention Homeοωnөrs...1ѕt Tімe Eνөг"), + "Attention Home\x{3BF}\x{3C9}n\x{4E9}rs...1\x{455}t T\x{456}\x{43C}e E\x{3BD}\x{4E9}\x{433}"); __END__ # Quoted from rfc1866.txt diff --git a/util.c b/util.c index 28fec78..6f56a2b 100644 --- a/util.c +++ b/util.c @@ -94,14 +94,14 @@ decode_entities(pTHX_ SV* sv, HV* entity2char, bool expand_prefix) ent_start = s; repl = 0; - if (*s == '#') { + if (s < end && *s == '#') { UV num = 0; UV prev = 0; int ok = 0; s++; - if (*s == 'x' || *s == 'X') { + if (s < end && (*s == 'x' || *s == 'X')) { s++; - while (*s) { + while (s < end) { char *tmp = strchr(PL_hexdigit, *s); if (!tmp) break; @@ -117,7 +117,7 @@ decode_entities(pTHX_ SV* sv, HV* entity2char, bool expand_prefix) } } else { - while (isDIGIT(*s)) { + while (s < end && isDIGIT(*s)) { num = num * 10 + (*s - '0'); if (prev && num < prev) { /* overflow */ @@ -180,7 +180,7 @@ decode_entities(pTHX_ SV* sv, HV* entity2char, bool expand_prefix) } else { char *ent_name = s; - while (isALNUM(*s)) + while (s < end && isALNUM(*s)) s++; if (ent_name != s && entity2char) { SV** svp; @@ -216,7 +216,7 @@ 
decode_entities(pTHX_ SV* sv, HV* entity2char, bool expand_prefix) if (repl) { char *repl_allocated = 0; - if (*s == ';') + if (s < end && *s == ';') s++; t--; /* '&' already copied, undo it */