--- lib/XML/SAX/PurePerl/Reader/Stream.pm +++ lib/XML/SAX/PurePerl/Reader/Stream.pm @@ -70,6 +70,54 @@ my $self = shift; my ($encoding) = @_; # warn("set encoding to: $encoding\n"); + + # make sure that the buffer used to detect the encoding + # does not end in the middle of a utf8 sequence + if ($encoding eq 'UTF-8' && + !$self->[EOF] && + !utf8::is_utf8($self->[BUFFER]) && # make sure we do not do it twice + length($self->[BUFFER]) > 5) { + + my $x = reverse(substr($self->[BUFFER], -5)); + my $y = 0; + + # skip the all the bytes at the end of buffer + # starting with bits 10 (continuation bytes of utf8 sequence) + while ($x ne "" && (ord($x) & 0xc0) == 0x80) { + $y--; + $x = substr($x, 1); + } + + # if $x is ascii character, do nothing + # otherwise we must take a look how many + # continuation bytes we need + if ((ord($x) & 0xc0) == 0xc0) { + $x = ord($x); + if (($x & 0xe0) == 0xc0) { # the sequence contains one more byte + $y++; + } elsif (($x & 0xf0) == 0xe0) { # ...2 bytes + $y += 2; + } elsif (($x & 0xf8) == 0xf0) { # ...3 bytes + $y += 3; + } elsif (($x & 0xfc) == 0xf8) { # ...4 bytes + $y += 4; + } elsif (($x & 0xfe) == 0xfc) { # ...5 bytes + $y += 5; + } + + # read the last sequence in the buffer completely, if needed + if ($y > 0) { + my $buf; + my $bytesread = read($self->[FH], $buf, $y); + if ($bytesread) { + $self->[BUFFER] .= $buf; + } elsif (defined($bytesread)) { + $self->[EOF]++; + } + } + } + } + XML::SAX::PurePerl::Reader::switch_encoding_stream($self->[FH], $encoding); XML::SAX::PurePerl::Reader::switch_encoding_string($self->[BUFFER], $encoding); $self->[ENCODING] = $encoding;