58 lines
1.7 KiB
Diff
58 lines
1.7 KiB
Diff
|
--- lib/XML/SAX/PurePerl/Reader/Stream.pm
|
||
|
+++ lib/XML/SAX/PurePerl/Reader/Stream.pm
|
||
|
@@ -70,6 +70,54 @@
|
||
|
my $self = shift;
|
||
|
my ($encoding) = @_;
|
||
|
# warn("set encoding to: $encoding\n");
|
||
|
+
|
||
|
+ # make sure that the buffer used to detect the encoding
|
||
|
+ # does not end in the middle of a utf8 sequence
|
||
|
+ if ($encoding eq 'UTF-8' &&
|
||
|
+ !$self->[EOF] &&
|
||
|
+ !utf8::is_utf8($self->[BUFFER]) && # make sure we do not do it twice
|
||
|
+ length($self->[BUFFER]) > 5) {
|
||
|
+
|
||
|
+ my $x = reverse(substr($self->[BUFFER], -5));
|
||
|
+ my $y = 0;
|
||
|
+
|
||
|
+ # skip the all the bytes at the end of buffer
|
||
|
+ # starting with bits 10 (continuation bytes of utf8 sequence)
|
||
|
+ while ($x ne "" && (ord($x) & 0xc0) == 0x80) {
|
||
|
+ $y--;
|
||
|
+ $x = substr($x, 1);
|
||
|
+ }
|
||
|
+
|
||
|
+ # if $x is ascii character, do nothing
|
||
|
+ # otherwise we must take a look how many
|
||
|
+ # continuation bytes we need
|
||
|
+ if ((ord($x) & 0xc0) == 0xc0) {
|
||
|
+ $x = ord($x);
|
||
|
+ if (($x & 0xe0) == 0xc0) { # the sequence contains one more byte
|
||
|
+ $y++;
|
||
|
+ } elsif (($x & 0xf0) == 0xe0) { # ...2 bytes
|
||
|
+ $y += 2;
|
||
|
+ } elsif (($x & 0xf8) == 0xf0) { # ...3 bytes
|
||
|
+ $y += 3;
|
||
|
+ } elsif (($x & 0xfc) == 0xf8) { # ...4 bytes
|
||
|
+ $y += 4;
|
||
|
+ } elsif (($x & 0xfe) == 0xfc) { # ...5 bytes
|
||
|
+ $y += 5;
|
||
|
+ }
|
||
|
+
|
||
|
+ # read the last sequence in the buffer completely, if needed
|
||
|
+ if ($y > 0) {
|
||
|
+ my $buf;
|
||
|
+ my $bytesread = read($self->[FH], $buf, $y);
|
||
|
+ if ($bytesread) {
|
||
|
+ $self->[BUFFER] .= $buf;
|
||
|
+ } elsif (defined($bytesread)) {
|
||
|
+ $self->[EOF]++;
|
||
|
+ }
|
||
|
+ }
|
||
|
+ }
|
||
|
+ }
|
||
|
+
|
||
|
XML::SAX::PurePerl::Reader::switch_encoding_stream($self->[FH], $encoding);
|
||
|
XML::SAX::PurePerl::Reader::switch_encoding_string($self->[BUFFER], $encoding);
|
||
|
$self->[ENCODING] = $encoding;
|