Files
perl-HTML-TagParser/HTML-TagParser-0.16-subtree.diff

375 lines
12 KiB
Diff

--- HTML-TagParser-0.16/t/12_navigation.t.orig 2008-07-11 01:34:16.000000000 +0200
+++ HTML-TagParser-0.16/t/12_navigation.t 2008-07-11 02:06:45.000000000 +0200
@@ -0,0 +1,40 @@
+# ----------------------------------------------------------------
+use strict;
+use Test::More tests => 8;
+BEGIN { use_ok('HTML::TagParser') };
+# ----------------------------------------------------------------
+
+my $SOURCE = <<EOT;
+<html>
+<body>
+<div id="foo">
+ <span>AAA</span>
+ <div id="bar"selected>
+ BBB
+ <span>CCC</span>
+ DDD
+ <div/>
+ EEE
+ </div>
+ <span>FFF</span>
+</div>
+</body>
+</html>
+EOT
+# ----------------------------------------------------------------
+
+my $document = HTML::TagParser->new( $SOURCE );
+ok( ref $document, "new()" );
+my $bar = $document->getElementById('bar');
+my $fff = $bar->nextSibling();
+like( $fff->innerText(), qr/FFF/s, "nextSibling" );
+is( $fff->nextSibling(), undef, "no nextSibling" );
+my $ch = $bar->childNodes();
+is( $#$ch, 1, "childNodes" );
+is( $ch->[1]->parentNode()->id(), "bar", "parentNode" );
+is( $ch->[1]->parentNode()->parentNode()->id(), "foo", "parent.parentNode" );
+is( $ch->[1]->parentNode()->parentNode()->parentNode->parentNode()->parentNode(), undef, "root parentNode" );
+
+# ----------------------------------------------------------------
+;1;
+# ----------------------------------------------------------------
--- HTML-TagParser-0.16/t/08_nest.t.orig 2006-05-05 21:14:24.000000000 +0200
+++ HTML-TagParser-0.16/t/08_nest.t 2008-07-11 01:29:17.000000000 +0200
@@ -28,12 +28,12 @@
like( $body->innerText(), qr/AAA.*BBB.*CCC.*DDD.*EEE.*FFF/s, "body" );
my $foo = $html->getElementById( "foo" );
- like( $foo->innerText(), qr/AAA/s, "foo" );
-# like( $foo->innerText(), qr/AAA.*BBB.*CCC.*DDD.*EEE.*FFF/s, "foo" );
+# like( $foo->innerText(), qr/AAA/s, "foo" );
+ like( $foo->innerText(), qr/AAA.*BBB.*CCC.*DDD.*EEE.*FFF/s, "foo" );
my $bar = $html->getElementById( "bar" );
- like( $bar->innerText(), qr/BBB.*CCC.*DDD\W*$/s, "bar" );
-# like( $bar->innerText(), qr/BBB.*CCC.*DDD.*EEE/s, "bar" );
+# like( $bar->innerText(), qr/BBB.*CCC.*DDD\W*$/s, "bar" );
+ like( $bar->innerText(), qr/BBB.*CCC.*DDD.*EEE/s, "bar" );
# ----------------------------------------------------------------
;1;
# ----------------------------------------------------------------
--- HTML-TagParser-0.16/lib/HTML/TagParser.pm.orig 2007-04-06 12:48:08.000000000 +0200
+++ HTML-TagParser-0.16/lib/HTML/TagParser.pm 2008-07-11 02:15:14.000000000 +0200
@@ -10,12 +10,15 @@
my $elem = $html->getElementsByTagName( "title" );
print "<title>", $elem->innerText(), "</title>\n" if ref $elem;
-Parse a HTML source and find its first <form action=""> attribute's value.
+Parse a HTML source and find its first <form action=""> attribute's value
+and find all input elements belonging to this form.
my $src = '<html><form action="hoge.cgi">...</form></html>';
my $html = HTML::TagParser->new( $src );
my $elem = $html->getElementsByTagName( "form" );
print "<form action=\"", $elem->getAttribute("action"), "\">\n" if ref $elem;
+ my @first_inputs = $elem->subTree()->getElementsByTagName( "input" );
+ my $form = $first_inputs[0]->getParent();
Fetch a HTML file via HTTP, and display its all <a> elements and attributes.
@@ -120,6 +123,43 @@
This method returns $elem's innerText without tags.
+=head2 $subhtml = $elem->subTree();
+
+This method returns a new object of class HTML::Parser,
+with all the elements that are in the DOM hierarchy under $elem.
+
+=head2 $elem = $elem->nextSibling();
+
+This method returns the next sibling within the same parent.
+It returns undef when called on a closing tag or on the lastChild node
+of a parentNode.
+
+=head2 $elem = $elem->previousSibling();
+
+This method returns the previous sibling within the same parent.
+It returns undef when called on the firstChild node of a parentNode.
+
+=head2 $child_elem = $elem->firstChild();
+
+This method returns the first child node of $elem.
+It returns undef when called on a closing tag element or on a
+non-container or empty container element.
+
+=head2 $child_elems = $elem->childNodes();
+
+This method creates an array of all child nodes of $elem and returns the array by reference.
+It returns an empty array-ref [] whenever firstChild() would return undef.
+
+=head2 $child_elem = $elem->lastChild();
+
+This method returns the last child node of $elem.
+It returns undef whenever firstChild() would return undef.
+
+=head2 $parent = $elem->parentNode();
+
+This method returns the parent node of $elem.
+It returns undef when called on root nodes.
+
=head2 $attr = $elem->attributes();
This method returns a hash of $elem's all attributes.
@@ -128,6 +168,17 @@
This method returns the value of $elem's attributes which name is $key.
+=head1 BUGS
+
+The HTML-Parser is simple. Methods innerText and subTree may be
+fooled by nested tags or embedded javascript code.
+
+The methods with 'Sibling', 'child' or 'Child' in their names do not cache their results.
+The most expensive ones are lastChild() and previousSibling().
+parentNode() is also expensive, but only once. It does caching.
+
+The DOM tree is read-only, as this is just a parser.
+
=head1 INTERNATIONALIZATION
This module natively understands the character encoding used in document
@@ -157,12 +208,21 @@
use Carp;
use vars qw( $VERSION );
-$VERSION = "0.16";
+$VERSION = "0.16.1";
my $J2E = {qw( jis ISO-2022-JP sjis Shift_JIS euc EUC-JP ucs2 UCS2 )};
my $E2J = { map { lc($_) } reverse %$J2E };
my $SEC_OF_DAY = 60 * 60 * 24;
+# [000] '/' if closing tag.
+# [001] tagName
+# [002] attributes string (with trailing /, if self-closing tag).
+# [003] content until next (nested) tag.
+# [004] attributes hash cache.
+# [005] innerText combined strings cache.
+# [006] index of matching closing tag (or opening tag, if [000]=='/')
+# [007] index of parent (aka container) tag.
+#
sub new {
my $package = shift;
my $src = shift;
@@ -330,10 +390,10 @@
return if ( defined $elem->[002] && $elem->[002] =~ m#/$# ); # <xxx/>
my $tagname = $elem->[001];
+ my $closing = HTML::TagParser::Util::find_closing($flat, $cur);
my $list = [];
- for ( ; $cur < $#$flat ; $cur++ ) {
+ for ( ; $cur < $closing ; $cur++ ) {
push( @$list, $flat->[$cur]->[003] );
- last if ( $flat->[ $cur + 1 ]->[001] eq $tagname );
}
my $text = join( "", grep { $_ ne "" } @$list );
$text =~ s/^\s+//s;
@@ -342,6 +402,127 @@
$elem->[005] = HTML::TagParser::Util::xml_unescape( $text );
}
+sub subTree
+{
+ my $self = shift;
+ my ( $flat, $cur ) = @$self;
+ my $elem = $flat->[$cur];
+ return if $elem->[000]; # </xxx>
+ my $closing = HTML::TagParser::Util::find_closing($flat, $cur);
+ my $list = [];
+ while (++$cur < $closing)
+ {
+ push @$list, $flat->[$cur];
+ }
+
+ # allow the getElement...() methods on the returned object.
+ return bless { flat => $list }, 'HTML::TagParser';
+}
+
+
+sub nextSibling
+{
+ my $self = shift;
+ my ( $flat, $cur ) = @$self;
+ my $elem = $flat->[$cur];
+
+ return undef if $elem->[000]; # </xxx>
+ my $closing = HTML::TagParser::Util::find_closing($flat, $cur);
+ my $next_s = $flat->[$closing+1];
+ return undef unless $next_s;
+ return undef if $next_s->[000]; # parent's </xxx>
+ return HTML::TagParser::Element->new( $flat, $closing+1 );
+}
+
+sub firstChild
+{
+ my $self = shift;
+ my ( $flat, $cur ) = @$self;
+ my $elem = $flat->[$cur];
+ return undef if $elem->[000]; # </xxx>
+ my $closing = HTML::TagParser::Util::find_closing($flat, $cur);
+ return undef if $closing <= $cur+1; # no children here.
+ return HTML::TagParser::Element->new( $flat, $cur+1 );
+}
+
+sub childNodes
+{
+ my $self = shift;
+ my ( $flat, $cur ) = @$self;
+ my $child = firstChild($self);
+ return [] unless $child; # an empty array is easier for our callers than undef
+ my @c = ( $child );
+ while (defined ($child = nextSibling($child)))
+ {
+ push @c, $child;
+ }
+ return \@c;
+}
+
+sub lastChild
+{
+ my $c = childNodes(@_);
+ return undef unless $c->[0];
+ return $c->[-1];
+}
+
+sub previousSibling
+{
+ my $self = shift;
+ my ( $flat, $cur ) = @$self;
+
+ ## This one is expensive.
+ ## We use find_closing() which walks forward.
+ ## We'd need a find_opening() which walks backwards.
+ ## So we walk backwards one by one and consult find_closing()
+ ## until we find $cur-1 or $cur.
+
+ my $idx = $cur-1;
+ while ($idx >= 0)
+ {
+ if ($flat->[$idx][000] && defined($flat->[$idx][006]))
+ {
+ $idx = $flat->[$idx][006]; # use cache for backwards skipping
+ next;
+ }
+
+ my $closing = HTML::TagParser::Util::find_closing($flat, $idx);
+ return HTML::TagParser::Element->new( $flat, $idx )
+ if defined $closing and ($closing == $cur || $closing == $cur-1);
+ $idx--;
+ }
+ return undef;
+}
+
+sub parentNode
+{
+ my $self = shift;
+ my ( $flat, $cur ) = @$self;
+
+ return HTML::TagParser::Element->new( $flat, $flat->[$cur][007]) if $flat->[$cur][007]; # cache
+
+ ##
+ ## This one is very expensive.
+ ## We use previousSibling() to walk backwards, and
+ ## previousSibling() is expensive.
+ ##
+ my $ps = $self;
+ my $first = $self;
+
+ while (defined($ps = previousSibling($ps))) { $first = $ps; }
+
+ my $parent = $first->[1] - 1;
+ return undef if $parent < 0;
+ die "parent too short" if HTML::TagParser::Util::find_closing($flat, $parent) <= $cur;
+
+ $flat->[$cur][007] = $parent; # cache
+ return HTML::TagParser::Element->new( $flat, $parent )
+}
+
+##
+## feature:
+## self-closing tags have an additional attribute '/' => '/'.
+##
sub attributes {
my $self = shift;
my ( $flat, $cur ) = @$self;
@@ -420,6 +601,66 @@
$flat;
}
+## returns 1 beyond the end, if not found.
+## returns undef if called on a </xxx> closing tag
+sub find_closing
+{
+ my ($flat, $cur) = @_;
+
+ return $flat->[$cur][006] if $flat->[$cur][006]; # cache
+ return $flat->[$cur][006] = $cur if (($flat->[$cur][002]||'') =~ m{/$}); # self-closing
+
+ my $name = $flat->[$cur][001];
+ my $pre_nest = 0;
+ ## count how many levels deep this type of tag is nested.
+ my $idx;
+ for ($idx = 0; $idx <= $cur; $idx++)
+ {
+ my $e = $flat->[$idx];
+ next unless $e->[001] eq $name;
+ next if (($e->[002]||'') =~ m{/$}); # self-closing
+ $pre_nest += ($e->[000]) ? -1 : 1;
+ $pre_nest = 0 if $pre_nest < 0;
+ $idx = $e->[006]-1 if !$e->[000] && $e->[006]; # use caches for skipping forward.
+ }
+ my $last_idx = $#$flat;
+
+ ## we move last_idx closer, in case this container
+ ## has not all its subcontainers closed properly.
+ my $post_nest = 0;
+ for ($idx = $last_idx; $idx > $cur; $idx--)
+ {
+ my $e = $flat->[$idx];
+ next unless $e->[001] eq $name;
+ $last_idx = $idx-1; # remember where a matching tag was
+ next if (($e->[002]||'') =~ m{/$}); # self-closing
+ $post_nest -= ($e->[000]) ? -1 : 1;
+ $post_nest = 0 if $post_nest < 0;
+ last if $pre_nest <= $post_nest;
+ $idx = $e->[006]+1 if $e->[000] && defined $e->[006]; # use caches for skipping backwards.
+ }
+
+ my $nest = 1; # we know it is not self-closing. start behind.
+
+ for ($idx = $cur+1; $idx <= $last_idx; $idx++)
+ {
+ my $e = $flat->[$idx];
+ next unless $e->[001] eq $name;
+ next if (($e->[002]||'') =~ m{/$}); # self-closing
+ $nest += ($e->[000]) ? -1 : 1;
+ if ($nest <= 0)
+ {
+ die "assert </xxx>" unless $e->[000];
+ $e->[006] = $cur; # point back to opening tag
+ return $flat->[$cur][006] = $idx;
+ }
+ $idx = $e->[006]-1 if !$e->[000] && $e->[006]; # use caches for skipping forward.
+ }
+
+ # not all closed, but cannot go further
+ return $flat->[$cur][006] = $last_idx+1;
+}
+
sub find_meta_charset {
my $txtref = shift; # reference
while ( $$txtref =~ m{