perl-HTML-TagParser/HTML-TagParser-0.16-subtree.diff

--- HTML-TagParser-0.16/t/12_navigation.t.orig	2008-07-11 01:34:16.000000000 +0200
+++ HTML-TagParser-0.16/t/12_navigation.t	2008-07-11 02:06:45.000000000 +0200
@@ -0,0 +1,40 @@
+# ----------------------------------------------------------------
+use strict;
+use Test::More tests => 8;
+BEGIN { use_ok('HTML::TagParser') };
+# ----------------------------------------------------------------
+
+my $SOURCE = <<EOT;
+<html>
+<body>
+<div id="foo">
+	<span>AAA</span>
+	<div id="bar"selected>
+		BBB
+		<span>CCC</span>
+		DDD
+		<div/>
+		EEE
+	</div>
+	<span>FFF</span>
+</div>
+</body>
+</html>
+EOT
+# ----------------------------------------------------------------
+
+my $document = HTML::TagParser->new( $SOURCE );
+ok( ref $document, "new()" );
+my $bar = $document->getElementById('bar');
+my $fff = $bar->nextSibling();
+like( $fff->innerText(), qr/FFF/s, "nextSibling" );
+is( $fff->nextSibling(), undef, "no nextSibling" );
+my $ch = $bar->childNodes();
+is( $#$ch, 1, "childNodes" );
+is( $ch->[1]->parentNode()->id(), "bar", "parentNode" );
+is( $ch->[1]->parentNode()->parentNode()->id(), "foo", "parent.parentNode" );
+is( $ch->[1]->parentNode()->parentNode()->parentNode->parentNode()->parentNode(), undef, "root parentNode" );
+
+# ----------------------------------------------------------------
+;1;
+# ----------------------------------------------------------------
--- HTML-TagParser-0.16/t/08_nest.t.orig	2006-05-05 21:14:24.000000000 +0200
+++ HTML-TagParser-0.16/t/08_nest.t	2008-07-11 01:29:17.000000000 +0200
@@ -28,12 +28,12 @@
 	like( $body->innerText(), qr/AAA.*BBB.*CCC.*DDD.*EEE.*FFF/s, "body" );

 	my $foo = $html->getElementById( "foo" );
-	like( $foo->innerText(), qr/AAA/s, "foo" );
-#	like( $foo->innerText(), qr/AAA.*BBB.*CCC.*DDD.*EEE.*FFF/s, "foo" );
+#	like( $foo->innerText(), qr/AAA/s, "foo" );
+	like( $foo->innerText(), qr/AAA.*BBB.*CCC.*DDD.*EEE.*FFF/s, "foo" );

 	my $bar = $html->getElementById( "bar" );
-	like( $bar->innerText(), qr/BBB.*CCC.*DDD\W*$/s, "bar" );
-#	like( $bar->innerText(), qr/BBB.*CCC.*DDD.*EEE/s, "bar" );
+#	like( $bar->innerText(), qr/BBB.*CCC.*DDD\W*$/s, "bar" );
+	like( $bar->innerText(), qr/BBB.*CCC.*DDD.*EEE/s, "bar" );
 # ----------------------------------------------------------------
 ;1;
 # ----------------------------------------------------------------
--- HTML-TagParser-0.16/lib/HTML/TagParser.pm.orig	2007-04-06 12:48:08.000000000 +0200
+++ HTML-TagParser-0.16/lib/HTML/TagParser.pm	2008-07-11 02:15:14.000000000 +0200
@@ -10,12 +10,15 @@
     my $elem = $html->getElementsByTagName( "title" );
     print "<title>", $elem->innerText(), "</title>\n" if ref $elem;

-Parse a HTML source and find its first <form action=""> attribute's value.
+Parse a HTML source and find its first <form action=""> attribute's value
+and find all input elements belonging to this form.

     my $src  = '<html><form action="hoge.cgi">...</form></html>';
     my $html = HTML::TagParser->new( $src );
     my $elem = $html->getElementsByTagName( "form" );
     print "<form action=\"", $elem->getAttribute("action"), "\">\n" if ref $elem;
+    my @first_inputs = $elem->subTree()->getElementsByTagName( "input" );
+    my $form = $first_inputs[0]->getParent();

 Fetch a HTML file via HTTP, and display its all <a> elements and attributes.

@@ -120,6 +123,43 @@

 This method returns $elem's innerText without tags.

+=head2 $subhtml = $elem->subTree();
+
+This method returns a new object of class HTML::Parser,
+with all the elements that are in the DOM hierarchy under $elem.
+
+=head2 $elem = $elem->nextSibling();
+
+This method returns the next sibling within the same parent.
+It returns undef when called on a closing tag or on the lastChild node
+of a parentNode.
+
+=head2 $elem = $elem->previousSibling();
+
+This method returns the previous sibling within the same parent.
+It returns undef when called on the firstChild node of a parentNode.
+
+=head2 $child_elem = $elem->firstChild();
+
+This method returns the first child node of $elem.
+It returns undef when called on a closing tag element or on a
+non-container or empty container element.
+
+=head2 $child_elems = $elem->childNodes();
+
+This method creates an array of all child nodes of $elem and returns the array by reference.
+It returns an empty array-ref [] whenever firstChild() would return undef.
+
+=head2 $child_elem = $elem->lastChild();
+
+This method returns the last child node of $elem.
+It returns undef whenever firstChild() would return undef.
+
+=head2 $parent = $elem->parentNode();
+
+This method returns the parent node of $elem.
+It returns undef when called on root nodes.
+
 =head2 $attr = $elem->attributes();

 This method returns a hash of $elem's all attributes.
@@ -128,6 +168,17 @@

 This method returns the value of $elem's attributes which name is $key.

+=head1 BUGS
+
+The HTML-Parser is simple. Methods innerText and subTree may be
+fooled by nested tags or embedded javascript code.
+
+The methods with 'Sibling', 'child' or 'Child' in their names do not cache their results.
+The most expensive ones are lastChild() and previousSibling().
+parentNode() is also expensive, but only once. It does caching.
+
+The DOM tree is read-only, as this is just a parser.
+
 =head1 INTERNATIONALIZATION

 This module natively understands the character encoding used in document
@@ -157,12 +208,21 @@
 use Carp;

 use vars qw( $VERSION );
-$VERSION = "0.16";
+$VERSION = "0.16.1";

 my $J2E        = {qw( jis ISO-2022-JP sjis Shift_JIS euc EUC-JP ucs2 UCS2 )};
 my $E2J        = { map { lc($_) } reverse %$J2E };
 my $SEC_OF_DAY = 60 * 60 * 24;

+#  [000]	'/' if closing tag.
+#  [001]	tagName
+#  [002]	attributes string (with trailing /, if self-closing tag).
+#  [003]	content until next (nested) tag.
+#  [004]	attributes hash cache.
+#  [005]	innerText combined strings cache.
+#  [006]	index of matching closing tag (or opening tag, if [000]=='/')
+#  [007]	index of parent (aka container) tag.
+#
 sub new {
     my $package = shift;
     my $src     = shift;
@@ -330,10 +390,10 @@
     return if ( defined $elem->[002] && $elem->[002] =~ m#/$# ); # <xxx/>

     my $tagname = $elem->[001];
+    my $closing = HTML::TagParser::Util::find_closing($flat, $cur);
     my $list    = [];
-    for ( ; $cur < $#$flat ; $cur++ ) {
+    for ( ; $cur < $closing ; $cur++ ) {
         push( @$list, $flat->[$cur]->[003] );
-        last if ( $flat->[ $cur + 1 ]->[001] eq $tagname );
     }
     my $text = join( "", grep { $_ ne "" } @$list );
     $text =~ s/^\s+//s;
@@ -342,6 +402,127 @@
     $elem->[005] = HTML::TagParser::Util::xml_unescape( $text );
 }

+sub subTree
+{
+    my $self = shift;
+    my ( $flat, $cur ) = @$self;
+    my $elem = $flat->[$cur];
+    return if $elem->[000];                         # </xxx>
+    my $closing = HTML::TagParser::Util::find_closing($flat, $cur);
+    my $list    = [];
+    while (++$cur < $closing)
+      {
+        push @$list, $flat->[$cur];
+      }
+
+    # allow the getElement...() methods on the returned object.
+    return bless { flat => $list }, 'HTML::TagParser';
+}
+
+
+sub nextSibling
+{
+    my $self = shift;
+    my ( $flat, $cur ) = @$self;
+    my $elem = $flat->[$cur];
+
+    return undef if $elem->[000];                         # </xxx>
+    my $closing = HTML::TagParser::Util::find_closing($flat, $cur);
+    my $next_s = $flat->[$closing+1];
+    return undef unless $next_s;
+    return undef if $next_s->[000];	# parent's </xxx>
+    return HTML::TagParser::Element->new( $flat, $closing+1 );
+}
+
+sub firstChild
+{
+    my $self = shift;
+    my ( $flat, $cur ) = @$self;
+    my $elem = $flat->[$cur];
+    return undef if $elem->[000];                         # </xxx>
+    my $closing = HTML::TagParser::Util::find_closing($flat, $cur);
+    return undef if $closing <= $cur+1;			# no children here.
+    return HTML::TagParser::Element->new( $flat, $cur+1 );
+}
+
+sub childNodes
+{
+    my $self = shift;
+    my ( $flat, $cur ) = @$self;
+    my $child = firstChild($self);
+    return [] unless $child;	# an empty array is easier for our callers than undef
+    my @c = ( $child );
+    while (defined ($child = nextSibling($child)))
+      {
+        push @c, $child;
+      }
+    return \@c;
+}
+
+sub lastChild
+{
+    my $c = childNodes(@_);
+    return undef unless $c->[0];
+    return $c->[-1];
+}
+
+sub previousSibling
+{
+    my $self = shift;
+    my ( $flat, $cur ) = @$self;
+
+    ## This one is expensive.
+    ## We use find_closing() which walks forward.
+    ## We'd need a find_opening() which walks backwards.
+    ## So we walk backwards one by one and consult find_closing()
+    ## until we find $cur-1 or $cur.
+
+    my $idx = $cur-1;
+    while ($idx >= 0)
+      {
+        if ($flat->[$idx][000] && defined($flat->[$idx][006]))
+	  {
+	    $idx = $flat->[$idx][006];	# use cache for backwards skipping
+	    next;
+	  }
+
+        my $closing = HTML::TagParser::Util::find_closing($flat, $idx);
+	return HTML::TagParser::Element->new( $flat, $idx )
+	  if defined $closing and ($closing == $cur || $closing == $cur-1);
+	$idx--;
+      }
+    return undef;
+}
+
+sub parentNode
+{
+    my $self = shift;
+    my ( $flat, $cur ) = @$self;
+
+    return HTML::TagParser::Element->new( $flat, $flat->[$cur][007]) if $flat->[$cur][007];	# cache
+
+    ##
+    ## This one is very expensive.
+    ## We use previousSibling() to walk backwards, and
+    ## previousSibling() is expensive.
+    ##
+    my $ps = $self;
+    my $first = $self;
+
+    while (defined($ps = previousSibling($ps))) { $first = $ps; }
+
+    my $parent = $first->[1] - 1;
+    return undef if $parent < 0;
+    die "parent too short" if HTML::TagParser::Util::find_closing($flat, $parent) <= $cur;
+
+    $flat->[$cur][007] = $parent;	# cache
+    return HTML::TagParser::Element->new( $flat, $parent )
+}
+
+##
+## feature:
+## self-closing tags have an additional attribute '/' => '/'.
+##
 sub attributes {
     my $self = shift;
     my ( $flat, $cur ) = @$self;
@@ -420,6 +601,66 @@
     $flat;
 }

+## returns 1 beyond the end, if not found.
+## returns undef if called on a </xxx> closing tag
+sub find_closing
+{
+  my ($flat, $cur) = @_;
+
+  return $flat->[$cur][006]        if   $flat->[$cur][006];	# cache
+  return $flat->[$cur][006] = $cur if (($flat->[$cur][002]||'') =~ m{/$});    # self-closing
+
+  my $name = $flat->[$cur][001];
+  my $pre_nest = 0;
+  ## count how many levels deep this type of tag is nested.
+  my $idx;
+  for ($idx = 0; $idx <= $cur; $idx++)
+    {
+      my $e = $flat->[$idx];
+      next unless   $e->[001] eq $name;
+      next if     (($e->[002]||'') =~ m{/$});	# self-closing
+      $pre_nest += ($e->[000]) ? -1 : 1;
+      $pre_nest = 0 if $pre_nest < 0;
+      $idx = $e->[006]-1 if !$e->[000] && $e->[006];	# use caches for skipping forward.
+    }
+  my $last_idx = $#$flat;
+
+  ## we move last_idx closer, in case this container
+  ## has not all its subcontainers closed properly.
+  my $post_nest = 0;
+  for ($idx = $last_idx; $idx > $cur; $idx--)
+    {
+      my $e = $flat->[$idx];
+      next unless    $e->[001] eq $name;
+      $last_idx = $idx-1;		# remember where a matching tag was
+      next if      (($e->[002]||'') =~ m{/$});	# self-closing
+      $post_nest -= ($e->[000]) ? -1 : 1;
+      $post_nest = 0 if $post_nest < 0;
+      last if $pre_nest <= $post_nest;
+      $idx = $e->[006]+1 if $e->[000] && defined $e->[006];	# use caches for skipping backwards.
+    }
+
+  my $nest = 1;		# we know it is not self-closing. start behind.
+
+  for ($idx = $cur+1; $idx <= $last_idx; $idx++)
+    {
+      my $e = $flat->[$idx];
+      next unless    $e->[001] eq $name;
+      next if      (($e->[002]||'') =~ m{/$});	# self-closing
+      $nest      += ($e->[000]) ? -1 : 1;
+      if ($nest <= 0)
+        {
+	  die "assert </xxx>" unless $e->[000];
+	  $e->[006] = $cur;	# point back to opening tag
+	  return $flat->[$cur][006] = $idx;
+	}
+      $idx = $e->[006]-1 if !$e->[000] && $e->[006];	# use caches for skipping forward.
+    }
+
+  # not all closed, but cannot go further
+  return $flat->[$cur][006] = $last_idx+1;
+}
+
 sub find_meta_charset {
     my $txtref = shift;    # reference
     while ( $$txtref =~ m{