", $elem->innerText(), "

++ AAA ++

++ BBB ++ CCC ++ DDD ++

++ EEE ++

++ FFF ++

++ ++ ++EOT ++# ---------------------------------------------------------------- ++ ++my $document = HTML::TagParser->new( $SOURCE ); ++ok( ref $document, "new()" ); ++my $bar = $document->getElementById('bar'); ++my $fff = $bar->nextSibling(); ++like( $fff->innerText(), qr/FFF/s, "nextSibling" ); ++is( $fff->nextSibling(), undef, "no nextSibling" ); ++my $ch = $bar->childNodes(); ++is( $#$ch, 1, "childNodes" ); ++is( $ch->[1]->parentNode()->id(), "bar", "parentNode" ); ++is( $ch->[1]->parentNode()->parentNode()->id(), "foo", "parent.parentNode" ); ++is( $ch->[1]->parentNode()->parentNode()->parentNode->parentNode()->parentNode(), undef, "root parentNode" ); ++ ++# ---------------------------------------------------------------- ++;1; ++# ---------------------------------------------------------------- +--- HTML-TagParser-0.16/t/08_nest.t.orig 2006-05-05 21:14:24.000000000 +0200 ++++ HTML-TagParser-0.16/t/08_nest.t 2008-07-11 01:29:17.000000000 +0200 +@@ -28,12 +28,12 @@ + like( $body->innerText(), qr/AAA.*BBB.*CCC.*DDD.*EEE.*FFF/s, "body" ); + + my $foo = $html->getElementById( "foo" ); +- like( $foo->innerText(), qr/AAA/s, "foo" ); +-# like( $foo->innerText(), qr/AAA.*BBB.*CCC.*DDD.*EEE.*FFF/s, "foo" ); ++# like( $foo->innerText(), qr/AAA/s, "foo" ); ++ like( $foo->innerText(), qr/AAA.*BBB.*CCC.*DDD.*EEE.*FFF/s, "foo" ); + + my $bar = $html->getElementById( "bar" ); +- like( $bar->innerText(), qr/BBB.*CCC.*DDD\W*$/s, "bar" ); +-# like( $bar->innerText(), qr/BBB.*CCC.*DDD.*EEE/s, "bar" ); ++# like( $bar->innerText(), qr/BBB.*CCC.*DDD\W*$/s, "bar" ); ++ like( $bar->innerText(), qr/BBB.*CCC.*DDD.*EEE/s, "bar" ); + # ---------------------------------------------------------------- + ;1; + # ---------------------------------------------------------------- +--- HTML-TagParser-0.16/lib/HTML/TagParser.pm.orig 2007-04-06 12:48:08.000000000 +0200 ++++ HTML-TagParser-0.16/lib/HTML/TagParser.pm 2008-07-11 02:15:14.000000000 +0200 +@@ -10,12 +10,15 @@ + my $elem = $html->getElementsByTagName( "title" ); + print "", $elem->innerText(), "\n" if ref $elem; + +-Parse a HTML source and find its first '; + my $html = HTML::TagParser->new( $src ); + my $elem = $html->getElementsByTagName( "form" ); + print "

getAttribute("action"), "\">\n" if ref $elem; ++ my @first_inputs = $elem->subTree()->getElementsByTagName( "input" ); ++ my $form = $first_inputs[0]->getParent(); + + Fetch a HTML file via HTTP, and display its all elements and attributes. + +@@ -120,6 +123,43 @@ + + This method returns $elem's innerText without tags. + ++=head2 $subhtml = $elem->subTree(); ++ ++This method returns a new object of class HTML::Parser, ++with all the elements that are in the DOM hierarchy under $elem. ++ ++=head2 $elem = $elem->nextSibling(); ++ ++This method returns the next sibling within the same parent. ++It returns undef when called on a closing tag or on the lastChild node ++of a parentNode. ++ ++=head2 $elem = $elem->previousSibling(); ++ ++This method returns the previous sibling within the same parent. ++It returns undef when called on the firstChild node of a parentNode. ++ ++=head2 $child_elem = $elem->firstChild(); ++ ++This method returns the first child node of $elem. ++It returns undef when called on a closing tag element or on a ++non-container or empty container element. ++ ++=head2 $child_elems = $elem->childNodes(); ++ ++This method creates an array of all child nodes of $elem and returns the array by reference. ++It returns an empty array-ref [] whenever firstChild() would return undef. ++ ++=head2 $child_elem = $elem->lastChild(); ++ ++This method returns the last child node of $elem. ++It returns undef whenever firstChild() would return undef. ++ ++=head2 $parent = $elem->parentNode(); ++ ++This method returns the parent node of $elem. ++It returns undef when called on root nodes. ++ + =head2 $attr = $elem->attributes(); + + This method returns a hash of $elem's all attributes. +@@ -128,6 +168,17 @@ + + This method returns the value of $elem's attributes which name is $key. + ++=head1 BUGS ++ ++The HTML-Parser is simple. Methods innerText and subTree may be ++fooled by nested tags or embedded javascript code. ++ ++The methods with 'Sibling', 'child' or 'Child' in their names do not cache their results. ++The most expensive ones are lastChild() and previousSibling(). ++parentNode() is also expensive, but only once. It does caching. ++ ++The DOM tree is read-only, as this is just a parser. ++ + =head1 INTERNATIONALIZATION + + This module natively understands the character encoding used in document +@@ -157,12 +208,21 @@ + use Carp; + + use vars qw( $VERSION ); +-$VERSION = "0.16"; ++$VERSION = "0.16.1"; + + my $J2E = {qw( jis ISO-2022-JP sjis Shift_JIS euc EUC-JP ucs2 UCS2 )}; + my $E2J = { map { lc($_) } reverse %$J2E }; + my $SEC_OF_DAY = 60 * 60 * 24; + ++# [000] '/' if closing tag. ++# [001] tagName ++# [002] attributes string (with trailing /, if self-closing tag). ++# [003] content until next (nested) tag. ++# [004] attributes hash cache. ++# [005] innerText combined strings cache. ++# [006] index of matching closing tag (or opening tag, if [000]=='/') ++# [007] index of parent (aka container) tag. ++# + sub new { + my $package = shift; + my $src = shift; +@@ -330,10 +390,10 @@ + return if ( defined $elem->[002] && $elem->[002] =~ m#/$# ); # + + my $tagname = $elem->[001]; ++ my $closing = HTML::TagParser::Util::find_closing($flat, $cur); + my $list = []; +- for ( ; $cur < $#$flat ; $cur++ ) { ++ for ( ; $cur < $closing ; $cur++ ) { + push( @$list, $flat->[$cur]->[003] ); +- last if ( $flat->[ $cur + 1 ]->[001] eq $tagname ); + } + my $text = join( "", grep { $_ ne "" } @$list ); + $text =~ s/^\s+//s; +@@ -342,6 +402,127 @@ + $elem->[005] = HTML::TagParser::Util::xml_unescape( $text ); + } + ++sub subTree ++{ ++ my $self = shift; ++ my ( $flat, $cur ) = @$self; ++ my $elem = $flat->[$cur]; ++ return if $elem->[000]; # ++ my $closing = HTML::TagParser::Util::find_closing($flat, $cur); ++ my $list = []; ++ while (++$cur < $closing) ++ { ++ push @$list, $flat->[$cur]; ++ } ++ ++ # allow the getElement...() methods on the returned object. ++ return bless { flat => $list }, 'HTML::TagParser'; ++} ++ ++ ++sub nextSibling ++{ ++ my $self = shift; ++ my ( $flat, $cur ) = @$self; ++ my $elem = $flat->[$cur]; ++ ++ return undef if $elem->[000]; # ++ my $closing = HTML::TagParser::Util::find_closing($flat, $cur); ++ my $next_s = $flat->[$closing+1]; ++ return undef unless $next_s; ++ return undef if $next_s->[000]; # parent's ++ return HTML::TagParser::Element->new( $flat, $closing+1 ); ++} ++ ++sub firstChild ++{ ++ my $self = shift; ++ my ( $flat, $cur ) = @$self; ++ my $elem = $flat->[$cur]; ++ return undef if $elem->[000]; # ++ my $closing = HTML::TagParser::Util::find_closing($flat, $cur); ++ return undef if $closing <= $cur+1; # no children here. ++ return HTML::TagParser::Element->new( $flat, $cur+1 ); ++} ++ ++sub childNodes ++{ ++ my $self = shift; ++ my ( $flat, $cur ) = @$self; ++ my $child = firstChild($self); ++ return [] unless $child; # an empty array is easier for our callers than undef ++ my @c = ( $child ); ++ while (defined ($child = nextSibling($child))) ++ { ++ push @c, $child; ++ } ++ return \@c; ++} ++ ++sub lastChild ++{ ++ my $c = childNodes(@_); ++ return undef unless $c->[0]; ++ return $c->[-1]; ++} ++ ++sub previousSibling ++{ ++ my $self = shift; ++ my ( $flat, $cur ) = @$self; ++ ++ ## This one is expensive. ++ ## We use find_closing() which walks forward. ++ ## We'd need a find_opening() which walks backwards. ++ ## So we walk backwards one by one and consult find_closing() ++ ## until we find $cur-1 or $cur. ++ ++ my $idx = $cur-1; ++ while ($idx >= 0) ++ { ++ if ($flat->[$idx][000] && defined($flat->[$idx][006])) ++ { ++ $idx = $flat->[$idx][006]; # use cache for backwards skipping ++ next; ++ } ++ ++ my $closing = HTML::TagParser::Util::find_closing($flat, $idx); ++ return HTML::TagParser::Element->new( $flat, $idx ) ++ if defined $closing and ($closing == $cur || $closing == $cur-1); ++ $idx--; ++ } ++ return undef; ++} ++ ++sub parentNode ++{ ++ my $self = shift; ++ my ( $flat, $cur ) = @$self; ++ ++ return HTML::TagParser::Element->new( $flat, $flat->[$cur][007]) if $flat->[$cur][007]; # cache ++ ++ ## ++ ## This one is very expensive. ++ ## We use previousSibling() to walk backwards, and ++ ## previousSibling() is expensive. ++ ## ++ my $ps = $self; ++ my $first = $self; ++ ++ while (defined($ps = previousSibling($ps))) { $first = $ps; } ++ ++ my $parent = $first->[1] - 1; ++ return undef if $parent < 0; ++ die "parent too short" if HTML::TagParser::Util::find_closing($flat, $parent) <= $cur; ++ ++ $flat->[$cur][007] = $parent; # cache ++ return HTML::TagParser::Element->new( $flat, $parent ) ++} ++ ++## ++## feature: ++## self-closing tags have an additional attribute '/' => '/'. ++## + sub attributes { + my $self = shift; + my ( $flat, $cur ) = @$self; +@@ -420,6 +601,66 @@ + $flat; + } + ++## returns 1 beyond the end, if not found. ++## returns undef if called on a closing tag ++sub find_closing ++{ ++ my ($flat, $cur) = @_; ++ ++ return $flat->[$cur][006] if $flat->[$cur][006]; # cache ++ return $flat->[$cur][006] = $cur if (($flat->[$cur][002]||'') =~ m{/$}); # self-closing ++ ++ my $name = $flat->[$cur][001]; ++ my $pre_nest = 0; ++ ## count how many levels deep this type of tag is nested. ++ my $idx; ++ for ($idx = 0; $idx <= $cur; $idx++) ++ { ++ my $e = $flat->[$idx]; ++ next unless $e->[001] eq $name; ++ next if (($e->[002]||'') =~ m{/$}); # self-closing ++ $pre_nest += ($e->[000]) ? -1 : 1; ++ $pre_nest = 0 if $pre_nest < 0; ++ $idx = $e->[006]-1 if !$e->[000] && $e->[006]; # use caches for skipping forward. ++ } ++ my $last_idx = $#$flat; ++ ++ ## we move last_idx closer, in case this container ++ ## has not all its subcontainers closed properly. ++ my $post_nest = 0; ++ for ($idx = $last_idx; $idx > $cur; $idx--) ++ { ++ my $e = $flat->[$idx]; ++ next unless $e->[001] eq $name; ++ $last_idx = $idx-1; # remember where a matching tag was ++ next if (($e->[002]||'') =~ m{/$}); # self-closing ++ $post_nest -= ($e->[000]) ? -1 : 1; ++ $post_nest = 0 if $post_nest < 0; ++ last if $pre_nest <= $post_nest; ++ $idx = $e->[006]+1 if $e->[000] && defined $e->[006]; # use caches for skipping backwards. ++ } ++ ++ my $nest = 1; # we know it is not self-closing. start behind. ++ ++ for ($idx = $cur+1; $idx <= $last_idx; $idx++) ++ { ++ my $e = $flat->[$idx]; ++ next unless $e->[001] eq $name; ++ next if (($e->[002]||'') =~ m{/$}); # self-closing ++ $nest += ($e->[000]) ? -1 : 1; ++ if ($nest <= 0) ++ { ++ die "assert " unless $e->[000]; ++ $e->[006] = $cur; # point back to opening tag ++ return $flat->[$cur][006] = $idx; ++ } ++ $idx = $e->[006]-1 if !$e->[000] && $e->[006]; # use caches for skipping forward. ++ } ++ ++ # not all closed, but cannot go further ++ return $flat->[$cur][006] = $last_idx+1; ++} ++ + sub find_meta_charset { + my $txtref = shift; # reference + while ( $$txtref =~ m{ diff --git a/HTML-TagParser-0.16.tar.bz2 b/HTML-TagParser-0.16.tar.bz2 new file mode 100644 index 0000000..8d1e527 --- /dev/null +++ b/HTML-TagParser-0.16.tar.bz2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:486f73a9e6727cd2169cf451e2ad50dcd9010a0366dc12786ca5c83624ea9711 +size 35210 diff --git a/parse.pl b/parse.pl new file mode 100644 index 0000000..8a8e10e --- /dev/null +++ b/parse.pl @@ -0,0 +1,76 @@ +#! /usr/bin/perl -w -t +# +# a small test script, that +# takes your favourite web pages and turns it into +# a parsed perl data structure. + +use strict; +use HTML::TagParser; +use Data::Dumper; +$Data::Dumper::Sortkeys = 1; + +my $text = eval { local $/; open my $in, "<", shift; <$in>}; + +my $dom = HTML::TagParser->new($text); +my $r = {}; +for my $e ($dom->getElementsByTagName('form')) + { + my $s = $e->subTree; + my $sr = {}; + for my $se ($s->getElementsByTagName('input')) + { + store_element($sr, $se, 'input', 0); + } + for my $se ($s->getElementsByTagName('textarea')) + { + store_element($sr, $se, 'textarea', 1); + } + + store_element($r, $e, 'form', $sr); + } + +for my $e ($dom->getElementsByTagName('a')) + { + my $attr = $e->attributes; + next unless defined $attr->{href}; + store_element($r, $e, 'href', 0); + } + +die Dumper $r; +exit 0; + +############################################## + +sub store_element +{ + my ($r, $e, $tn, $container) = @_; + my $attr = $e->attributes; + my $inp = + { + tagname => $e->tagName, + text => $e->innerText()||'', + }; + delete $inp->{text} unless length $inp->{text}; + delete $attr->{'/'}; + if ($container) + { + $inp->{attr} = $attr; + if (ref $container) + { + for my $c (keys %$container) { $inp->{$c} = $container->{$c} } + } + } + else + { + ## inline attr. + for my $a (keys %$attr) { $inp->{$a} = $attr->{$a} } + } + + push @{$r->{$tn}}, $inp; + for my $a qw(id name type) + { + push @{$r->{$tn.'_by_'.$a}{$attr->{$a}}}, $inp + if defined $attr->{$a}; + } +} + diff --git a/perl-HTML-TagParser.changes b/perl-HTML-TagParser.changes new file mode 100644 index 0000000..0946d8e --- /dev/null +++ b/perl-HTML-TagParser.changes @@ -0,0 +1,12 @@ +------------------------------------------------------------------- +Wed Jul 9 23:14:09 CEST 2008 - jw@suse.de + +- tweaked the t/08_nest.t to no longer expect + the errors that the old code used to make. + +------------------------------------------------------------------- +Wed Jul 9 00:57:57 CEST 2008 - jw@suse.de + +- initial checkin of upstream version 0.16 +- own subtree patch applied. + diff --git a/perl-HTML-TagParser.spec b/perl-HTML-TagParser.spec new file mode 100644 index 0000000..f41fe9b --- /dev/null +++ b/perl-HTML-TagParser.spec @@ -0,0 +1,76 @@ +# +# spec file for package perl-HTML-TagParser (Version 0.16.1) +# +# Copyright (c) 2007 SUSE LINUX Products GmbH, Nuernberg, Germany. +# This file and all modifications and additions to the pristine +# package are under the same license as the package itself. +# +# Please submit bugfixes or comments via http://bugs.opensuse.org/ +# + +# norootforbuild + +%define oversion 0.16 + +Name: perl-HTML-TagParser +URL: http://search.cpan.org/~kawasaki/HTML-TagParser-%{oversion}/ +License: GPL, Artistic License +Group: Development/Languages/Perl +Requires: perl = %{perl_version} +Autoreqprov: on +Summary: Yet another HTML document parser with DOM-like methods +Packager: jw@suse.de + +Version: 0.16.1 +Release: 1 +Source: HTML-TagParser-%{oversion}.tar.bz2 +Patch: HTML-TagParser-%{oversion}-subtree.diff +BuildRoot: %{_tmppath}/%{name}-%{version}-build + +%description +HTML::TagParser - Yet another HTML document parser with DOM-like methods. +Patched version with improved container detection, added subtree extraction and +DOM-like navigation methods. + +Authors: +-------- +Yusuke Kawasaki + +Contributors +------------ +Juergen Weigert + + +%prep +%setup -n HTML-TagParser-%{oversion} +%patch -p1 + +%build +perl Makefile.PL +make +make test + +%install +chmod a-x README +make DESTDIR=$RPM_BUILD_ROOT install_vendor +%if 0%{?suse_version} +%perl_process_packlist +%else +rm -f $RPM_BUILD_ROOT/%{perl_archlib}/perllocal.pod +%endif + +%clean +rm -rf $RPM_BUILD_ROOT + +%files +%defattr(-,root,root) +%doc MANIFEST Changes README +%{perl_vendorlib}/HTML +%dir %{perl_vendorarch}/auto/HTML-TagParser +%{perl_vendorarch}/auto/HTML-TagParser/.packlist +%{_mandir}/man3/HTML::TagParser*.3pm* +%if 0%{?suse_version} +/var/adm/perl-modules/perl-HTML-TagParser +%endif + +%changelog