forked from pool/perl-HTML-TableExtract
Accepting request 97856 from devel:languages:perl
- update to 2.11 - added parsing context, override for eof() and parse() for memory clear on new docs or post-eof() - fixed some long standing test warnings OBS-URL: https://build.opensuse.org/request/show/97856 OBS-URL: https://build.opensuse.org/package/show/openSUSE:Factory/perl-HTML-TableExtract?expand=0&rev=15
This commit is contained in:
@@ -1,6 +0,0 @@
|
|||||||
diff -ruN HTML-TableExtract-2.10-orig/t/gnarly.html HTML-TableExtract-2.10/t/gnarly.html
|
|
||||||
--- HTML-TableExtract-2.10-orig/t/gnarly.html 2006-05-01 23:22:47.000000000 +0200
|
|
||||||
+++ HTML-TableExtract-2.10/t/gnarly.html 2011-02-25 18:41:08.000000000 +0100
|
|
||||||
@@ -1 +1 @@
|
|
||||||
-<html><head><title>gnarly table</title></head><body><table border=1><tr><td colspan=4 rowspan=1>(0,0) [1,4]</td><td colspan=4 rowspan=2>(0,1) [2,4]</td></tr><tr><td colspan=1 rowspan=2>(1,0) [2,1]</td><td colspan=1 rowspan=1>(1,1) [1,1]</td><td colspan=2 rowspan=1>(1,2) [1,2]</td></tr><tr><td colspan=4 rowspan=2>(2,0) [2,4]</td><td colspan=2 rowspan=2>(2,1) [2,2]</td><td colspan=1 rowspan=1>(2,2) [1,1]</td></tr><tr><td colspan=1 rowspan=1>(3,0) [1,1]</td><td colspan=1 rowspan=1>(3,1) [1,1]</td></tr><tr><td colspan=2 rowspan=3>(4,0) [3,2]</td><td colspan=1 rowspan=1>(4,1) [1,1]</td><td colspan=1 rowspan=3>(4,2) [3,1]</td><td colspan=4 rowspan=4>(4,3) [4,4]</td></tr><tr><td colspan=1 rowspan=1>(5,0) [1,1]</td></tr><tr><td colspan=1 rowspan=1>(6,0) [1,1]</td></tr><tr><td colspan=4 rowspan=1>(7,0) [1,4]</td></tr></table></body></html>
|
|
||||||
+<html><head><title>gnarly table</title></head><body><table border="1"><tr><td colspan="4" rowspan="1">(0,0) [1,4]</td><td colspan="4" rowspan="2">(0,1) [2,4]</td></tr><tr><td colspan="1" rowspan="2">(1,0) [2,1]</td><td colspan="1" rowspan="1">(1,1) [1,1]</td><td colspan="2" rowspan="1">(1,2) [1,2]</td></tr><tr><td colspan="4" rowspan="2">(2,0) [2,4]</td><td colspan="2" rowspan="2">(2,1) [2,2]</td><td colspan="1" rowspan="1">(2,2) [1,1]</td></tr><tr><td colspan="1" rowspan="1">(3,0) [1,1]</td><td colspan="1" rowspan="1">(3,1) [1,1]</td></tr><tr><td colspan="2" rowspan="3">(4,0) [3,2]</td><td colspan="1" rowspan="1">(4,1) [1,1]</td><td colspan="1" rowspan="3">(4,2) [3,1]</td><td colspan="4" rowspan="4">(4,3) [4,4]</td></tr><tr><td colspan="1" rowspan="1">(5,0) [1,1]</td></tr><tr><td colspan="1" rowspan="1">(6,0) [1,1]</td></tr><tr><td colspan="4" rowspan="1">(7,0) [1,4]</td></tr></table></body></html>
|
|
@@ -1,3 +0,0 @@
|
|||||||
version https://git-lfs.github.com/spec/v1
|
|
||||||
oid sha256:26485211a78da74df7df5b1ebfd34cb017437179cf14937265b23959e1356d56
|
|
||||||
size 23675
|
|
3
HTML-TableExtract-2.11.tar.gz
Normal file
3
HTML-TableExtract-2.11.tar.gz
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:1861d55a2aa1728ef56ea2d08d630b9a008456f1106994e4e49e76f56e4955ee
|
||||||
|
size 27123
|
@@ -1,3 +1,11 @@
|
|||||||
|
-------------------------------------------------------------------
|
||||||
|
Tue Dec 20 09:13:30 UTC 2011 - coolo@suse.com
|
||||||
|
|
||||||
|
- update to 2.11
|
||||||
|
- added parsing context, override for eof() and parse() for
|
||||||
|
memory clear on new docs or post-eof()
|
||||||
|
- fixed some long standing test warnings
|
||||||
|
|
||||||
-------------------------------------------------------------------
|
-------------------------------------------------------------------
|
||||||
Fri Feb 25 17:51:03 UTC 2011 - chris@computersalat.de
|
Fri Feb 25 17:51:03 UTC 2011 - chris@computersalat.de
|
||||||
|
|
||||||
|
@@ -16,24 +16,24 @@
|
|||||||
#
|
#
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Name: perl-HTML-TableExtract
|
Name: perl-HTML-TableExtract
|
||||||
Version: 2.10
|
Version: 2.11
|
||||||
Release: 86
|
Release: 0
|
||||||
License: GPL+ or Artistic
|
|
||||||
%define cpan_name HTML-TableExtract
|
%define cpan_name HTML-TableExtract
|
||||||
Summary: For extracting the content contained in tables within an HTML document
|
Summary: Perl module for extracting the content contained in tables within an HTM[cut]
|
||||||
Url: http://search.cpan.org/dist/HTML-TableExtract/
|
License: GPL-1.0+ or Artistic-1.0
|
||||||
Group: Development/Libraries/Perl
|
Group: Development/Libraries/Perl
|
||||||
#Source: http://www.cpan.org/authors/id/M/MS/MSISK/HTML-TableExtract-2.10.tar.gz
|
Url: http://search.cpan.org/dist/HTML-TableExtract/
|
||||||
Source: %{cpan_name}-%{version}.tar.bz2
|
Source: http://www.cpan.org/authors/id/M/MS/MSISK/%{cpan_name}-%{version}.tar.gz
|
||||||
Patch0: %{cpan_name}-2.10-HTML.patch
|
|
||||||
BuildArch: noarch
|
BuildArch: noarch
|
||||||
BuildRoot: %{_tmppath}/%{name}-%{version}-build
|
BuildRoot: %{_tmppath}/%{name}-%{version}-build
|
||||||
BuildRequires: perl
|
BuildRequires: perl
|
||||||
BuildRequires: perl-macros
|
BuildRequires: perl-macros
|
||||||
BuildRequires: perl(HTML::ElementTable) >= 1.16
|
BuildRequires: perl(HTML::ElementTable) >= 1.16
|
||||||
BuildRequires: perl(HTML::Parser)
|
BuildRequires: perl(HTML::Parser)
|
||||||
|
#BuildRequires: perl(HTML::Entities)
|
||||||
|
#BuildRequires: perl(HTML::TableExtract)
|
||||||
|
#BuildRequires: perl(testload)
|
||||||
Requires: perl(HTML::ElementTable) >= 1.16
|
Requires: perl(HTML::ElementTable) >= 1.16
|
||||||
Requires: perl(HTML::Parser)
|
Requires: perl(HTML::Parser)
|
||||||
%{perl_requires}
|
%{perl_requires}
|
||||||
@@ -96,45 +96,9 @@ When extracting only text from tables, the text is decoded with
|
|||||||
HTML::Entities by default; this can be disabled by setting the _decode_
|
HTML::Entities by default; this can be disabled by setting the _decode_
|
||||||
parameter to 0.
|
parameter to 0.
|
||||||
|
|
||||||
Extraction Modes
|
|
||||||
The default mode of extraction for HTML::TableExtract is raw text or
|
|
||||||
HTML. In this mode, embedded tables are completely decoupled from one
|
|
||||||
another. In this case, HTML::TableExtract is a subclass of
|
|
||||||
HTML::Parser:
|
|
||||||
|
|
||||||
use HTML::TableExtract;
|
|
||||||
|
|
||||||
Alternatively, tables can be extracted as HTML::ElementTable
|
|
||||||
structures, which are in turn embedded in an HTML::Element tree
|
|
||||||
representing the entire HTML document. Embedded tables are not
|
|
||||||
decoupled from one another since this tree structure must be
|
|
||||||
maintained. In this case, HTML::TableExtract is a subclass of
|
|
||||||
HTML::TreeBuilder (itself a subclass of HTML::Parser):
|
|
||||||
|
|
||||||
use HTML::TableExtract qw(tree);
|
|
||||||
|
|
||||||
In either case, the basic interface for HTML::TableExtract and the
|
|
||||||
resulting table objects remains the same -- all that changes is what
|
|
||||||
you can do with the resulting data.
|
|
||||||
|
|
||||||
HTML::TableExtract is a subclass of HTML::Parser, and as such inherits
|
|
||||||
all of its basic methods such as 'parse()' and 'parse_file()'. During
|
|
||||||
scans, 'start()', 'end()', and 'text()' are utilized. Feel free to
|
|
||||||
override them, but if you do not eventually invoke them in the SUPER
|
|
||||||
class with some content, results are not guaranteed.
|
|
||||||
|
|
||||||
Advice
|
|
||||||
The main point of this module was to provide a flexible method of
|
|
||||||
extracting tabular information from HTML documents without relying too
|
|
||||||
heavily on the document layout. For that reason, I suggest using
|
|
||||||
_Headers_ whenever possible -- that way, you are anchoring your
|
|
||||||
extraction on what the document is trying to communicate rather than
|
|
||||||
some feature of the HTML comprising the document (other than the fact
|
|
||||||
that the data is contained in a table).
|
|
||||||
|
|
||||||
%prep
|
%prep
|
||||||
%setup -q -n %{cpan_name}-%{version}
|
%setup -q -n %{cpan_name}-%{version}
|
||||||
%patch0 -p1
|
find . -type f -print0 | xargs -0 chmod 644
|
||||||
|
|
||||||
%build
|
%build
|
||||||
%{__perl} Makefile.PL INSTALLDIRS=vendor
|
%{__perl} Makefile.PL INSTALLDIRS=vendor
|
||||||
@@ -148,11 +112,8 @@ Advice
|
|||||||
%perl_process_packlist
|
%perl_process_packlist
|
||||||
%perl_gen_filelist
|
%perl_gen_filelist
|
||||||
|
|
||||||
%clean
|
|
||||||
%{__rm} -rf %{buildroot}
|
|
||||||
|
|
||||||
%files -f %{name}.files
|
%files -f %{name}.files
|
||||||
%defattr(644,root,root,755)
|
%defattr(-,root,root,755)
|
||||||
%doc Changes README
|
%doc Changes README
|
||||||
|
|
||||||
%changelog
|
%changelog
|
||||||
|
Reference in New Issue
Block a user