forked from pool/perl-HTML-TableExtract
Accepting request 97856 from devel:languages:perl
- update to 2.11 - added parsing context, override for eof() and parse() for memory clear on new docs or post-eof() - fixed some long standing test warnings OBS-URL: https://build.opensuse.org/request/show/97856 OBS-URL: https://build.opensuse.org/package/show/openSUSE:Factory/perl-HTML-TableExtract?expand=0&rev=15
This commit is contained in:
@@ -1,6 +0,0 @@
|
||||
diff -ruN HTML-TableExtract-2.10-orig/t/gnarly.html HTML-TableExtract-2.10/t/gnarly.html
|
||||
--- HTML-TableExtract-2.10-orig/t/gnarly.html 2006-05-01 23:22:47.000000000 +0200
|
||||
+++ HTML-TableExtract-2.10/t/gnarly.html 2011-02-25 18:41:08.000000000 +0100
|
||||
@@ -1 +1 @@
|
||||
-<html><head><title>gnarly table</title></head><body><table border=1><tr><td colspan=4 rowspan=1>(0,0) [1,4]</td><td colspan=4 rowspan=2>(0,1) [2,4]</td></tr><tr><td colspan=1 rowspan=2>(1,0) [2,1]</td><td colspan=1 rowspan=1>(1,1) [1,1]</td><td colspan=2 rowspan=1>(1,2) [1,2]</td></tr><tr><td colspan=4 rowspan=2>(2,0) [2,4]</td><td colspan=2 rowspan=2>(2,1) [2,2]</td><td colspan=1 rowspan=1>(2,2) [1,1]</td></tr><tr><td colspan=1 rowspan=1>(3,0) [1,1]</td><td colspan=1 rowspan=1>(3,1) [1,1]</td></tr><tr><td colspan=2 rowspan=3>(4,0) [3,2]</td><td colspan=1 rowspan=1>(4,1) [1,1]</td><td colspan=1 rowspan=3>(4,2) [3,1]</td><td colspan=4 rowspan=4>(4,3) [4,4]</td></tr><tr><td colspan=1 rowspan=1>(5,0) [1,1]</td></tr><tr><td colspan=1 rowspan=1>(6,0) [1,1]</td></tr><tr><td colspan=4 rowspan=1>(7,0) [1,4]</td></tr></table></body></html>
|
||||
+<html><head><title>gnarly table</title></head><body><table border="1"><tr><td colspan="4" rowspan="1">(0,0) [1,4]</td><td colspan="4" rowspan="2">(0,1) [2,4]</td></tr><tr><td colspan="1" rowspan="2">(1,0) [2,1]</td><td colspan="1" rowspan="1">(1,1) [1,1]</td><td colspan="2" rowspan="1">(1,2) [1,2]</td></tr><tr><td colspan="4" rowspan="2">(2,0) [2,4]</td><td colspan="2" rowspan="2">(2,1) [2,2]</td><td colspan="1" rowspan="1">(2,2) [1,1]</td></tr><tr><td colspan="1" rowspan="1">(3,0) [1,1]</td><td colspan="1" rowspan="1">(3,1) [1,1]</td></tr><tr><td colspan="2" rowspan="3">(4,0) [3,2]</td><td colspan="1" rowspan="1">(4,1) [1,1]</td><td colspan="1" rowspan="3">(4,2) [3,1]</td><td colspan="4" rowspan="4">(4,3) [4,4]</td></tr><tr><td colspan="1" rowspan="1">(5,0) [1,1]</td></tr><tr><td colspan="1" rowspan="1">(6,0) [1,1]</td></tr><tr><td colspan="4" rowspan="1">(7,0) [1,4]</td></tr></table></body></html>
|
@@ -1,3 +0,0 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:26485211a78da74df7df5b1ebfd34cb017437179cf14937265b23959e1356d56
|
||||
size 23675
|
3
HTML-TableExtract-2.11.tar.gz
Normal file
3
HTML-TableExtract-2.11.tar.gz
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:1861d55a2aa1728ef56ea2d08d630b9a008456f1106994e4e49e76f56e4955ee
|
||||
size 27123
|
@@ -1,3 +1,11 @@
|
||||
-------------------------------------------------------------------
|
||||
Tue Dec 20 09:13:30 UTC 2011 - coolo@suse.com
|
||||
|
||||
- update to 2.11
|
||||
- added parsing context, override for eof() and parse() for
|
||||
memory clear on new docs or post-eof()
|
||||
- fixed some long standing test warnings
|
||||
|
||||
-------------------------------------------------------------------
|
||||
Fri Feb 25 17:51:03 UTC 2011 - chris@computersalat.de
|
||||
|
||||
|
@@ -16,24 +16,24 @@
|
||||
#
|
||||
|
||||
|
||||
|
||||
Name: perl-HTML-TableExtract
|
||||
Version: 2.10
|
||||
Release: 86
|
||||
License: GPL+ or Artistic
|
||||
Version: 2.11
|
||||
Release: 0
|
||||
%define cpan_name HTML-TableExtract
|
||||
Summary: For extracting the content contained in tables within an HTML document
|
||||
Url: http://search.cpan.org/dist/HTML-TableExtract/
|
||||
Summary: Perl module for extracting the content contained in tables within an HTML document
|
||||
License: GPL-1.0+ or Artistic-1.0
|
||||
Group: Development/Libraries/Perl
|
||||
#Source: http://www.cpan.org/authors/id/M/MS/MSISK/HTML-TableExtract-2.10.tar.gz
|
||||
Source: %{cpan_name}-%{version}.tar.bz2
|
||||
Patch0: %{cpan_name}-2.10-HTML.patch
|
||||
Url: http://search.cpan.org/dist/HTML-TableExtract/
|
||||
Source: http://www.cpan.org/authors/id/M/MS/MSISK/%{cpan_name}-%{version}.tar.gz
|
||||
BuildArch: noarch
|
||||
BuildRoot: %{_tmppath}/%{name}-%{version}-build
|
||||
BuildRequires: perl
|
||||
BuildRequires: perl-macros
|
||||
BuildRequires: perl(HTML::ElementTable) >= 1.16
|
||||
BuildRequires: perl(HTML::Parser)
|
||||
#BuildRequires: perl(HTML::Entities)
|
||||
#BuildRequires: perl(HTML::TableExtract)
|
||||
#BuildRequires: perl(testload)
|
||||
Requires: perl(HTML::ElementTable) >= 1.16
|
||||
Requires: perl(HTML::Parser)
|
||||
%{perl_requires}
|
||||
@@ -96,45 +96,9 @@ When extracting only text from tables, the text is decoded with
|
||||
HTML::Entities by default; this can be disabled by setting the _decode_
|
||||
parameter to 0.
|
||||
|
||||
Extraction Modes
|
||||
The default mode of extraction for HTML::TableExtract is raw text or
|
||||
HTML. In this mode, embedded tables are completely decoupled from one
|
||||
another. In this case, HTML::TableExtract is a subclass of
|
||||
HTML::Parser:
|
||||
|
||||
use HTML::TableExtract;
|
||||
|
||||
Alternatively, tables can be extracted as HTML::ElementTable
|
||||
structures, which are in turn embedded in an HTML::Element tree
|
||||
representing the entire HTML document. Embedded tables are not
|
||||
decoupled from one another since this tree structure must be
|
||||
maintained. In this case, HTML::TableExtract is a subclass of
|
||||
HTML::TreeBuilder (itself a subclass of HTML::Parser):
|
||||
|
||||
use HTML::TableExtract qw(tree);
|
||||
|
||||
In either case, the basic interface for HTML::TableExtract and the
|
||||
resulting table objects remains the same -- all that changes is what
|
||||
you can do with the resulting data.
|
||||
|
||||
HTML::TableExtract is a subclass of HTML::Parser, and as such inherits
|
||||
all of its basic methods such as 'parse()' and 'parse_file()'. During
|
||||
scans, 'start()', 'end()', and 'text()' are utilized. Feel free to
|
||||
override them, but if you do not eventually invoke them in the SUPER
|
||||
class with some content, results are not guaranteed.
|
||||
|
||||
Advice
|
||||
The main point of this module was to provide a flexible method of
|
||||
extracting tabular information from HTML documents without relying too
|
||||
heavily on the document layout. For that reason, I suggest using
|
||||
_Headers_ whenever possible -- that way, you are anchoring your
|
||||
extraction on what the document is trying to communicate rather than
|
||||
some feature of the HTML comprising the document (other than the fact
|
||||
that the data is contained in a table).
|
||||
|
||||
%prep
|
||||
%setup -q -n %{cpan_name}-%{version}
|
||||
%patch0 -p1
|
||||
find . -type f -print0 | xargs -0 chmod 644
|
||||
|
||||
%build
|
||||
%{__perl} Makefile.PL INSTALLDIRS=vendor
|
||||
@@ -148,11 +112,8 @@ Advice
|
||||
%perl_process_packlist
|
||||
%perl_gen_filelist
|
||||
|
||||
%clean
|
||||
%{__rm} -rf %{buildroot}
|
||||
|
||||
%files -f %{name}.files
|
||||
%defattr(644,root,root,755)
|
||||
%defattr(-,root,root,755)
|
||||
%doc Changes README
|
||||
|
||||
%changelog
|
||||
|
Reference in New Issue
Block a user