forked from pool/perl-HTML-TableExtract
cpanspec, noarch pkg
OBS-URL: https://build.opensuse.org/package/show/devel:languages:perl/perl-HTML-TableExtract?expand=0&rev=20
This commit is contained in:
committed by
Git OBS Bridge
parent
f8356d3ca3
commit
08f522976f
6
HTML-TableExtract-2.10-HTML.patch
Normal file
6
HTML-TableExtract-2.10-HTML.patch
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
diff -ruN HTML-TableExtract-2.10-orig/t/gnarly.html HTML-TableExtract-2.10/t/gnarly.html
|
||||||
|
--- HTML-TableExtract-2.10-orig/t/gnarly.html 2006-05-01 23:22:47.000000000 +0200
|
||||||
|
+++ HTML-TableExtract-2.10/t/gnarly.html 2011-02-25 18:41:08.000000000 +0100
|
||||||
|
@@ -1 +1 @@
|
||||||
|
-<html><head><title>gnarly table</title></head><body><table border=1><tr><td colspan=4 rowspan=1>(0,0) [1,4]</td><td colspan=4 rowspan=2>(0,1) [2,4]</td></tr><tr><td colspan=1 rowspan=2>(1,0) [2,1]</td><td colspan=1 rowspan=1>(1,1) [1,1]</td><td colspan=2 rowspan=1>(1,2) [1,2]</td></tr><tr><td colspan=4 rowspan=2>(2,0) [2,4]</td><td colspan=2 rowspan=2>(2,1) [2,2]</td><td colspan=1 rowspan=1>(2,2) [1,1]</td></tr><tr><td colspan=1 rowspan=1>(3,0) [1,1]</td><td colspan=1 rowspan=1>(3,1) [1,1]</td></tr><tr><td colspan=2 rowspan=3>(4,0) [3,2]</td><td colspan=1 rowspan=1>(4,1) [1,1]</td><td colspan=1 rowspan=3>(4,2) [3,1]</td><td colspan=4 rowspan=4>(4,3) [4,4]</td></tr><tr><td colspan=1 rowspan=1>(5,0) [1,1]</td></tr><tr><td colspan=1 rowspan=1>(6,0) [1,1]</td></tr><tr><td colspan=4 rowspan=1>(7,0) [1,4]</td></tr></table></body></html>
|
||||||
|
+<html><head><title>gnarly table</title></head><body><table border="1"><tr><td colspan="4" rowspan="1">(0,0) [1,4]</td><td colspan="4" rowspan="2">(0,1) [2,4]</td></tr><tr><td colspan="1" rowspan="2">(1,0) [2,1]</td><td colspan="1" rowspan="1">(1,1) [1,1]</td><td colspan="2" rowspan="1">(1,2) [1,2]</td></tr><tr><td colspan="4" rowspan="2">(2,0) [2,4]</td><td colspan="2" rowspan="2">(2,1) [2,2]</td><td colspan="1" rowspan="1">(2,2) [1,1]</td></tr><tr><td colspan="1" rowspan="1">(3,0) [1,1]</td><td colspan="1" rowspan="1">(3,1) [1,1]</td></tr><tr><td colspan="2" rowspan="3">(4,0) [3,2]</td><td colspan="1" rowspan="1">(4,1) [1,1]</td><td colspan="1" rowspan="3">(4,2) [3,1]</td><td colspan="4" rowspan="4">(4,3) [4,4]</td></tr><tr><td colspan="1" rowspan="1">(5,0) [1,1]</td></tr><tr><td colspan="1" rowspan="1">(6,0) [1,1]</td></tr><tr><td colspan="4" rowspan="1">(7,0) [1,4]</td></tr></table></body></html>
|
@@ -1,3 +1,11 @@
|
|||||||
|
-------------------------------------------------------------------
|
||||||
|
Fri Feb 25 17:51:03 UTC 2011 - chris@computersalat.de
|
||||||
|
|
||||||
|
- recreated by cpanspec 1.78.03
|
||||||
|
o fix deps
|
||||||
|
- add HTML patch
|
||||||
|
- noarch pkg
|
||||||
|
|
||||||
-------------------------------------------------------------------
|
-------------------------------------------------------------------
|
||||||
Wed Dec 1 13:27:30 UTC 2010 - coolo@novell.com
|
Wed Dec 1 13:27:30 UTC 2010 - coolo@novell.com
|
||||||
|
|
||||||
|
@@ -15,58 +15,142 @@
|
|||||||
# Please submit bugfixes or comments via http://bugs.opensuse.org/
|
# Please submit bugfixes or comments via http://bugs.opensuse.org/
|
||||||
#
|
#
|
||||||
|
|
||||||
# norootforbuild
|
|
||||||
|
|
||||||
|
|
||||||
Name: perl-HTML-TableExtract
|
Name: perl-HTML-TableExtract
|
||||||
Url: http://cpan.org/modules/by-module/HTML/
|
|
||||||
License: Public Domain, Freeware
|
|
||||||
Group: Development/Libraries/Perl
|
|
||||||
AutoReqProv: on
|
|
||||||
Requires: perl-HTML-Parser
|
|
||||||
BuildRequires: perl-HTML-Parser
|
|
||||||
BuildRequires: perl-macros
|
|
||||||
# Needed only for tests:
|
|
||||||
BuildRequires: perl-HTML-Tree perl-Test-Pod-Coverage
|
|
||||||
Summary: Simplifies extraction of information within tables in HTML documents
|
|
||||||
Version: 2.10
|
Version: 2.10
|
||||||
Release: 81
|
Release: 82
|
||||||
Source: HTML-TableExtract-%{version}.tar.bz2
|
License: GPL+ or Artistic
|
||||||
|
%define cpan_name HTML-TableExtract
|
||||||
|
Summary: For extracting the content contained in tables within an HTML document
|
||||||
|
Url: http://search.cpan.org/dist/HTML-TableExtract/
|
||||||
|
Group: Development/Libraries/Perl
|
||||||
|
#Source: http://www.cpan.org/authors/id/M/MS/MSISK/HTML-TableExtract-2.10.tar.gz
|
||||||
|
Source: %{cpan_name}-%{version}.tar.bz2
|
||||||
|
Patch0: %{cpan_name}-2.10-HTML.patch
|
||||||
|
BuildArch: noarch
|
||||||
BuildRoot: %{_tmppath}/%{name}-%{version}-build
|
BuildRoot: %{_tmppath}/%{name}-%{version}-build
|
||||||
|
BuildRequires: perl
|
||||||
|
BuildRequires: perl-macros
|
||||||
|
BuildRequires: perl(HTML::ElementTable) >= 1.16
|
||||||
|
BuildRequires: perl(HTML::Parser)
|
||||||
|
Requires: perl(HTML::ElementTable) >= 1.16
|
||||||
|
Requires: perl(HTML::Parser)
|
||||||
%{perl_requires}
|
%{perl_requires}
|
||||||
|
|
||||||
%description
|
%description
|
||||||
HTML::TableExtract is a module that simplifies the extraction of
|
HTML::TableExtract is a subclass of HTML::Parser that serves to extract the
|
||||||
information contained in tables within HTML documents.
|
information from tables of interest contained within an HTML document. The
|
||||||
|
information from each extracted table is stored in table objects. Tables
|
||||||
|
can be extracted as text, HTML, or HTML::ElementTable structures (for
|
||||||
|
in-place editing or manipulation).
|
||||||
|
|
||||||
Tables of note may be specified using Headers, Depth, Count,
|
There are currently four constraints available to specify which tables you
|
||||||
Attributes, or some combination of the three. See the module
|
would like to extract from a document: _Headers_, _Depth_, _Count_, and
|
||||||
documentation for details.
|
_Attributes_.
|
||||||
|
|
||||||
|
_Headers_, the most flexible and adaptive of the techniques, involves
|
||||||
|
specifying text in an array that you expect to appear above the data in the
|
||||||
|
tables of interest. Once all headers have been located in a row of that
|
||||||
|
table, all further cells beneath the columns that matched your headers are
|
||||||
|
extracted. All other columns are ignored: think of it as vertical slices
|
||||||
|
through a table. In addition, TableExtract automatically rearranges each
|
||||||
|
row in the same order as the headers you provided. If you would like to
|
||||||
|
disable this, set _automap_ to 0 during object creation, and instead rely
|
||||||
|
on the column_map() method to find out the order in which the headers were
|
||||||
|
found. Furthermore, TableExtract will automatically compensate for cell
|
||||||
|
span issues so that columns are really the same columns as you would
|
||||||
|
visually see in a browser. This behavior can be disabled by setting the
|
||||||
|
_gridmap_ parameter to 0. HTML is stripped from the entire textual content
|
||||||
|
of a cell before header matches are attempted -- unless the _keep_html_
|
||||||
|
parameter was enabled.
|
||||||
|
|
||||||
|
_Depth_ and _Count_ are more specific ways to specify tables in relation to
|
||||||
|
one another. _Depth_ represents how deeply a table resides in other tables.
|
||||||
|
The depth of a top-level table in the document is 0. A table within a
|
||||||
|
top-level table has a depth of 1, and so on. Each depth can be thought of
|
||||||
|
as a layer; tables sharing the same depth are on the same layer. Within
|
||||||
|
each of these layers, _Count_ represents the order in which a table was
|
||||||
|
seen at that depth, starting with 0. Providing both a _depth_ and a _count_
|
||||||
|
will uniquely specify a table within a document.
|
||||||
|
|
||||||
|
_Attributes_ match based on the attributes of the html <table> tag, for
|
||||||
|
example, border widths or background color.
|
||||||
|
|
||||||
|
Each of the _Headers_, _Depth_, _Count_, and _Attributes_ specifications
|
||||||
|
are cumulative in their effect on the overall extraction. For instance, if
|
||||||
|
you specify only a _Depth_, then you get all tables at that depth (note
|
||||||
|
that these could very well reside in separate higher-level tables
|
||||||
|
throughout the document since depth extends across tables). If you specify
|
||||||
|
only a _Count_, then the tables at that _Count_ from all depths are
|
||||||
|
returned (i.e., the _n_th occurrence of a table at each depth). If you only
|
||||||
|
specify _Headers_, then you get all tables in the document containing those
|
||||||
|
column headers. If you have specified multiple constraints of _Headers_,
|
||||||
|
_Depth_, _Count_, and _Attributes_, then each constraint has veto power
|
||||||
|
over whether a particular table is extracted.
|
||||||
|
|
||||||
|
If no _Headers_, _Depth_, _Count_, or _Attributes_ are specified, then all
|
||||||
|
tables match.
|
||||||
|
|
||||||
|
When extracting only text from tables, the text is decoded with
|
||||||
|
HTML::Entities by default; this can be disabled by setting the _decode_
|
||||||
|
parameter to 0.
|
||||||
|
|
||||||
|
Extraction Modes
|
||||||
|
The default mode of extraction for HTML::TableExtract is raw text or
|
||||||
|
HTML. In this mode, embedded tables are completely decoupled from one
|
||||||
|
another. In this case, HTML::TableExtract is a subclass of
|
||||||
|
HTML::Parser:
|
||||||
|
|
||||||
|
use HTML::TableExtract;
|
||||||
|
|
||||||
|
Alternatively, tables can be extracted as HTML::ElementTable
|
||||||
|
structures, which are in turn embedded in an HTML::Element tree
|
||||||
|
representing the entire HTML document. Embedded tables are not
|
||||||
|
decoupled from one another since this tree structure must be
|
||||||
|
maintained. In this case, HTML::TableExtract is a subclass of
|
||||||
|
HTML::TreeBuilder (itself a subclass of HTML::Parser):
|
||||||
|
|
||||||
|
use HTML::TableExtract qw(tree);
|
||||||
|
|
||||||
|
In either case, the basic interface for HTML::TableExtract and the
|
||||||
|
resulting table objects remains the same -- all that changes is what
|
||||||
|
you can do with the resulting data.
|
||||||
|
|
||||||
|
HTML::TableExtract is a subclass of HTML::Parser, and as such inherits
|
||||||
|
all of its basic methods such as 'parse()' and 'parse_file()'. During
|
||||||
|
scans, 'start()', 'end()', and 'text()' are utilized. Feel free to
|
||||||
|
override them, but if you do not eventually invoke them in the SUPER
|
||||||
|
class with some content, results are not guaranteed.
|
||||||
|
|
||||||
|
Advice
|
||||||
|
The main point of this module was to provide a flexible method of
|
||||||
|
extracting tabular information from HTML documents without relying too
|
||||||
|
heavily on the document layout. For that reason, I suggest using
|
||||||
|
_Headers_ whenever possible -- that way, you are anchoring your
|
||||||
|
extraction on what the document is trying to communicate rather than
|
||||||
|
some feature of the HTML comprising the document (other than the fact
|
||||||
|
that the data is contained in a table).
|
||||||
|
|
||||||
%prep
|
%prep
|
||||||
%setup -q -n HTML-TableExtract-%{version}
|
%setup -q -n %{cpan_name}-%{version}
|
||||||
|
%patch0 -p1
|
||||||
|
|
||||||
%build
|
%build
|
||||||
perl Makefile.PL
|
%{__perl} Makefile.PL INSTALLDIRS=vendor
|
||||||
make %{?_smp_mflags}
|
%{__make} %{?_smp_mflags}
|
||||||
|
|
||||||
%check
|
%check
|
||||||
make test
|
%{__make} test
|
||||||
|
|
||||||
%install
|
%install
|
||||||
make DESTDIR=$RPM_BUILD_ROOT install_vendor
|
%perl_make_install
|
||||||
%perl_process_packlist
|
%perl_process_packlist
|
||||||
|
%perl_gen_filelist
|
||||||
|
|
||||||
%clean
|
%clean
|
||||||
[ "$RPM_BUILD_ROOT" != "/" ] && [ -d $RPM_BUILD_ROOT ] && rm -rf $RPM_BUILD_ROOT
|
%{__rm} -rf %{buildroot}
|
||||||
|
|
||||||
%files
|
%files -f %{name}.files
|
||||||
%defattr(-,root,root)
|
%defattr(644,root,root,755)
|
||||||
%doc Changes MANIFEST README
|
%doc Changes README
|
||||||
%doc %{_mandir}/man?/*
|
|
||||||
%{perl_vendorlib}/HTML
|
|
||||||
%{perl_vendorarch}/auto/HTML-TableExtract
|
|
||||||
|
|
||||||
%changelog
|
%changelog
|
||||||
|
Reference in New Issue
Block a user