forked from pool/perl-HTML-TableExtract
cpanspec, noarch pkg
OBS-URL: https://build.opensuse.org/package/show/devel:languages:perl/perl-HTML-TableExtract?expand=0&rev=20
This commit is contained in:
committed by
Git OBS Bridge
parent
f8356d3ca3
commit
08f522976f
6
HTML-TableExtract-2.10-HTML.patch
Normal file
6
HTML-TableExtract-2.10-HTML.patch
Normal file
@@ -0,0 +1,6 @@
|
||||
diff -ruN HTML-TableExtract-2.10-orig/t/gnarly.html HTML-TableExtract-2.10/t/gnarly.html
|
||||
--- HTML-TableExtract-2.10-orig/t/gnarly.html 2006-05-01 23:22:47.000000000 +0200
|
||||
+++ HTML-TableExtract-2.10/t/gnarly.html 2011-02-25 18:41:08.000000000 +0100
|
||||
@@ -1 +1 @@
|
||||
-<html><head><title>gnarly table</title></head><body><table border=1><tr><td colspan=4 rowspan=1>(0,0) [1,4]</td><td colspan=4 rowspan=2>(0,1) [2,4]</td></tr><tr><td colspan=1 rowspan=2>(1,0) [2,1]</td><td colspan=1 rowspan=1>(1,1) [1,1]</td><td colspan=2 rowspan=1>(1,2) [1,2]</td></tr><tr><td colspan=4 rowspan=2>(2,0) [2,4]</td><td colspan=2 rowspan=2>(2,1) [2,2]</td><td colspan=1 rowspan=1>(2,2) [1,1]</td></tr><tr><td colspan=1 rowspan=1>(3,0) [1,1]</td><td colspan=1 rowspan=1>(3,1) [1,1]</td></tr><tr><td colspan=2 rowspan=3>(4,0) [3,2]</td><td colspan=1 rowspan=1>(4,1) [1,1]</td><td colspan=1 rowspan=3>(4,2) [3,1]</td><td colspan=4 rowspan=4>(4,3) [4,4]</td></tr><tr><td colspan=1 rowspan=1>(5,0) [1,1]</td></tr><tr><td colspan=1 rowspan=1>(6,0) [1,1]</td></tr><tr><td colspan=4 rowspan=1>(7,0) [1,4]</td></tr></table></body></html>
|
||||
+<html><head><title>gnarly table</title></head><body><table border="1"><tr><td colspan="4" rowspan="1">(0,0) [1,4]</td><td colspan="4" rowspan="2">(0,1) [2,4]</td></tr><tr><td colspan="1" rowspan="2">(1,0) [2,1]</td><td colspan="1" rowspan="1">(1,1) [1,1]</td><td colspan="2" rowspan="1">(1,2) [1,2]</td></tr><tr><td colspan="4" rowspan="2">(2,0) [2,4]</td><td colspan="2" rowspan="2">(2,1) [2,2]</td><td colspan="1" rowspan="1">(2,2) [1,1]</td></tr><tr><td colspan="1" rowspan="1">(3,0) [1,1]</td><td colspan="1" rowspan="1">(3,1) [1,1]</td></tr><tr><td colspan="2" rowspan="3">(4,0) [3,2]</td><td colspan="1" rowspan="1">(4,1) [1,1]</td><td colspan="1" rowspan="3">(4,2) [3,1]</td><td colspan="4" rowspan="4">(4,3) [4,4]</td></tr><tr><td colspan="1" rowspan="1">(5,0) [1,1]</td></tr><tr><td colspan="1" rowspan="1">(6,0) [1,1]</td></tr><tr><td colspan="4" rowspan="1">(7,0) [1,4]</td></tr></table></body></html>
|
@@ -1,3 +1,11 @@
|
||||
-------------------------------------------------------------------
|
||||
Fri Feb 25 17:51:03 UTC 2011 - chris@computersalat.de
|
||||
|
||||
- recreated by cpanspec 1.78.03
|
||||
o fix deps
|
||||
- add HTML patch
|
||||
- noarch pkg
|
||||
|
||||
-------------------------------------------------------------------
|
||||
Wed Dec 1 13:27:30 UTC 2010 - coolo@novell.com
|
||||
|
||||
|
@@ -15,58 +15,142 @@
|
||||
# Please submit bugfixes or comments via http://bugs.opensuse.org/
|
||||
#
|
||||
|
||||
# norootforbuild
|
||||
|
||||
|
||||
Name: perl-HTML-TableExtract
|
||||
Url: http://cpan.org/modules/by-module/HTML/
|
||||
License: Public Domain, Freeware
|
||||
Group: Development/Libraries/Perl
|
||||
AutoReqProv: on
|
||||
Requires: perl-HTML-Parser
|
||||
BuildRequires: perl-HTML-Parser
|
||||
BuildRequires: perl-macros
|
||||
# Needed only for tests:
|
||||
BuildRequires: perl-HTML-Tree perl-Test-Pod-Coverage
|
||||
Summary: Simplifies extraction of information within tables in HTML documents
|
||||
Version: 2.10
|
||||
Release: 81
|
||||
Source: HTML-TableExtract-%{version}.tar.bz2
|
||||
Release: 82
|
||||
License: GPL+ or Artistic
|
||||
%define cpan_name HTML-TableExtract
|
||||
Summary: For extracting the content contained in tables within an HTML document
|
||||
Url: http://search.cpan.org/dist/HTML-TableExtract/
|
||||
Group: Development/Libraries/Perl
|
||||
#Source: http://www.cpan.org/authors/id/M/MS/MSISK/HTML-TableExtract-2.10.tar.gz
|
||||
Source: %{cpan_name}-%{version}.tar.bz2
|
||||
Patch0: %{cpan_name}-2.10-HTML.patch
|
||||
BuildArch: noarch
|
||||
BuildRoot: %{_tmppath}/%{name}-%{version}-build
|
||||
BuildRequires: perl
|
||||
BuildRequires: perl-macros
|
||||
BuildRequires: perl(HTML::ElementTable) >= 1.16
|
||||
BuildRequires: perl(HTML::Parser)
|
||||
Requires: perl(HTML::ElementTable) >= 1.16
|
||||
Requires: perl(HTML::Parser)
|
||||
%{perl_requires}
|
||||
|
||||
%description
|
||||
HTML::TableExtract is a module that simplifies the extraction of
|
||||
information contained in tables within HTML documents.
|
||||
HTML::TableExtract is a subclass of HTML::Parser that serves to extract the
|
||||
information from tables of interest contained within an HTML document. The
|
||||
information from each extracted table is stored in table objects. Tables
|
||||
can be extracted as text, HTML, or HTML::ElementTable structures (for
|
||||
in-place editing or manipulation).
|
||||
|
||||
Tables of note may be specified using Headers, Depth, Count,
|
||||
Attributes, or some combination of these. See the module
|
||||
documentation for details.
|
||||
There are currently four constraints available to specify which tables you
|
||||
would like to extract from a document: _Headers_, _Depth_, _Count_, and
|
||||
_Attributes_.
|
||||
|
||||
_Headers_, the most flexible and adaptive of the techniques, involves
|
||||
specifying text in an array that you expect to appear above the data in the
|
||||
tables of interest. Once all headers have been located in a row of that
|
||||
table, all further cells beneath the columns that matched your headers are
|
||||
extracted. All other columns are ignored: think of it as vertical slices
|
||||
through a table. In addition, TableExtract automatically rearranges each
|
||||
row in the same order as the headers you provided. If you would like to
|
||||
disable this, set _automap_ to 0 during object creation, and instead rely
|
||||
on the column_map() method to find out the order in which the headers were
|
||||
found. Furthermore, TableExtract will automatically compensate for cell
|
||||
span issues so that columns are really the same columns as you would
|
||||
visually see in a browser. This behavior can be disabled by setting the
|
||||
_gridmap_ parameter to 0. HTML is stripped from the entire textual content
|
||||
of a cell before header matches are attempted -- unless the _keep_html_
|
||||
parameter was enabled.
|
||||
|
||||
_Depth_ and _Count_ are more specific ways to specify tables in relation to
|
||||
one another. _Depth_ represents how deeply a table resides in other tables.
|
||||
The depth of a top-level table in the document is 0. A table within a
|
||||
top-level table has a depth of 1, and so on. Each depth can be thought of
|
||||
as a layer; tables sharing the same depth are on the same layer. Within
|
||||
each of these layers, _Count_ represents the order in which a table was
|
||||
seen at that depth, starting with 0. Providing both a _depth_ and a _count_
|
||||
will uniquely specify a table within a document.
|
||||
|
||||
_Attributes_ match based on the attributes of the html <table> tag, for
|
||||
example, border widths or background color.
|
||||
|
||||
Each of the _Headers_, _Depth_, _Count_, and _Attributes_ specifications
|
||||
are cumulative in their effect on the overall extraction. For instance, if
|
||||
you specify only a _Depth_, then you get all tables at that depth (note
|
||||
that these could very well reside in separate higher-level tables
|
||||
throughout the document since depth extends across tables). If you specify
|
||||
only a _Count_, then the tables at that _Count_ from all depths are
|
||||
returned (i.e., the _n_th occurrence of a table at each depth). If you only
|
||||
specify _Headers_, then you get all tables in the document containing those
|
||||
column headers. If you have specified multiple constraints of _Headers_,
|
||||
_Depth_, _Count_, and _Attributes_, then each constraint has veto power
|
||||
over whether a particular table is extracted.
|
||||
|
||||
If no _Headers_, _Depth_, _Count_, or _Attributes_ are specified, then all
|
||||
tables match.
|
||||
|
||||
When extracting only text from tables, the text is decoded with
|
||||
HTML::Entities by default; this can be disabled by setting the _decode_
|
||||
parameter to 0.
|
||||
|
||||
Extraction Modes
|
||||
The default mode of extraction for HTML::TableExtract is raw text or
|
||||
HTML. In this mode, embedded tables are completely decoupled from one
|
||||
another. In this case, HTML::TableExtract is a subclass of
|
||||
HTML::Parser:
|
||||
|
||||
use HTML::TableExtract;
|
||||
|
||||
Alternatively, tables can be extracted as HTML::ElementTable
|
||||
structures, which are in turn embedded in an HTML::Element tree
|
||||
representing the entire HTML document. Embedded tables are not
|
||||
decoupled from one another since this tree structure must be
|
||||
maintained. In this case, HTML::TableExtract is a subclass of
|
||||
HTML::TreeBuilder (itself a subclass of HTML::Parser):
|
||||
|
||||
use HTML::TableExtract qw(tree);
|
||||
|
||||
In either case, the basic interface for HTML::TableExtract and the
|
||||
resulting table objects remains the same -- all that changes is what
|
||||
you can do with the resulting data.
|
||||
|
||||
HTML::TableExtract is a subclass of HTML::Parser, and as such inherits
|
||||
all of its basic methods such as 'parse()' and 'parse_file()'. During
|
||||
scans, 'start()', 'end()', and 'text()' are utilized. Feel free to
|
||||
override them, but if you do not eventually invoke them in the SUPER
|
||||
class with some content, results are not guaranteed.
|
||||
|
||||
Advice
|
||||
The main point of this module was to provide a flexible method of
|
||||
extracting tabular information from HTML documents without relying too
|
||||
heavily on the document layout. For that reason, I suggest using
|
||||
_Headers_ whenever possible -- that way, you are anchoring your
|
||||
extraction on what the document is trying to communicate rather than
|
||||
some feature of the HTML comprising the document (other than the fact
|
||||
that the data is contained in a table).
|
||||
|
||||
%prep
|
||||
%setup -q -n HTML-TableExtract-%{version}
|
||||
%setup -q -n %{cpan_name}-%{version}
|
||||
%patch0 -p1
|
||||
|
||||
%build
|
||||
perl Makefile.PL
|
||||
make %{?_smp_mflags}
|
||||
%{__perl} Makefile.PL INSTALLDIRS=vendor
|
||||
%{__make} %{?_smp_mflags}
|
||||
|
||||
%check
|
||||
make test
|
||||
%{__make} test
|
||||
|
||||
%install
|
||||
make DESTDIR=$RPM_BUILD_ROOT install_vendor
|
||||
%perl_make_install
|
||||
%perl_process_packlist
|
||||
%perl_gen_filelist
|
||||
|
||||
%clean
|
||||
[ "$RPM_BUILD_ROOT" != "/" ] && [ -d $RPM_BUILD_ROOT ] && rm -rf $RPM_BUILD_ROOT
|
||||
%{__rm} -rf %{buildroot}
|
||||
|
||||
%files
|
||||
%defattr(-,root,root)
|
||||
%doc Changes MANIFEST README
|
||||
%doc %{_mandir}/man?/*
|
||||
%{perl_vendorlib}/HTML
|
||||
%{perl_vendorarch}/auto/HTML-TableExtract
|
||||
%files -f %{name}.files
|
||||
%defattr(644,root,root,755)
|
||||
%doc Changes README
|
||||
|
||||
%changelog
|
||||
|
Reference in New Issue
Block a user