From 08f522976f3a30f13094c3dc440187c35f5ad4b2996c992dc55cf0af6b3bead0 Mon Sep 17 00:00:00 2001 From: Christian Wittmer Date: Fri, 25 Feb 2011 17:52:50 +0000 Subject: [PATCH] cpanspec, noarch pkg OBS-URL: https://build.opensuse.org/package/show/devel:languages:perl/perl-HTML-TableExtract?expand=0&rev=20 --- HTML-TableExtract-2.10-HTML.patch | 6 ++ perl-HTML-TableExtract.changes | 8 ++ perl-HTML-TableExtract.spec | 148 +++++++++++++++++++++++------- 3 files changed, 130 insertions(+), 32 deletions(-) create mode 100644 HTML-TableExtract-2.10-HTML.patch diff --git a/HTML-TableExtract-2.10-HTML.patch b/HTML-TableExtract-2.10-HTML.patch new file mode 100644 index 0000000..e29f486 --- /dev/null +++ b/HTML-TableExtract-2.10-HTML.patch @@ -0,0 +1,6 @@ +diff -ruN HTML-TableExtract-2.10-orig/t/gnarly.html HTML-TableExtract-2.10/t/gnarly.html +--- HTML-TableExtract-2.10-orig/t/gnarly.html 2006-05-01 23:22:47.000000000 +0200 ++++ HTML-TableExtract-2.10/t/gnarly.html 2011-02-25 18:41:08.000000000 +0100 +@@ -1 +1 @@ +-gnarly table
(0,0) [1,4](0,1) [2,4]
(1,0) [2,1](1,1) [1,1](1,2) [1,2]
(2,0) [2,4](2,1) [2,2](2,2) [1,1]
(3,0) [1,1](3,1) [1,1]
(4,0) [3,2](4,1) [1,1](4,2) [3,1](4,3) [4,4]
(5,0) [1,1]
(6,0) [1,1]
(7,0) [1,4]
++gnarly table
(0,0) [1,4](0,1) [2,4]
(1,0) [2,1](1,1) [1,1](1,2) [1,2]
(2,0) [2,4](2,1) [2,2](2,2) [1,1]
(3,0) [1,1](3,1) [1,1]
(4,0) [3,2](4,1) [1,1](4,2) [3,1](4,3) [4,4]
(5,0) [1,1]
(6,0) [1,1]
(7,0) [1,4]
diff --git a/perl-HTML-TableExtract.changes b/perl-HTML-TableExtract.changes index f46248f..69bcc45 100644 --- a/perl-HTML-TableExtract.changes +++ b/perl-HTML-TableExtract.changes @@ -1,3 +1,11 @@ +------------------------------------------------------------------- +Fri Feb 25 17:51:03 UTC 2011 - chris@computersalat.de + +- recreated by cpanspec 1.78.03 + o fix deps +- add HTML patch +- noarch pkg + ------------------------------------------------------------------- Wed Dec 1 13:27:30 UTC 2010 - coolo@novell.com diff --git a/perl-HTML-TableExtract.spec b/perl-HTML-TableExtract.spec index 44160f6..5da40a1 100644 --- a/perl-HTML-TableExtract.spec +++ b/perl-HTML-TableExtract.spec @@ -15,58 +15,142 @@ # Please submit bugfixes or comments via http://bugs.opensuse.org/ # -# norootforbuild - - Name: perl-HTML-TableExtract -Url: http://cpan.org/modules/by-module/HTML/ -License: Public Domain, Freeware -Group: Development/Libraries/Perl -AutoReqProv: on -Requires: perl-HTML-Parser -BuildRequires: perl-HTML-Parser -BuildRequires: perl-macros -# Needed only for tests: -BuildRequires: perl-HTML-Tree perl-Test-Pod-Coverage -Summary: Simplifies extraction of information within tables in HTML documents Version: 2.10 -Release: 81 -Source: HTML-TableExtract-%{version}.tar.bz2 +Release: 82 +License: GPL+ or Artistic +%define cpan_name HTML-TableExtract +Summary: For extracting the content contained in tables within an HTML document +Url: http://search.cpan.org/dist/HTML-TableExtract/ +Group: Development/Libraries/Perl +#Source: http://www.cpan.org/authors/id/M/MS/MSISK/HTML-TableExtract-2.10.tar.gz +Source: %{cpan_name}-%{version}.tar.bz2 +Patch0: %{cpan_name}-2.10-HTML.patch +BuildArch: noarch BuildRoot: %{_tmppath}/%{name}-%{version}-build +BuildRequires: perl +BuildRequires: perl-macros +BuildRequires: perl(HTML::ElementTable) >= 1.16 +BuildRequires: perl(HTML::Parser) +Requires: perl(HTML::ElementTable) >= 1.16 +Requires: perl(HTML::Parser) %{perl_requires} %description 
-HTML::TableExtract is a module that simplifies the extraction of -information contained in tables within HTML documents. +HTML::TableExtract is a subclass of HTML::Parser that serves to extract the +information from tables of interest contained within an HTML document. The +information from each extracted table is stored in table objects. Tables +can be extracted as text, HTML, or HTML::ElementTable structures (for +in-place editing or manipulation). -Tables of note may be specified using Headers, Depth, Count, -Attributes, or some combination of the three. See the module -documentation for details. +There are currently four constraints available to specify which tables you +would like to extract from a document: _Headers_, _Depth_, _Count_, and +_Attributes_. +_Headers_, the most flexible and adaptive of the techniques, involves +specifying text in an array that you expect to appear above the data in the +tables of interest. Once all headers have been located in a row of that +table, all further cells beneath the columns that matched your headers are +extracted. All other columns are ignored: think of it as vertical slices +through a table. In addition, TableExtract automatically rearranges each +row in the same order as the headers you provided. If you would like to +disable this, set _automap_ to 0 during object creation, and instead rely +on the column_map() method to find out the order in which the headers were +found. Furthermore, TableExtract will automatically compensate for cell +span issues so that columns are really the same columns as you would +visually see in a browser. This behavior can be disabled by setting the +_gridmap_ parameter to 0. HTML is stripped from the entire textual content +of a cell before header matches are attempted -- unless the _keep_html_ +parameter was enabled. +_Depth_ and _Count_ are more specific ways to specify tables in relation to +one another. _Depth_ represents how deeply a table resides in other tables. 
+The depth of a top-level table in the document is 0. A table within a +top-level table has a depth of 1, and so on. Each depth can be thought of +as a layer; tables sharing the same depth are on the same layer. Within +each of these layers, _Count_ represents the order in which a table was +seen at that depth, starting with 0. Providing both a _depth_ and a _count_ +will uniquely specify a table within a document. + +_Attributes_ match based on the attributes of the html tag, for +example, border widths or background color. + +Each of the _Headers_, _Depth_, _Count_, and _Attributes_ specifications +are cumulative in their effect on the overall extraction. For instance, if +you specify only a _Depth_, then you get all tables at that depth (note +that these could very well reside in separate higher-level tables +throughout the document since depth extends across tables). If you specify +only a _Count_, then the tables at that _Count_ from all depths are +returned (i.e., the _n_th occurrence of a table at each depth). If you only +specify _Headers_, then you get all tables in the document containing those +column headers. If you have specified multiple constraints of _Headers_, +_Depth_, _Count_, and _Attributes_, then each constraint has veto power +over whether a particular table is extracted. + +If no _Headers_, _Depth_, _Count_, or _Attributes_ are specified, then all +tables match. + +When extracting only text from tables, the text is decoded with +HTML::Entities by default; this can be disabled by setting the _decode_ +parameter to 0. + +Extraction Modes + The default mode of extraction for HTML::TableExtract is raw text or + HTML. In this mode, embedded tables are completely decoupled from one + another.
In this case, HTML::TableExtract is a subclass of + HTML::Parser: + + use HTML::TableExtract; + + Alternatively, tables can be extracted as HTML::ElementTable + structures, which are in turn embedded in an HTML::Element tree + representing the entire HTML document. Embedded tables are not + decoupled from one another since this tree structure must be + maintained. In this case, HTML::TableExtract is a subclass of + HTML::TreeBuilder (itself a subclass of HTML::Parser): + + use HTML::TableExtract qw(tree); + + In either case, the basic interface for HTML::TableExtract and the + resulting table objects remains the same -- all that changes is what + you can do with the resulting data. + + HTML::TableExtract is a subclass of HTML::Parser, and as such inherits + all of its basic methods such as 'parse()' and 'parse_file()'. During + scans, 'start()', 'end()', and 'text()' are utilized. Feel free to + override them, but if you do not eventually invoke them in the SUPER + class with some content, results are not guaranteed. + +Advice + The main point of this module was to provide a flexible method of + extracting tabular information from HTML documents without relying too + heavily on the document layout. For that reason, I suggest using + _Headers_ whenever possible -- that way, you are anchoring your + extraction on what the document is trying to communicate rather than + some feature of the HTML comprising the document (other than the fact + that the data is contained in a table).
%prep -%setup -q -n HTML-TableExtract-%{version} +%setup -q -n %{cpan_name}-%{version} +%patch0 -p1 %build -perl Makefile.PL -make %{?_smp_mflags} +%{__perl} Makefile.PL INSTALLDIRS=vendor +%{__make} %{?_smp_mflags} %check -make test +%{__make} test %install -make DESTDIR=$RPM_BUILD_ROOT install_vendor +%perl_make_install %perl_process_packlist +%perl_gen_filelist %clean -[ "$RPM_BUILD_ROOT" != "/" ] && [ -d $RPM_BUILD_ROOT ] && rm -rf $RPM_BUILD_ROOT +%{__rm} -rf %{buildroot} -%files -%defattr(-,root,root) -%doc Changes MANIFEST README -%doc %{_mandir}/man?/* -%{perl_vendorlib}/HTML -%{perl_vendorarch}/auto/HTML-TableExtract +%files -f %{name}.files +%defattr(644,root,root,755) +%doc Changes README %changelog