forked from pool/perl-HTML-TableExtract
		
	cpanspec, noarch pkg
OBS-URL: https://build.opensuse.org/package/show/devel:languages:perl/perl-HTML-TableExtract?expand=0&rev=20
This commit is contained in:
		
				
					committed by
					
						 Git OBS Bridge
						Git OBS Bridge
					
				
			
			
				
	
			
			
			
						parent
						
							f8356d3ca3
						
					
				
				
					commit
					08f522976f
				
			
							
								
								
									
										6
									
								
								HTML-TableExtract-2.10-HTML.patch
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										6
									
								
								HTML-TableExtract-2.10-HTML.patch
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,6 @@ | ||||
| diff -ruN HTML-TableExtract-2.10-orig/t/gnarly.html HTML-TableExtract-2.10/t/gnarly.html | ||||
| --- HTML-TableExtract-2.10-orig/t/gnarly.html	2006-05-01 23:22:47.000000000 +0200 | ||||
| +++ HTML-TableExtract-2.10/t/gnarly.html	2011-02-25 18:41:08.000000000 +0100 | ||||
| @@ -1 +1 @@ | ||||
| -<html><head><title>gnarly table</title></head><body><table border=1><tr><td colspan=4 rowspan=1>(0,0) [1,4]</td><td colspan=4 rowspan=2>(0,1) [2,4]</td></tr><tr><td colspan=1 rowspan=2>(1,0) [2,1]</td><td colspan=1 rowspan=1>(1,1) [1,1]</td><td colspan=2 rowspan=1>(1,2) [1,2]</td></tr><tr><td colspan=4 rowspan=2>(2,0) [2,4]</td><td colspan=2 rowspan=2>(2,1) [2,2]</td><td colspan=1 rowspan=1>(2,2) [1,1]</td></tr><tr><td colspan=1 rowspan=1>(3,0) [1,1]</td><td colspan=1 rowspan=1>(3,1) [1,1]</td></tr><tr><td colspan=2 rowspan=3>(4,0) [3,2]</td><td colspan=1 rowspan=1>(4,1) [1,1]</td><td colspan=1 rowspan=3>(4,2) [3,1]</td><td colspan=4 rowspan=4>(4,3) [4,4]</td></tr><tr><td colspan=1 rowspan=1>(5,0) [1,1]</td></tr><tr><td colspan=1 rowspan=1>(6,0) [1,1]</td></tr><tr><td colspan=4 rowspan=1>(7,0) [1,4]</td></tr></table></body></html> | ||||
| +<html><head><title>gnarly table</title></head><body><table border="1"><tr><td colspan="4" rowspan="1">(0,0) [1,4]</td><td colspan="4" rowspan="2">(0,1) [2,4]</td></tr><tr><td colspan="1" rowspan="2">(1,0) [2,1]</td><td colspan="1" rowspan="1">(1,1) [1,1]</td><td colspan="2" rowspan="1">(1,2) [1,2]</td></tr><tr><td colspan="4" rowspan="2">(2,0) [2,4]</td><td colspan="2" rowspan="2">(2,1) [2,2]</td><td colspan="1" rowspan="1">(2,2) [1,1]</td></tr><tr><td colspan="1" rowspan="1">(3,0) [1,1]</td><td colspan="1" rowspan="1">(3,1) [1,1]</td></tr><tr><td colspan="2" rowspan="3">(4,0) [3,2]</td><td colspan="1" rowspan="1">(4,1) [1,1]</td><td colspan="1" rowspan="3">(4,2) [3,1]</td><td colspan="4" rowspan="4">(4,3) [4,4]</td></tr><tr><td colspan="1" rowspan="1">(5,0) [1,1]</td></tr><tr><td colspan="1" rowspan="1">(6,0) [1,1]</td></tr><tr><td colspan="4" rowspan="1">(7,0) [1,4]</td></tr></table></body></html> | ||||
| @@ -1,3 +1,11 @@ | ||||
| ------------------------------------------------------------------- | ||||
| Fri Feb 25 17:51:03 UTC 2011 - chris@computersalat.de | ||||
|  | ||||
| - recreated by cpanspec 1.78.03 | ||||
|   o fix deps | ||||
| - add HTML patch | ||||
| - noarch pkg | ||||
|  | ||||
| ------------------------------------------------------------------- | ||||
| Wed Dec  1 13:27:30 UTC 2010 - coolo@novell.com | ||||
|  | ||||
|   | ||||
| @@ -15,58 +15,142 @@ | ||||
| # Please submit bugfixes or comments via http://bugs.opensuse.org/ | ||||
| # | ||||
|  | ||||
| # norootforbuild | ||||
|  | ||||
|  | ||||
| Name:           perl-HTML-TableExtract | ||||
| Url:            http://cpan.org/modules/by-module/HTML/ | ||||
| License:        Public Domain, Freeware | ||||
| Group:          Development/Libraries/Perl | ||||
| AutoReqProv:    on | ||||
| Requires:       perl-HTML-Parser | ||||
| BuildRequires:  perl-HTML-Parser | ||||
| BuildRequires:  perl-macros | ||||
| # Needed only for tests: | ||||
| BuildRequires:  perl-HTML-Tree perl-Test-Pod-Coverage | ||||
| Summary:        Simplifies extraction of information within tables in HTML documents | ||||
| Version:        2.10 | ||||
| Release:        81 | ||||
| Source:         HTML-TableExtract-%{version}.tar.bz2 | ||||
| Release:        82 | ||||
| License:        GPL+ or Artistic | ||||
| %define cpan_name HTML-TableExtract | ||||
| Summary:        For extracting the content contained in tables within an HTML document | ||||
| Url:            http://search.cpan.org/dist/HTML-TableExtract/ | ||||
| Group:          Development/Libraries/Perl | ||||
| #Source:         http://www.cpan.org/authors/id/M/MS/MSISK/HTML-TableExtract-2.10.tar.gz | ||||
| Source:         %{cpan_name}-%{version}.tar.bz2 | ||||
| Patch0:         %{cpan_name}-2.10-HTML.patch | ||||
| BuildArch:      noarch | ||||
| BuildRoot:      %{_tmppath}/%{name}-%{version}-build | ||||
| BuildRequires:  perl | ||||
| BuildRequires:  perl-macros | ||||
| BuildRequires:  perl(HTML::ElementTable) >= 1.16 | ||||
| BuildRequires:  perl(HTML::Parser) | ||||
| Requires:       perl(HTML::ElementTable) >= 1.16 | ||||
| Requires:       perl(HTML::Parser) | ||||
| %{perl_requires} | ||||
|  | ||||
| %description | ||||
| HTML::TableExtract is a module that simplifies the extraction of | ||||
| information contained in tables within HTML documents. | ||||
| HTML::TableExtract is a subclass of HTML::Parser that serves to extract the | ||||
| information from tables of interest contained within an HTML document. The | ||||
| information from each extracted table is stored in table objects. Tables | ||||
| can be extracted as text, HTML, or HTML::ElementTable structures (for | ||||
| in-place editing or manipulation). | ||||
|  | ||||
| Tables of note may be specified using Headers, Depth, Count, | ||||
| Attributes, or some combination of the three. See the module | ||||
| documentation for details. | ||||
| There are currently four constraints available to specify which tables you | ||||
| would like to extract from a document: _Headers_, _Depth_, _Count_, and | ||||
| _Attributes_. | ||||
|  | ||||
| _Headers_, the most flexible and adaptive of the techniques, involves | ||||
| specifying text in an array that you expect to appear above the data in the | ||||
| tables of interest. Once all headers have been located in a row of that | ||||
| table, all further cells beneath the columns that matched your headers are | ||||
| extracted. All other columns are ignored: think of it as vertical slices | ||||
| through a table. In addition, TableExtract automatically rearranges each | ||||
| row in the same order as the headers you provided. If you would like to | ||||
| disable this, set _automap_ to 0 during object creation, and instead rely | ||||
| on the column_map() method to find out the order in which the headers were | ||||
| found. Furthermore, TableExtract will automatically compensate for cell | ||||
| span issues so that columns are really the same columns as you would | ||||
| visually see in a browser. This behavior can be disabled by setting the | ||||
| _gridmap_ parameter to 0. HTML is stripped from the entire textual content | ||||
| of a cell before header matches are attempted -- unless the _keep_html_ | ||||
| parameter was enabled. | ||||
|  | ||||
| _Depth_ and _Count_ are more specific ways to specify tables in relation to | ||||
| one another. _Depth_ represents how deeply a table resides in other tables. | ||||
| The depth of a top-level table in the document is 0. A table within a | ||||
| top-level table has a depth of 1, and so on. Each depth can be thought of | ||||
| as a layer; tables sharing the same depth are on the same layer. Within | ||||
| each of these layers, _Count_ represents the order in which a table was | ||||
| seen at that depth, starting with 0. Providing both a _depth_ and a _count_ | ||||
| will uniquely specify a table within a document. | ||||
|  | ||||
| _Attributes_ match based on the attributes of the html <table> tag, for | ||||
| example, border widths or background color. | ||||
|  | ||||
| Each of the _Headers_, _Depth_, _Count_, and _Attributes_ specifications | ||||
| are cumulative in their effect on the overall extraction. For instance, if | ||||
| you specify only a _Depth_, then you get all tables at that depth (note | ||||
| that these could very well reside in separate higher-level tables | ||||
| throughout the document since depth extends across tables). If you specify | ||||
| only a _Count_, then the tables at that _Count_ from all depths are | ||||
| returned (i.e., the _n_th occurrence of a table at each depth). If you only | ||||
| specify _Headers_, then you get all tables in the document containing those | ||||
| column headers. If you have specified multiple constraints of _Headers_, | ||||
| _Depth_, _Count_, and _Attributes_, then each constraint has veto power | ||||
| over whether a particular table is extracted. | ||||
|  | ||||
| If no _Headers_, _Depth_, _Count_, or _Attributes_ are specified, then all | ||||
| tables match. | ||||
|  | ||||
| When extracting only text from tables, the text is decoded with | ||||
| HTML::Entities by default; this can be disabled by setting the _decode_ | ||||
| parameter to 0. | ||||
|  | ||||
| Extraction Modes | ||||
|     The default mode of extraction for HTML::TableExtract is raw text or | ||||
|     HTML. In this mode, embedded tables are completely decoupled from one | ||||
|     another. In this case, HTML::TableExtract is a subclass of | ||||
|     HTML::Parser: | ||||
|  | ||||
|       use HTML::TableExtract; | ||||
|  | ||||
|     Alternatively, tables can be extracted as HTML::ElementTable | ||||
|     structures, which are in turn embedded in an HTML::Element tree | ||||
|     representing the entire HTML document. Embedded tables are not | ||||
|     decoupled from one another since this tree structure must be | ||||
|     maintained. In this case, HTML::TableExtract is a subclass of | ||||
|     HTML::TreeBuilder (itself a subclass of HTML::Parser): | ||||
|  | ||||
|       use HTML::TableExtract qw(tree); | ||||
|  | ||||
|     In either case, the basic interface for HTML::TableExtract and the | ||||
|     resulting table objects remains the same -- all that changes is what | ||||
|     you can do with the resulting data. | ||||
|  | ||||
|     HTML::TableExtract is a subclass of HTML::Parser, and as such inherits | ||||
|     all of its basic methods such as 'parse()' and 'parse_file()'. During | ||||
|     scans, 'start()', 'end()', and 'text()' are utilized. Feel free to | ||||
|     override them, but if you do not eventually invoke them in the SUPER | ||||
|     class with some content, results are not guaranteed. | ||||
|  | ||||
| Advice | ||||
|     The main point of this module was to provide a flexible method of | ||||
|     extracting tabular information from HTML documents without relying too | ||||
|     heavily on the document layout. For that reason, I suggest using | ||||
|     _Headers_ whenever possible -- that way, you are anchoring your | ||||
|     extraction on what the document is trying to communicate rather than | ||||
|     some feature of the HTML comprising the document (other than the fact | ||||
|     that the data is contained in a table). | ||||
|  | ||||
| %prep | ||||
| %setup -q -n HTML-TableExtract-%{version} | ||||
| %setup -q -n %{cpan_name}-%{version} | ||||
| %patch0 -p1 | ||||
|  | ||||
| %build | ||||
| perl Makefile.PL | ||||
| make %{?_smp_mflags} | ||||
| %{__perl} Makefile.PL INSTALLDIRS=vendor | ||||
| %{__make} %{?_smp_mflags} | ||||
|  | ||||
| %check | ||||
| make test | ||||
| %{__make} test | ||||
|  | ||||
| %install | ||||
| make DESTDIR=$RPM_BUILD_ROOT install_vendor | ||||
| %perl_make_install | ||||
| %perl_process_packlist | ||||
| %perl_gen_filelist | ||||
|  | ||||
| %clean | ||||
| [ "$RPM_BUILD_ROOT" != "/" ] && [ -d $RPM_BUILD_ROOT ] && rm -rf $RPM_BUILD_ROOT | ||||
| %{__rm} -rf %{buildroot} | ||||
|  | ||||
| %files | ||||
| %defattr(-,root,root) | ||||
| %doc Changes MANIFEST README | ||||
| %doc %{_mandir}/man?/* | ||||
| %{perl_vendorlib}/HTML | ||||
| %{perl_vendorarch}/auto/HTML-TableExtract | ||||
| %files -f %{name}.files | ||||
| %defattr(644,root,root,755) | ||||
| %doc Changes README | ||||
|  | ||||
| %changelog | ||||
|   | ||||
		Reference in New Issue
	
	Block a user