Update the python-nltk package so the upstream test suite runs offline.

Files touched by this change:
  * nltk_data.tar.xz          - Git LFS pointer for the NLTK data tarball,
                                used by the spec as Source1.
  * port-2to3.patch           - ports scripts under nltk_data/ and tools/ to
                                Python 3 (print function, io.StringIO,
                                python3 shebang).
  * skip-networked-test.patch - registers a "network" pytest marker and
                                applies it to the two downloader tests.
  * python-nltk.changes       - changelog entry for this update.
  * python-nltk.rpmlintrc     - filters the zero-length nltk/tbl/api.py error.
  * python-nltk.spec          - adds the new sources, patches and test
                                dependencies, normalises line endings and
                                shebangs in the prep section, and runs pytest
                                with -k 'not network' in the check section.

NOTE(review): the post-image blob ids on the "index" lines of
python-nltk.rpmlintrc and python-nltk.spec predate the edits made to those
two diffs here; regenerate them if anything relies on these ids.

diff --git a/nltk_data.tar.xz b/nltk_data.tar.xz
new file mode 100644
index 0000000..b6fd225
--- /dev/null
+++ b/nltk_data.tar.xz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f41383a4774bf7227f4563f46543460ba07a6921f7bcc6185519e87ea9e4323f
+size 453871052
diff --git a/port-2to3.patch b/port-2to3.patch
new file mode 100644
index 0000000..7a5697d
--- /dev/null
+++ b/port-2to3.patch
@@ -0,0 +1,48 @@
+---
+ nltk_data/corpora/pl196x/splitter.py                 |    4 ++--
+ nltk_data/taggers/universal_tagset/universal_tags.py |    5 -----
+ tools/find_deprecated.py                             |    2 +-
+ 3 files changed, 3 insertions(+), 8 deletions(-)
+
+--- a/nltk_data/corpora/pl196x/splitter.py
++++ b/nltk_data/corpora/pl196x/splitter.py
+@@ -1,4 +1,4 @@
+-#!/usr/bin/python
++#!/usr/bin/python3
+ 
+ import sys, re
+ 
+@@ -7,7 +7,7 @@ TEXTID = re.compile(r'')
+ 
+ if __name__ == '__main__':
+     if len(sys.argv) != 2:
+-        print 'One argument required: a pl196x corpus to split.'
++        print('One argument required: a pl196x corpus to split.')
+         sys.exit()
+ 
+     inputFileName = sys.argv[1]
+--- a/nltk_data/taggers/universal_tagset/universal_tags.py
++++ b/nltk_data/taggers/universal_tagset/universal_tags.py
+@@ -22,11 +22,6 @@ X - other: foreign words, typos, abbrevi
+ @author: Nathan Schneider (nschneid)
+ @since: 2011-05-06
+ '''
+-
+-# Strive towards Python 3 compatibility
+-from __future__ import print_function, unicode_literals, division
+-from future_builtins import map, filter
+-
+ import re, glob
+ from collections import defaultdict
+ 
+--- a/tools/find_deprecated.py
++++ b/tools/find_deprecated.py
+@@ -29,7 +29,7 @@ import textwrap
+ import tokenize
+ from doctest import DocTestParser, register_optionflag
+ 
+-from cStringIO import StringIO
++from io import StringIO
+ 
+ import nltk.corpus
+ from nltk import defaultdict
diff --git a/python-nltk.changes b/python-nltk.changes
index 974487d..697fd51 100644
--- a/python-nltk.changes
+++ b/python-nltk.changes
@@ -1,3 +1,12 @@
+-------------------------------------------------------------------
+Mon Dec 26 10:41:22 UTC 2022 - Matej Cepl
+
+- Complete nltk_data.tar.xz for offline testing
+- Fix failing tests (gh#nltk/nltk#2969) by adding patches:
+  - port-2to3.patch
+  - skip-networked-test.patch
+- Clean up the SPEC to get rid of rpmlint warnings.
+
 -------------------------------------------------------------------
 Tue Mar 22 07:48:14 UTC 2022 - Matej Cepl
 
diff --git a/python-nltk.rpmlintrc b/python-nltk.rpmlintrc
new file mode 100644
index 0000000..9816082
--- /dev/null
+++ b/python-nltk.rpmlintrc
@@ -0,0 +1 @@
+addFilter("E: zero-length /usr/lib/python3\.\d+/site-packages/nltk/tbl/api\.py")
diff --git a/python-nltk.spec b/python-nltk.spec
index a90c1ac..6189c2c 100644
--- a/python-nltk.spec
+++ b/python-nltk.spec
@@ -25,7 +25,18 @@ Release:        0
 Summary:        Natural Language Toolkit
 License:        Apache-2.0
 URL:            http://nltk.org/
-Source:         https://files.pythonhosted.org/packages/source/n/nltk/%{pyname}-%{version}.zip
+Source0:        https://files.pythonhosted.org/packages/source/n/nltk/%{pyname}-%{version}.zip
+# NLTK data downloaded via python3 -m nltk.downloader,
+# with the downloaded zip archives then unpacked.
+# See https://www.nltk.org/data.html for more details.
+Source1:        nltk_data.tar.xz
+Source99:       python-nltk.rpmlintrc
+# PATCH-FIX-UPSTREAM skip-networked-test.patch gh#nltk/nltk#2969 mcepl@suse.com
+# skip tests requiring network connection
+Patch0:         skip-networked-test.patch
+# PATCH-FIX-UPSTREAM port-2to3.patch gh#nltk/nltk#2969 mcepl@suse.com
+# port scripts in nltk_data to Python 3
+Patch1:         port-2to3.patch
 BuildRequires:  %{python_module regex}
 BuildRequires:  %{python_module setuptools}
 BuildRequires:  %{python_module six}
@@ -33,6 +44,26 @@ BuildRequires:  %{pythons}
 BuildRequires:  fdupes
 BuildRequires:  python-rpm-macros
 BuildRequires:  unzip
+# For testing
+BuildRequires:  %{python_module tk}
+BuildRequires:  %{python_module click}
+BuildRequires:  %{python_module pytest}
+# BuildRequires:  %%{python_module gensim}
+BuildRequires:  %{python_module joblib}
+BuildRequires:  %{python_module Jinja2}
+BuildRequires:  %{python_module matplotlib}
+BuildRequires:  %{python_module numpy}
+BuildRequires:  %{python_module pyparsing}
+BuildRequires:  %{python_module pytest-cov}
+BuildRequires:  %{python_module pytest-mock}
+BuildRequires:  %{python_module python-crfsuite}
+BuildRequires:  %{python_module requests}
+BuildRequires:  %{python_module scikit-learn}
+BuildRequires:  %{python_module scipy}
+BuildRequires:  %{python_module text-unidecode}
+BuildRequires:  %{python_module tqdm}
+BuildRequires:  %{python_module twython}
+#
 Requires:       python-regex
 Requires:       python-six
 Recommends:     python-gensim
@@ -49,19 +80,41 @@ Requires(postun):update-alternatives
 BuildArch:      noarch
 %python_subpackages
 
+# changedir = nltk/test
+
 %description
 NLTK -- the Natural Language Toolkit -- is a suite of
 Python modules, data sets and tutorials supporting research and
 development in Natural Language Processing.
 
 %prep
-%autosetup -p1 -n %{pyname}-%{version}
+%autosetup -p1 -a1 -n %{pyname}-%{version}
 
-# sed -i "1,4{/\/usr\/bin\/env/d}" nltk/corpus/reader/knbc.py
-# sed -i "1,4{/\/usr\/bin\/env/d}" nltk/test/runtests.py
-# sed -i "1,4{/\/usr\/bin\/env/d}" nltk/test/unit/test_tgrep.py
-# sed -i "1,4{/\/usr\/bin\/env/d}" nltk/tgrep.py
-# sed -i "1,4{/\/usr\/bin\/env/d}" nltk/tokenize/stanford_segmenter.py
+# Remove obsolete scripts
+rm tools/nltk_term_index.py tools/run_doctests.py nltk_data/corpora/semcor/semcor.py
+
+# Fix EOL: strip the CR of CRLF endings (a CR elsewhere becomes a line break)
+sed -i 's/\r/\n/g; s/\n$//' \
+    README.md \
+    nltk/corpus/reader/knbc.py \
+    nltk/test/unit/test_tgrep.py \
+    nltk/tgrep.py \
+    nltk/tokenize/stanford_segmenter.py
+
+# Remove unneeded env-python shebangs from modules under nltk/
+sed -E -i "/#![[:space:]]*\/usr\/bin\/env python/d" \
+    nltk/tgrep.py \
+    nltk/tokenize/stanford_segmenter.py \
+    nltk/test/unit/test_tgrep.py \
+    nltk/corpus/reader/knbc.py
+
+# Switch env-based shebangs (python, python3, ...) to the standard interpreter
+sed -E -i "s|#![[:space:]]*%{_bindir}/env python[0-9.]*|#!%{_bindir}/python3|" \
+    setup.py \
+    tools/global_replace.py \
+    nltk_data/corpora/pl196x/splitter.py \
+    tools/find_deprecated.py \
+    tools/svnmime.py
 
 %build
 %python_build
@@ -75,8 +128,9 @@ chmod -x %{buildroot}%{$python_sitelib}/nltk/test/dependency.doctest
 }
 
 %check
-# FOLLOWING http://www.nltk.org/install.html
-%python_exec -c "import nltk" || exit 1
+export NLTK_DATA=$(readlink -f ./nltk_data/)
+# export PYTEST_ADDOPTS="--doctest-modules"
+%pytest -k 'not network'
 
 %post
 %python_install_alternative nltk
diff --git a/skip-networked-test.patch b/skip-networked-test.patch
new file mode 100644
index 0000000..82d62af
--- /dev/null
+++ b/skip-networked-test.patch
@@ -0,0 +1,35 @@
+---
+ nltk/test/unit/test_downloader.py |    4 ++++
+ setup.cfg                         |    4 ++++
+ 2 files changed, 8 insertions(+)
+
+--- a/nltk/test/unit/test_downloader.py
++++ b/nltk/test/unit/test_downloader.py
+@@ -1,6 +1,9 @@
+ from nltk import download
+ 
++import pytest
+ 
++
++@pytest.mark.network
+ def test_downloader_using_existing_parent_download_dir(tmp_path):
+     """Test that download works properly when the parent folder of the download_dir exists"""
+ 
+@@ -9,6 +12,7 @@ def test_downloader_using_existing_paren
+     assert download_status is True
+ 
+ 
++@pytest.mark.network
+ def test_downloader_using_non_existing_parent_download_dir(tmp_path):
+     """Test that download works properly when the parent folder of the download_dir does not exist"""
+ 
+--- a/setup.cfg
++++ b/setup.cfg
+@@ -1,3 +1,7 @@
++[tool:pytest]
++markers =
++    network: test case requires network connection
++
+ [metadata]
+ license_files =
+     LICENSE.txt