commit 3ac2448eef712434b97e01b0f81cb9c7da70f66d3340db3bbd7803c178bc4f4f Author: Matej Cepl Date: Tue Jul 2 21:12:29 2024 +0000 - Use tarball from GitHub instead of the Zip archive from PyPI, the latter has very messy combination of CRLF and LF EOLs, which are hard to patch. - Refresh all patches from the original locations. - Add CVE-2024-39705-disable-download.patch to make a crude workaround around CVE-2024-39705 (gh#nltk/nltk#3266, bsc#1227174). OBS-URL: https://build.opensuse.org/package/show/devel:languages:python/python-nltk?expand=0&rev=47 diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..9b03811 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,23 @@ +## Default LFS +*.7z filter=lfs diff=lfs merge=lfs -text +*.bsp filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.gem filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.jar filter=lfs diff=lfs merge=lfs -text +*.lz filter=lfs diff=lfs merge=lfs -text +*.lzma filter=lfs diff=lfs merge=lfs -text +*.obscpio filter=lfs diff=lfs merge=lfs -text +*.oxt filter=lfs diff=lfs merge=lfs -text +*.pdf filter=lfs diff=lfs merge=lfs -text +*.png filter=lfs diff=lfs merge=lfs -text +*.rpm filter=lfs diff=lfs merge=lfs -text +*.tbz filter=lfs diff=lfs merge=lfs -text +*.tbz2 filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.ttf filter=lfs diff=lfs merge=lfs -text +*.txz filter=lfs diff=lfs merge=lfs -text +*.whl filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..57affb6 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.osc diff --git a/CVE-2024-39705-disable-download.patch b/CVE-2024-39705-disable-download.patch new file mode 100644 index 0000000..917d451 --- /dev/null +++ b/CVE-2024-39705-disable-download.patch @@ -0,0 +1,104 
@@ +--- + nltk/app/chartparser_app.py | 13 +++++++++++++ + nltk/corpus/reader/util.py | 2 ++ + nltk/data.py | 2 ++ + nltk/parse/transitionparser.py | 2 ++ + nltk/tbl/demo.py | 4 +++- + 5 files changed, 22 insertions(+), 1 deletion(-) + +--- a/nltk/app/chartparser_app.py ++++ b/nltk/app/chartparser_app.py +@@ -800,6 +800,10 @@ class ChartComparer: + showerror("Error Saving Chart", f"Unable to open file: {filename!r}\n{e}") + + def load_chart_dialog(self, *args): ++ showerror("Security Error", ++ "Due to gh#nltk/nltk#3266, deserializing from " + ++ "a pickle is forbidden.") ++ return + filename = askopenfilename( + filetypes=self.CHART_FILE_TYPES, defaultextension=".pickle" + ) +@@ -811,6 +815,8 @@ class ChartComparer: + showerror("Error Loading Chart", f"Unable to open file: {filename!r}\n{e}") + + def load_chart(self, filename): ++ raise RuntimeError("Due to gh#nltk/nltk#3266, deserializing from " + ++ "a pickle is forbidden.") + with open(filename, "rb") as infile: + chart = pickle.load(infile) + name = os.path.basename(filename) +@@ -2268,6 +2274,10 @@ class ChartParserApp: + if not filename: + return + try: ++ showerror("Security Error", ++ "Due to gh#nltk/nltk#3266, deserializing from " + ++ "a pickle is forbidden.") ++ return + with open(filename, "rb") as infile: + chart = pickle.load(infile) + self._chart = chart +@@ -2306,6 +2316,9 @@ class ChartParserApp: + return + try: + if filename.endswith(".pickle"): ++ showerror("Due to gh#nltk/nltk#3266, deserializing from " + ++ "a pickle is forbidden.") ++ return + with open(filename, "rb") as infile: + grammar = pickle.load(infile) + else: +--- a/nltk/corpus/reader/util.py ++++ b/nltk/corpus/reader/util.py +@@ -521,6 +521,8 @@ class PickleCorpusView(StreamBackedCorpu + + def read_block(self, stream): + result = [] ++ raise RuntimeError("Due to gh#nltk/nltk#3266, deserializing from " + ++ "a pickle is forbidden.") + for i in range(self.BLOCK_SIZE): + try: + result.append(pickle.load(stream)) +--- a/nltk/data.py 
++++ b/nltk/data.py +@@ -752,6 +752,8 @@ def load( + if format == "raw": + resource_val = opened_resource.read() + elif format == "pickle": ++ raise RuntimeError("Due to gh#nltk/nltk#3266, deserializing from " + ++ "a pickle is forbidden.") + resource_val = pickle.load(opened_resource) + elif format == "json": + import json +--- a/nltk/parse/transitionparser.py ++++ b/nltk/parse/transitionparser.py +@@ -553,6 +553,8 @@ class TransitionParser(ParserI): + """ + result = [] + # First load the model ++ raise RuntimeError("Due to gh#nltk/nltk#3266, deserializing from " + ++ "a pickle is forbidden.") + model = pickle.load(open(modelFile, "rb")) + operation = Transition(self._algorithm) + +--- a/nltk/tbl/demo.py ++++ b/nltk/tbl/demo.py +@@ -253,6 +253,8 @@ def postag( + ) + ) + with open(cache_baseline_tagger) as print_rules: ++ raise RuntimeError("Due to gh#nltk/nltk#3266, deserializing from " + ++ "a pickle is forbidden.") + baseline_tagger = pickle.load(print_rules) + print(f"Reloaded pickled tagger from {cache_baseline_tagger}") + else: +@@ -327,7 +329,7 @@ def postag( + with open(serialize_output) as print_rules: + brill_tagger_reloaded = pickle.load(print_rules) + print(f"Reloaded pickled tagger from {serialize_output}") +- taggedtest_reloaded = brill_tagger.tag_sents(testing_data) ++ taggedtest_reloaded = brill_tagger_reloaded.tag_sents(testing_data) + if taggedtest == taggedtest_reloaded: + print("Reloaded tagger tried on test set, results identical") + else: diff --git a/nltk-3.8.1.tar.gz b/nltk-3.8.1.tar.gz new file mode 100644 index 0000000..47e3c7c --- /dev/null +++ b/nltk-3.8.1.tar.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:676970e2b7aa0a7184e68f76e0c4f2756fd1b82559a509d5656a23117faeb658 +size 2867926 diff --git a/nltk-pr3207-py312.patch b/nltk-pr3207-py312.patch new file mode 100644 index 0000000..8524834 --- /dev/null +++ b/nltk-pr3207-py312.patch @@ -0,0 +1,110 @@ +From 25d35fc4283dedd2053ec6d821f4b707fff8d72c Mon Sep 17 
00:00:00 2001 +From: Konstantin Chernyshev +Date: Thu, 16 Nov 2023 19:00:15 +0100 +Subject: [PATCH 1/8] ci: enable 3.12 in ci tests + +--- + .github/workflows/ci.yaml | 2 +- + README.md | 2 +- + nltk/test/unit/translate/test_bleu.py | 1 - + nltk/translate/bleu_score.py | 29 +++++++++++++++++++++++++++-- + setup.py | 3 ++- + 5 files changed, 31 insertions(+), 6 deletions(-) + +--- a/.github/workflows/ci.yaml ++++ b/.github/workflows/ci.yaml +@@ -76,7 +76,7 @@ jobs: + needs: [cache_nltk_data, cache_third_party] + strategy: + matrix: +- python-version: ['3.7', '3.8', '3.9', '3.10', '3.11'] ++ python-version: ['3.7', '3.8', '3.9', '3.10', '3.11', '3.12'] + os: [ubuntu-latest, macos-latest, windows-latest] + fail-fast: false + runs-on: ${{ matrix.os }} +--- a/README.md ++++ b/README.md +@@ -4,7 +4,7 @@ + + NLTK -- the Natural Language Toolkit -- is a suite of open source Python + modules, data sets, and tutorials supporting research and development in Natural +-Language Processing. NLTK requires Python version 3.7, 3.8, 3.9, 3.10 or 3.11. ++Language Processing. NLTK requires Python version 3.7, 3.8, 3.9, 3.10, 3.11 or 3.12. + + For documentation, please visit [nltk.org](https://www.nltk.org/). 
+ +--- a/nltk/test/unit/translate/test_bleu.py ++++ b/nltk/test/unit/translate/test_bleu.py +@@ -2,7 +2,6 @@ + Tests for BLEU translation evaluation metric + """ + +-import io + import unittest + + from nltk.data import find +--- a/nltk/translate/bleu_score.py ++++ b/nltk/translate/bleu_score.py +@@ -7,16 +7,41 @@ + # For license information, see LICENSE.TXT + + """BLEU score implementation.""" +- + import math + import sys + import warnings + from collections import Counter +-from fractions import Fraction ++from fractions import Fraction as _Fraction + + from nltk.util import ngrams + + ++class Fraction(_Fraction): ++ """Fraction with _normalize=False support for 3.12""" ++ ++ def __new__(cls, numerator=0, denominator=None, _normalize=False): ++ if sys.version_info >= (3, 12): ++ self = super().__new__(cls, numerator, denominator) ++ else: ++ self = super().__new__(cls, numerator, denominator, _normalize=_normalize) ++ self._normalize = _normalize ++ self._original_numerator = numerator ++ self._original_denominator = denominator ++ return self ++ ++ @property ++ def numerator(self): ++ if not self._normalize: ++ return self._original_numerator ++ return super().numerator ++ ++ @property ++ def denominator(self): ++ if not self._normalize: ++ return self._original_denominator ++ return super().denominator ++ ++ + def sentence_bleu( + references, + hypothesis, +--- a/setup.py ++++ b/setup.py +@@ -67,7 +67,7 @@ setup( + }, + long_description="""\ + The Natural Language Toolkit (NLTK) is a Python package for +-natural language processing. NLTK requires Python 3.7, 3.8, 3.9, 3.10 or 3.11.""", ++natural language processing. NLTK requires Python 3.7, 3.8, 3.9, 3.10, 3.11 or 3.12.""", + license="Apache License, Version 2.0", + keywords=[ + "NLP", +@@ -100,6 +100,7 @@ natural language processing. 
NLTK requi + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", ++ "Programming Language :: Python :: 3.12", + "Topic :: Scientific/Engineering", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Topic :: Scientific/Engineering :: Human Machine Interfaces", diff --git a/nltk_data.tar.xz b/nltk_data.tar.xz new file mode 100644 index 0000000..97a033a --- /dev/null +++ b/nltk_data.tar.xz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f79462ac99f414b4850943720bed4a59c1bb15bfc8f1ce16b26165da6db07680 +size 393271816 diff --git a/python-nltk.changes b/python-nltk.changes new file mode 100644 index 0000000..2b401ed --- /dev/null +++ b/python-nltk.changes @@ -0,0 +1,409 @@ +------------------------------------------------------------------- +Mon Jul 1 21:02:45 UTC 2024 - Matej Cepl + +- Use tarball from GitHub instead of the Zip archive from PyPI, + the latter has very messy combination of CRLF and LF EOLs, + which are hard to patch. +- Refresh all patches from the original locations. +- Add CVE-2024-39705-disable-download.patch to make a crude + workaround around CVE-2024-39705 (gh#nltk/nltk#3266, + bsc#1227174). 
+ +------------------------------------------------------------------- +Thu Mar 21 17:41:52 UTC 2024 - Ben Greiner + +- Update to 3.8.1 + * Resolve RCE & XSS vulnerabilities in localhost WordNet Browser + * Add Python 3.11 support +- Update nltk_data archive +- Drop port-2to3.patch +- Add nltk-pr3207-py312.patch for Python 3.12 support + * gh#nltk/nltk#3207 + +------------------------------------------------------------------- +Tue Mar 28 08:36:04 UTC 2023 - pgajdos@suse.com + +- python-six is not required + +------------------------------------------------------------------- +Fri Jan 6 15:32:43 UTC 2023 - Yogalakshmi Arunachalam + +- Update to 3.8 + +* Refactor dispersion plot (#3082) +* Provide type hints for LazyCorpusLoader variables (#3081) +* Throw warning when LanguageModel is initialized with incorrect vocabulary (#3080) +* Fix WordNet's all_synsets() function (#3078) +* Resolve TreebankWordDetokenizer inconsistency with end-of-string contractions (#3070) +* Support both iso639-3 codes and BCP-47 language tags (#3060) +* Avoid DeprecationWarning in Regexp tokenizer (#3055) +* Fix many doctests, add doctests to CI (#3054, #3050, #3048) +* Fix bool field not being read in VerbNet (#3044) +* Greatly improve time efficiency of SyllableTokenizer when tokenizing numbers (#3042) +* Fix encodings of Polish udhr corpus reader (#3038) +* Allow TweetTokenizer to tokenize emoji flag sequences (#3034) +* Prevent LazyModule from increasing the size of nltk.__dict__ (#3033) +* Fix CoreNLPServer non-default port issue (#3031) +* Add "acion" suffix to the Spanish SnowballStemmer (#3030) +* Allow loading WordNet without OMW (#3026) +* Use input() in nltk.chat.chatbot() for Jupyter support (#3022) +* Fix edit_distance_align() in distance.py (#3017) +* Tackle performance and accuracy regression of sentence tokenizer since NLTK 3.6.6 (#3014) +* Add the Iota operator to semantic logic (#3010) +* Resolve critical errors in WordNet app (#3008) +* Resolve critical error in CHILDES 
Corpus (#2998) +* Make WordNet information_content() accept adjective satellites (#2995) +* Add "strict=True" parameter to CoreNLP (#2993, #3043) +* Resolve issue with WordNet's synset_from_sense_key (#2988) +* Handle WordNet synsets that were lost in mapping (#2985) +* Resolve TypeError in Boxer (#2979) +* Add function to retrieve WordNet synonyms (#2978) +* Warn about nonexistent OMW offsets instead of raising an error (#2974) +* Fix missing ic argument in res, jcn and lin similarity functions of WordNet (#2970) +* Add support for the extended OMW (#2946) +* Fix LC cutoff policy of text tiling (#2936) +* Optimize ConditionalFreqDist.__add__ performance (#2939) +* Add Markdown corpus reader (#2902) + +------------------------------------------------------------------- +Mon Dec 26 10:41:22 UTC 2022 - Matej Cepl + +- Complete nltk_data.tar.xz for offline testing +- Fix failing tests (gh#nltk/nltk#2969) by adding patches: + - port-2to3.patch + - skip-networked-test.patch +- Clean up the SPEC to get rid of rpmlint warnings. + +------------------------------------------------------------------- +Tue Mar 22 07:48:14 UTC 2022 - Matej Cepl + +- Update to 3.7 + - Improve and update the NLTK team page on nltk.org (#2855, + #2941) + - Drop support for Python 3.6, support Python 3.10 (#2920) +- Update to 3.6.7 + - Resolve IndexError in `sent_tokenize` and `word_tokenize` + (#2922) +- Update to 3.6.6 + - Refactor `gensim.doctest` to work for gensim 4.0.0 and up + (#2914) + - Add Precision, Recall, F-measure, Confusion Matrix to Taggers + (#2862) + - Added warnings if .zip files exist without any corresponding + .csv files. (#2908) + - Fix `FileNotFoundError` when the `download_dir` is + a non-existing nested folder (#2910) + - Rename omw to omw-1.4 (#2907) + - Resolve ReDoS opportunity by fixing incorrectly specified + regex (#2906, bsc#1191030, CVE-2021-3828). 
+ - Support OMW 1.4 (#2899) + - Deprecate Tree get and set node methods (#2900) + - Fix broken inaugural test case (#2903) + - Use Multilingual Wordnet Data from OMW with newer Wordnet + versions (#2889) + - Keep NLTKs "tokenize" module working with pathlib (#2896) + - Make prettyprinter to be more readable (#2893) + - Update links to the nltk book (#2895) + - Add `CITATION.cff` to nltk (#2880) + - Resolve serious ReDoS in PunktSentenceTokenizer (#2869) + - Delete old CI config files (#2881) + - Improve Tokenize documentation + add TokenizerI as superclass + for TweetTokenizer (#2878) + - Fix expected value for BLEU score doctest after changes from + #2572 + - Add multi Bleu functionality and tests (#2793) + - Deprecate 'return_str' parameter in NLTKWordTokenizer and + TreebankWordTokenizer (#2883) + - Allow empty string in CFG's + more (#2888) + - Partition `tree.py` module into `tree` package + pickle fix + (#2863) + - Fix several TreebankWordTokenizer and NLTKWordTokenizer bugs + (#2877) + - Rewind Wordnet data file after each lookup (#2868) + - Correct __init__ call for SyntaxCorpusReader subclasses + (#2872) + - Documentation fixes (#2873) + - Fix levenstein distance for duplicated letters (#2849) + - Support alternative Wordnet versions (#2860) + - Remove hundreds of formatting warnings for nltk.org (#2859) + - Modernize `nltk.org/howto` pages (#2856) + - Fix Bleu Score smoothing function from taking log(0) (#2839) + - Update third party tools to newer versions and removing + MaltParser fixed version (#2832) + - Fix TypeError: _pretty() takes 1 positional argument but 2 + were given in sem/drt.py (#2854) + - Replace `http` with `https` in most URLs (#2852) +- Update to 3.6.5 + - modernised nltk.org website + - addressed LGTM.com issues + - support ZWJ sequences emoji and skin tone modifer emoji in + TweetTokenizer + - METEOR evaluation now requires pre-tokenized input + - Code linting and type hinting + - implement get_refs function for DrtLambdaExpression + 
- Enable automated CoreNLP, Senna, Prover9/Mace4, Megam, + MaltParser CI tests + - specify minimum regex version that supports regex.Pattern + - avoid re.Pattern and regex.Pattern which fail for Python 3.6, + 3.7 +- Update to 3.6.4 + - deprecate `nltk.usage(obj)` in favor of `help(obj)` + - resolve ReDoS vulnerability in Corpus Reader + - solidify performance tests + - improve phone number recognition in tweet tokenizer + - refactored CISTEM stemmer for German + - identify NLTK Team as the author + - replace travis badge with github actions badge + - add SECURITY.md +- Update to 3.6.3 + - Dropped support for Python 3.5 + - Run CI tests on Windows, too + - Moved from Travis CI to GitHub Actions + - Code and comment cleanups + - Visualize WordNet relation graphs using Graphviz + - Fixed large error in METEOR score + - Apply isort, pyupgrade, black, added as pre-commit hooks + - Prevent debug_decisions in Punkt from throwing IndexError + - Resolved ZeroDivisionError in RIBES with dissimilar sentences + - Initialize WordNet IC total counts with smoothing value + - Fixed AttributeError for Arabic ARLSTem2 stemmer + - Many fixes and improvements to lm language model package + - Fix bug in nltk.metrics.aline, C_skip = -10 + - Improvements to TweetTokenizer + - Optional show arg for FreqDist.plot, ConditionalFreqDist.plot + - edit_distance now computes Damerau-Levenshtein edit-distance +- Update to 3.6.2 + - move test code to nltk/test + - fix bug in NgramAssocMeasures (order preserving fix) +- Update to 3.6 + - add support for Python 3.9 + - add Tree.fromlist + - compute Minimum Spanning Tree of unweighted graph using BFS + - fix bug with infinite loop in Wordnet closure and tree + - fix bug in calculating BLEU using smoothing method 4 + - Wordnet synset similarities work for all pos + - new Arabic light stemmer (ARLSTem2) + - new syllable tokenizer (LegalitySyllableTokenizer) + - remove nose in favor of pytest + 
+------------------------------------------------------------------- +Thu Apr 23 13:54:08 UTC 2020 - John Vandenberg + +- Update to v3.5 + * add support for Python 3.8 + * drop support for Python 2 + * create NLTK's own Tokenizer class distinct from the Treebank + reference tokeniser + * update Vader sentiment analyser + * fix JSON serialization of some PoS taggers + * minor improvements in grammar.CFG, Vader, pl196x corpus reader, + StringTokenizer + * change implementation <= and >= for FreqDist so they are partial + orders + * make FreqDist iterable + * correctly handle Penn Treebank trees with a unlabeled branching + top node + +------------------------------------------------------------------- +Sat Mar 14 09:07:16 UTC 2020 - Tomáš Chvátal + +- Fix build without python2 + +------------------------------------------------------------------- +Mon Oct 14 14:00:43 UTC 2019 - Matej Cepl + +- Replace %fdupes -s with plain %fdupes; hardlinks are better. + +------------------------------------------------------------------- +Wed Sep 11 11:05:01 UTC 2019 - Tomáš Chvátal + +- Update to 3.4.5 (bsc#1146427, CVE-2019-14751): + * Fixed security bug in downloader: Zip slip vulnerability - for the + unlikely situation where a user configures their downloader to use + a compromised server CVE-2019-14751 + +------------------------------------------------------------------- +Tue Jul 23 13:52:24 UTC 2019 - Tomáš Chvátal + +- Update to 3.4.4: + * fix bug in plot function (probability.py) + * add improved PanLex Swadesh corpus reader + * add Text.generate() + * add QuadgramAssocMeasures + * add SSP to tokenizers + * return confidence of best tag from AveragedPerceptron + * make plot methods return Axes objects + * don't require list arguments to PositiveNaiveBayesClassifier.train + * fix Tree classes to work with native Python copy library + * fix inconsistency for NomBank + * fix random seeding in LanguageModel.generate + * fix ConditionalFreqDist mutation on tabulate/plot call + 
* fix broken links in documentation + * fix misc Wordnet issues + * update installation instructions + +------------------------------------------------------------------- +Thu May 23 12:41:31 UTC 2019 - pgajdos@suse.com + +- version update to 3.4.1 + * add chomsky_normal_form for CFGs + * add meteor score + * add minimum edit/Levenshtein distance based alignment function + * allow access to collocation list via text.collocation_list() + * support corenlp server options + * drop support for Python 3.4 + * other minor fixes + +------------------------------------------------------------------- +Sun Feb 10 16:19:17 UTC 2019 - John Vandenberg + +- Remove Python 3 dependency on singledispatch + +------------------------------------------------------------------- +Sat Feb 9 16:16:11 UTC 2019 - John Vandenberg + +- Update to v3.4 + + Support Python 3.7 + + New Language Modeling package + + Cistem Stemmer for German + + Support Russian National Corpus incl POS tag model + + Krippendorf Alpha inter-rater reliability test + + Comprehensive code clean-ups + + Switch continuous integration from Jenkins to Travis +- from v3.3 + + Support Python 3.6 + + New interface to CoreNLP + + Support synset retrieval by sense key + + Minor fixes to CoNLL Corpus Reader + + AlignedSent + + Fixed minor inconsistencies in APIs and API documentation + + Better conformance to PEP8 + + Drop Moses Tokenizer (incompatible license) + +------------------------------------------------------------------- +Wed Feb 6 09:44:56 UTC 2019 - John Vandenberg + +- Add missing dependency six +- Remove unnecessary build dependency six +- Recommend all optional dependencies + +------------------------------------------------------------------- +Tue Mar 6 20:35:00 UTC 2018 - jengelh@inai.de + +- Trim redundant wording from description. 
+ +------------------------------------------------------------------- +Mon Mar 5 15:02:00 UTC 2018 - badshah400@gmail.com + +- Use \%license instead of \%doc to install License.txt. + +------------------------------------------------------------------- +Tue Jan 30 17:16:13 UTC 2018 - guigo.lourenco@gmail.com + +- Depend on the full python interpreter to fix sqlite3 import + during %check + +------------------------------------------------------------------- +Tue Jan 16 11:02:13 UTC 2018 - guigo.lourenco@gmail.com + +- Depend on python-rpm-macros +- Build for both Python2 and Python3 + +------------------------------------------------------------------- +Tue Dec 19 15:50:13 UTC 2017 - badshah400@gmail.com + +- Update to version 3.2.5: + * Arabic stemmers (ARLSTem, Snowball) + * NIST MT evaluation metric and added NIST + international_tokenize + * Moses tokenizer + * Document Russian tagger + * Fix to Stanford segmenter + * Improve treebank detokenizer, VerbNet, Vader + * Misc code and documentation cleanups + * Implement fixes suggested by LGTM +- Convert specfile to python single-spec style. +- Drop unneeded BuildRequires: python-PyYAML, python-xml, + python-devel; not required for building. +- Change existing Requires to Recommends: these are really needed + for additional features, and not required for basic nltk usage. +- Add new Recommends: python-scipy, python-matplotlib, + python-pyparsing, and python-gensim; enables other optional + features. +- Run fdupes to link-up duplicate files. +- Remove exec permissions for a file not intended to be executed + (not in exec path, no hashbang, etc.) +- Remove hashbangs from non-executable files. +- Run tests following the suggestion from + http://www.nltk.org/install.html. 
+ +------------------------------------------------------------------- +Tue Feb 21 13:11:31 UTC 2017 - stephan.barth@suse.com + +- update to version 3.2.2 + Upstream changelog: + Support for Aline, ChrF and GLEU MT evaluation metrics, Russian POS tagger + model, Moses detokenizer, rewrite Porter Stemmer and FrameNet corpus reader, + update FrameNet Corpus to version 1.7, fixes: stanford_segmenter.py, + SentiText, CoNLL Corpus Reader, BLEU, naivebayes, Krippendorff’s alpha, + Punkt, Moses tokenizer, TweetTokenizer, ToktokTokenizer; improvements to + testing framework + +------------------------------------------------------------------- +Fri Oct 14 00:31:15 UTC 2016 - toddrme2178@gmail.com + +- Update to version 3.2.1 + + No changelog available + +------------------------------------------------------------------- +Thu May 21 14:53:43 UTC 2015 - toddrme2178@gmail.com + +- Remove upstreamed nltk-2.0.4-dont-use-python-distribute.patch +- Update to version 3.0.2 + + No changelog available + +------------------------------------------------------------------- +Sun Dec 8 13:33:14 UTC 2013 - p.drouand@gmail.com + +- Update to version 2.0.4 + + No changelog available +- Add nltk-2.0.4-dont-use-python-distribute.patch ; force use of + python-setuptools instead of python-distribute + +------------------------------------------------------------------- +Thu Oct 24 11:09:19 UTC 2013 - speilicke@suse.com + +- Require python-setuptools instead of distribute (upstreams merged) + +------------------------------------------------------------------- +Fri Sep 23 12:29:05 UTC 2011 - saschpe@suse.de + +- Update to version 2.0.1rc1 + +------------------------------------------------------------------- +Sun Feb 7 18:51:07 CST 2010 - oddrationale@gmail.com + +- fixed copyright and license statements +- removed PyYAML, and added dependency to installers and download + instructions +- updated to LogicParser, DRT (Dan Garrette) +- WordNet similarity metrics return None instead of -1 when + 
they fail to find a path (Steve Bethard) +- shortest_path_distance uses instance hypernyms (Jordan + Boyd-Graber) +- clean_html improved (Bjorn Maeland) +- batch_parse, batch_interpret and batch_evaluate functions allow + grammar or grammar filename as argument +- more Portuguese examples (portuguese_en.doctest, examples/pt.py) + +------------------------------------------------------------------- +Thu Dec 10 17:23:51 CST 2009 - oddrationale@gmail.com + +- added python-nltk-remove-yaml.patch to prevent conflict with + python-yaml +- added Requires: python-yaml + +------------------------------------------------------------------- +Wed Dec 9 15:39:35 CST 2009 - oddrationale@gmail.com + +- Initial Release (Version 2.0b7): Sun Feb 7 18:50:18 CST 2010 diff --git a/python-nltk.rpmlintrc b/python-nltk.rpmlintrc new file mode 100644 index 0000000..1c89015 --- /dev/null +++ b/python-nltk.rpmlintrc @@ -0,0 +1,2 @@ +addFilter("E: zero-length /usr/lib/python3\.\d+/site-packages/nltk/tbl/api\.py") +addFilter("explicit-lib-dependency python3\d*-joblib") diff --git a/python-nltk.spec b/python-nltk.spec new file mode 100644 index 0000000..21b8297 --- /dev/null +++ b/python-nltk.spec @@ -0,0 +1,193 @@ +# +# spec file for package python-nltk +# +# Copyright (c) 2024 SUSE LLC +# +# All modifications and additions to the file contributed by third parties +# remain the property of their copyright owners, unless otherwise agreed +# upon. The license for this file, and modifications and additions to the +# file, is the same license as for the pristine package itself (unless the +# license for the pristine package is not an Open Source License, in which +# case the license is the MIT License). An "Open Source License" is a +# license that conforms to the Open Source Definition (Version 1.9) +# published by the Open Source Initiative.
+ +# Please submit bugfixes or comments via https://bugs.opensuse.org/ +# + + +%define modname nltk +Name: python-nltk +Version: 3.8.1 +Release: 0 +Summary: Natural Language Toolkit +License: Apache-2.0 +URL: http://nltk.org/ +# SourceRepository: https://github.com/nltk/nltk +Source0: https://github.com/nltk/%{modname}/archive/refs/tags/%{version}.tar.gz#/%{modname}-%{version}.tar.gz +# Download/Update NLTK data: +# quilt setup python-nltk.spec +# pushd nltk-?.?.? +# python3 -m nltk.downloader -d nltk_data tests \ +# averaged_perceptron_tagger_ru \ +# brown \ +# cess_cat \ +# cess_esp \ +# conll2007 \ +# floresta \ +# gutenberg \ +# inaugural \ +# indian \ +# large_grammars \ +# nombank.1.0 \ +# omw-1.4 \ +# pl196x \ +# ptb \ +# punkt \ +# rte \ +# sinica_treebank \ +# stopwords \ +# treebank \ +# udhr \ +# universal_tagset \ +# wordnet \ +# wordnet_ic \ +# words +# tar -cJf ../nltk_data.tar.xz nltk_data +# popd +# see https://www.nltk.org/data.html for more details +Source1: nltk_data.tar.xz +Source99: python-nltk.rpmlintrc +# PATCH-FIX-UPSTREAM skip-networked-test.patch gh#nltk/nltk#2969 mcepl@suse.com +# skip tests requiring network connection +Patch0: skip-networked-test.patch +# PATCH-FIX-UPSTREAM nltk-pr3207-py312.patch gh#nltk/nltk#3207 +Patch1: nltk-pr3207-py312.patch +# PATCH-FIX-UPSTREAM CVE-2024-39705-disable-download.patch bsc#1227174 mcepl@suse.com +# crude workaround for CVE-2024-39705: forbid pickle deserialization (gh#nltk/nltk#3266) +Patch2: CVE-2024-39705-disable-download.patch +BuildRequires: %{python_module base >= 3.7} +BuildRequires: %{python_module pip} +BuildRequires: %{python_module setuptools} +BuildRequires: %{python_module wheel} +BuildRequires: %{pythons} +BuildRequires: fdupes +BuildRequires: python-rpm-macros +BuildRequires: unzip +# SECTION runtime +BuildRequires: %{python_module regex >= 2021.8.3} +BuildRequires: %{python_module click} +BuildRequires: %{python_module joblib} +BuildRequires: %{python_module tqdm} +# /SECTION +# SECTION test +BuildRequires: %{python_module tk}
+BuildRequires: %{python_module Jinja2} +BuildRequires: %{python_module matplotlib} +BuildRequires: %{python_module numpy} +BuildRequires: %{python_module pyparsing} +BuildRequires: %{python_module pytest-cov} +BuildRequires: %{python_module pytest-mock} +BuildRequires: %{python_module pytest} +BuildRequires: %{python_module python-crfsuite} +BuildRequires: %{python_module requests} +BuildRequires: %{python_module scikit-learn} +BuildRequires: %{python_module scipy} +BuildRequires: %{python_module text-unidecode} +BuildRequires: %{python_module twython} +# /SECTION +Requires: python-regex >= 2021.8.3 +Requires: python-click +Requires: python-joblib +Requires: python-tqdm +Recommends: python-gensim +Recommends: python-matplotlib +Recommends: python-numpy +Recommends: python-pyparsing +Recommends: python-python-crfsuite +Recommends: python-requests +Recommends: python-scikit-learn +Recommends: python-scipy +Recommends: python-twython +Requires(post): update-alternatives +Requires(postun): update-alternatives +BuildArch: noarch +%python_subpackages + +# changedir = nltk/test + +%description +NLTK -- the Natural Language Toolkit -- is a suite of +Python modules, data sets and tutorials supporting research and +development in Natural Language Processing. 
+ +%prep +%setup -q -a1 -n %{modname}-%{version} + +# Fix EOL +sed -i 's/\r/\n/g; s/\n$//' \ + README.md \ + nltk/corpus/reader/knbc.py \ + nltk/test/unit/test_tgrep.py \ + nltk/tgrep.py \ + nltk/tokenize/stanford_segmenter.py \ + nltk/corpus/reader/knbc.py \ + nltk/test/unit/test_tgrep.py \ + nltk/tgrep.py \ + nltk/tokenize/stanford_segmenter.py \ + nltk/corpus/reader/knbc.py \ + nltk/test/unit/test_tgrep.py \ + nltk/tgrep.py \ + nltk/tokenize/stanford_segmenter.py + +# Remove unrequired shebangs +sed -E -i "/#![[:space:]]*\/usr\/bin\/env python/d" \ + nltk/tgrep.py \ + nltk/tokenize/stanford_segmenter.py \ + nltk/test/unit/test_tgrep.py \ + nltk/corpus/reader/knbc.py + +# Switch shebangs to the standard Python interpreter +sed -E -i "s|#![[:space:]]*%{_bindir}/env python|#!%{_bindir}/python3|" \ + setup.py \ + tools/global_replace.py \ + nltk_data/corpora/pl196x/splitter.py \ + tools/find_deprecated.py + +%autopatch -p1 + +%build +%pyproject_wheel + +%install +%pyproject_install +%python_clone -a %{buildroot}%{_bindir}/nltk + +%{python_expand %fdupes %{buildroot}%{$python_sitelib}/ +chmod -x %{buildroot}%{$python_sitelib}/nltk/test/dependency.doctest +} + +%check +export NLTK_DATA=$(readlink -f ./nltk_data/) +# export PYTEST_ADDOPTS="--doctest-modules" +# Skip tests requiring pickle.load gh#nltk/nltk#3266 (CVE-2024-39705) +skip_tests=" or test_basic or test_increment or test_pad_asterisk or test_pad_dotdot" +skip_tests+=" or test_pos_tag_eng or test_pos_tag_eng_universal or test_pos_tag_rus" +skip_tests+=" or test_pos_tag_rus_universal or test_pos_tag_unknown_lang" +skip_tests+=" or test_sent_tokenize or test_unspecified_lang or test_word_tokenize" +%pytest -k "not (network ${skip_tests})" + +%post +%python_install_alternative nltk + +%postun +%python_uninstall_alternative nltk + +%files %{python_files} +%doc README.md +%license LICENSE.txt +%{python_sitelib}/nltk/ +%{python_sitelib}/nltk-%{version}.dist-info/ +%python_alternative %{_bindir}/nltk + +%changelog 
diff --git a/skip-networked-test.patch b/skip-networked-test.patch new file mode 100644 index 0000000..f1cd8f7 --- /dev/null +++ b/skip-networked-test.patch @@ -0,0 +1,35 @@ +--- + nltk/test/unit/test_downloader.py | 4 ++++ + setup.cfg | 4 ++++ + 2 files changed, 8 insertions(+) + +--- a/nltk/test/unit/test_downloader.py ++++ b/nltk/test/unit/test_downloader.py +@@ -1,6 +1,9 @@ + from nltk import download + ++import pytest + ++ ++@pytest.mark.network + def test_downloader_using_existing_parent_download_dir(tmp_path): + """Test that download works properly when the parent folder of the download_dir exists""" + +@@ -9,6 +12,7 @@ def test_downloader_using_existing_paren + assert download_status is True + + ++@pytest.mark.network + def test_downloader_using_non_existing_parent_download_dir(tmp_path): + """Test that download works properly when the parent folder of the download_dir does not exist""" + +--- a/setup.cfg ++++ b/setup.cfg +@@ -1,3 +1,7 @@ ++[tool:pytest] ++markers = ++ network: test case requires network connection ++ + [metadata] + license_files = + LICENSE.txt