forked from pool/python-nltk
- Add CVE-2024-39705.patch upstream patch to fix unsafe pickle usage.
(CVE-2024-39705, gh#nltk/nltk#3266, bsc#1227174). - Drop CVE-2024-39705-disable-download.patch as it's not needed anymore. OBS-URL: https://build.opensuse.org/package/show/devel:languages:python/python-nltk?expand=0&rev=49
This commit is contained in:
commit
9adfbd9e5d
23
.gitattributes
vendored
Normal file
23
.gitattributes
vendored
Normal file
@ -0,0 +1,23 @@
|
||||
## Default LFS
|
||||
*.7z filter=lfs diff=lfs merge=lfs -text
|
||||
*.bsp filter=lfs diff=lfs merge=lfs -text
|
||||
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
||||
*.gem filter=lfs diff=lfs merge=lfs -text
|
||||
*.gz filter=lfs diff=lfs merge=lfs -text
|
||||
*.jar filter=lfs diff=lfs merge=lfs -text
|
||||
*.lz filter=lfs diff=lfs merge=lfs -text
|
||||
*.lzma filter=lfs diff=lfs merge=lfs -text
|
||||
*.obscpio filter=lfs diff=lfs merge=lfs -text
|
||||
*.oxt filter=lfs diff=lfs merge=lfs -text
|
||||
*.pdf filter=lfs diff=lfs merge=lfs -text
|
||||
*.png filter=lfs diff=lfs merge=lfs -text
|
||||
*.rpm filter=lfs diff=lfs merge=lfs -text
|
||||
*.tbz filter=lfs diff=lfs merge=lfs -text
|
||||
*.tbz2 filter=lfs diff=lfs merge=lfs -text
|
||||
*.tgz filter=lfs diff=lfs merge=lfs -text
|
||||
*.ttf filter=lfs diff=lfs merge=lfs -text
|
||||
*.txz filter=lfs diff=lfs merge=lfs -text
|
||||
*.whl filter=lfs diff=lfs merge=lfs -text
|
||||
*.xz filter=lfs diff=lfs merge=lfs -text
|
||||
*.zip filter=lfs diff=lfs merge=lfs -text
|
||||
*.zst filter=lfs diff=lfs merge=lfs -text
|
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
||||
.osc
|
104
CVE-2024-39705-disable-download.patch
Normal file
104
CVE-2024-39705-disable-download.patch
Normal file
@ -0,0 +1,104 @@
|
||||
---
|
||||
nltk/app/chartparser_app.py | 13 +++++++++++++
|
||||
nltk/corpus/reader/util.py | 2 ++
|
||||
nltk/data.py | 2 ++
|
||||
nltk/parse/transitionparser.py | 2 ++
|
||||
nltk/tbl/demo.py | 4 +++-
|
||||
5 files changed, 22 insertions(+), 1 deletion(-)
|
||||
|
||||
--- a/nltk/app/chartparser_app.py
|
||||
+++ b/nltk/app/chartparser_app.py
|
||||
@@ -800,6 +800,10 @@ class ChartComparer:
|
||||
showerror("Error Saving Chart", f"Unable to open file: {filename!r}\n{e}")
|
||||
|
||||
def load_chart_dialog(self, *args):
|
||||
+ showerror("Security Error",
|
||||
+ "Due to gh#nltk/nltk#3266, deserializing from " +
|
||||
+ "a pickle is forbidden.")
|
||||
+ return
|
||||
filename = askopenfilename(
|
||||
filetypes=self.CHART_FILE_TYPES, defaultextension=".pickle"
|
||||
)
|
||||
@@ -811,6 +815,8 @@ class ChartComparer:
|
||||
showerror("Error Loading Chart", f"Unable to open file: {filename!r}\n{e}")
|
||||
|
||||
def load_chart(self, filename):
|
||||
+ raise RuntimeError("Due to gh#nltk/nltk#3266, deserializing from " +
|
||||
+ "a pickle is forbidden.")
|
||||
with open(filename, "rb") as infile:
|
||||
chart = pickle.load(infile)
|
||||
name = os.path.basename(filename)
|
||||
@@ -2268,6 +2274,10 @@ class ChartParserApp:
|
||||
if not filename:
|
||||
return
|
||||
try:
|
||||
+ showerror("Security Error",
|
||||
+ "Due to gh#nltk/nltk#3266, deserializing from " +
|
||||
+ "a pickle is forbidden.")
|
||||
+ return
|
||||
with open(filename, "rb") as infile:
|
||||
chart = pickle.load(infile)
|
||||
self._chart = chart
|
||||
@@ -2306,6 +2316,9 @@ class ChartParserApp:
|
||||
return
|
||||
try:
|
||||
if filename.endswith(".pickle"):
|
||||
+ showerror("Due to gh#nltk/nltk#3266, deserializing from " +
|
||||
+ "a pickle is forbidden.")
|
||||
+ return
|
||||
with open(filename, "rb") as infile:
|
||||
grammar = pickle.load(infile)
|
||||
else:
|
||||
--- a/nltk/corpus/reader/util.py
|
||||
+++ b/nltk/corpus/reader/util.py
|
||||
@@ -521,6 +521,8 @@ class PickleCorpusView(StreamBackedCorpu
|
||||
|
||||
def read_block(self, stream):
|
||||
result = []
|
||||
+ raise RuntimeError("Due to gh#nltk/nltk#3266, deserializing from " +
|
||||
+ "a pickle is forbidden.")
|
||||
for i in range(self.BLOCK_SIZE):
|
||||
try:
|
||||
result.append(pickle.load(stream))
|
||||
--- a/nltk/data.py
|
||||
+++ b/nltk/data.py
|
||||
@@ -752,6 +752,8 @@ def load(
|
||||
if format == "raw":
|
||||
resource_val = opened_resource.read()
|
||||
elif format == "pickle":
|
||||
+ raise RuntimeError("Due to gh#nltk/nltk#3266, deserializing from " +
|
||||
+ "a pickle is forbidden.")
|
||||
resource_val = pickle.load(opened_resource)
|
||||
elif format == "json":
|
||||
import json
|
||||
--- a/nltk/parse/transitionparser.py
|
||||
+++ b/nltk/parse/transitionparser.py
|
||||
@@ -553,6 +553,8 @@ class TransitionParser(ParserI):
|
||||
"""
|
||||
result = []
|
||||
# First load the model
|
||||
+ raise RuntimeError("Due to gh#nltk/nltk#3266, deserializing from " +
|
||||
+ "a pickle is forbidden.")
|
||||
model = pickle.load(open(modelFile, "rb"))
|
||||
operation = Transition(self._algorithm)
|
||||
|
||||
--- a/nltk/tbl/demo.py
|
||||
+++ b/nltk/tbl/demo.py
|
||||
@@ -253,6 +253,8 @@ def postag(
|
||||
)
|
||||
)
|
||||
with open(cache_baseline_tagger) as print_rules:
|
||||
+ raise RuntimeError("Due to gh#nltk/nltk#3266, deserializing from " +
|
||||
+ "a pickle is forbidden.")
|
||||
baseline_tagger = pickle.load(print_rules)
|
||||
print(f"Reloaded pickled tagger from {cache_baseline_tagger}")
|
||||
else:
|
||||
@@ -327,7 +329,7 @@ def postag(
|
||||
with open(serialize_output) as print_rules:
|
||||
brill_tagger_reloaded = pickle.load(print_rules)
|
||||
print(f"Reloaded pickled tagger from {serialize_output}")
|
||||
- taggedtest_reloaded = brill_tagger.tag_sents(testing_data)
|
||||
+ taggedtest_reloaded = brill_tagger_reloaded.tag_sents(testing_data)
|
||||
if taggedtest == taggedtest_reloaded:
|
||||
print("Reloaded tagger tried on test set, results identical")
|
||||
else:
|
38
CVE-2024-39705.patch
Normal file
38
CVE-2024-39705.patch
Normal file
@ -0,0 +1,38 @@
|
||||
From a12d0a6a8cdba58d5e4e5f92ac62bb80fc26c624 Mon Sep 17 00:00:00 2001
|
||||
From: Eric Kafe <kafe.eric@gmail.com>
|
||||
Date: Tue, 23 Jul 2024 09:09:09 +0200
|
||||
Subject: [PATCH] Prevent data.load from unpickling classes or functions
|
||||
|
||||
---
|
||||
nltk/data.py | 11 ++++++++++-
|
||||
1 file changed, 10 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/nltk/data.py b/nltk/data.py
|
||||
index cc9229b0a2..fb242721c5 100644
|
||||
--- a/nltk/data.py
|
||||
+++ b/nltk/data.py
|
||||
@@ -658,6 +658,15 @@ def retrieve(resource_url, filename=None, verbose=True):
|
||||
}
|
||||
|
||||
|
||||
+def restricted_pickle_load(string):
|
||||
+ """
|
||||
+ Prevents any class or function from loading.
|
||||
+ """
|
||||
+ from nltk.app.wordnet_app import RestrictedUnpickler
|
||||
+
|
||||
+ return RestrictedUnpickler(BytesIO(string)).load()
|
||||
+
|
||||
+
|
||||
def load(
|
||||
resource_url,
|
||||
format="auto",
|
||||
@@ -751,7 +760,7 @@ def load(
|
||||
if format == "raw":
|
||||
resource_val = opened_resource.read()
|
||||
elif format == "pickle":
|
||||
- resource_val = pickle.load(opened_resource)
|
||||
+ resource_val = restricted_pickle_load(opened_resource.read())
|
||||
elif format == "json":
|
||||
import json
|
||||
|
3
nltk-3.8.1.tar.gz
Normal file
3
nltk-3.8.1.tar.gz
Normal file
@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:676970e2b7aa0a7184e68f76e0c4f2756fd1b82559a509d5656a23117faeb658
|
||||
size 2867926
|
110
nltk-pr3207-py312.patch
Normal file
110
nltk-pr3207-py312.patch
Normal file
@ -0,0 +1,110 @@
|
||||
From 25d35fc4283dedd2053ec6d821f4b707fff8d72c Mon Sep 17 00:00:00 2001
|
||||
From: Konstantin Chernyshev <k4black@ya.ru>
|
||||
Date: Thu, 16 Nov 2023 19:00:15 +0100
|
||||
Subject: [PATCH 1/8] ci: enable 3.12 in ci tests
|
||||
|
||||
---
|
||||
.github/workflows/ci.yaml | 2 +-
|
||||
README.md | 2 +-
|
||||
nltk/test/unit/translate/test_bleu.py | 1 -
|
||||
nltk/translate/bleu_score.py | 29 +++++++++++++++++++++++++++--
|
||||
setup.py | 3 ++-
|
||||
5 files changed, 31 insertions(+), 6 deletions(-)
|
||||
|
||||
--- a/.github/workflows/ci.yaml
|
||||
+++ b/.github/workflows/ci.yaml
|
||||
@@ -76,7 +76,7 @@ jobs:
|
||||
needs: [cache_nltk_data, cache_third_party]
|
||||
strategy:
|
||||
matrix:
|
||||
- python-version: ['3.7', '3.8', '3.9', '3.10', '3.11']
|
||||
+ python-version: ['3.7', '3.8', '3.9', '3.10', '3.11', '3.12']
|
||||
os: [ubuntu-latest, macos-latest, windows-latest]
|
||||
fail-fast: false
|
||||
runs-on: ${{ matrix.os }}
|
||||
--- a/README.md
|
||||
+++ b/README.md
|
||||
@@ -4,7 +4,7 @@
|
||||
|
||||
NLTK -- the Natural Language Toolkit -- is a suite of open source Python
|
||||
modules, data sets, and tutorials supporting research and development in Natural
|
||||
-Language Processing. NLTK requires Python version 3.7, 3.8, 3.9, 3.10 or 3.11.
|
||||
+Language Processing. NLTK requires Python version 3.7, 3.8, 3.9, 3.10, 3.11 or 3.12.
|
||||
|
||||
For documentation, please visit [nltk.org](https://www.nltk.org/).
|
||||
|
||||
--- a/nltk/test/unit/translate/test_bleu.py
|
||||
+++ b/nltk/test/unit/translate/test_bleu.py
|
||||
@@ -2,7 +2,6 @@
|
||||
Tests for BLEU translation evaluation metric
|
||||
"""
|
||||
|
||||
-import io
|
||||
import unittest
|
||||
|
||||
from nltk.data import find
|
||||
--- a/nltk/translate/bleu_score.py
|
||||
+++ b/nltk/translate/bleu_score.py
|
||||
@@ -7,16 +7,41 @@
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""BLEU score implementation."""
|
||||
-
|
||||
import math
|
||||
import sys
|
||||
import warnings
|
||||
from collections import Counter
|
||||
-from fractions import Fraction
|
||||
+from fractions import Fraction as _Fraction
|
||||
|
||||
from nltk.util import ngrams
|
||||
|
||||
|
||||
+class Fraction(_Fraction):
|
||||
+ """Fraction with _normalize=False support for 3.12"""
|
||||
+
|
||||
+ def __new__(cls, numerator=0, denominator=None, _normalize=False):
|
||||
+ if sys.version_info >= (3, 12):
|
||||
+ self = super().__new__(cls, numerator, denominator)
|
||||
+ else:
|
||||
+ self = super().__new__(cls, numerator, denominator, _normalize=_normalize)
|
||||
+ self._normalize = _normalize
|
||||
+ self._original_numerator = numerator
|
||||
+ self._original_denominator = denominator
|
||||
+ return self
|
||||
+
|
||||
+ @property
|
||||
+ def numerator(self):
|
||||
+ if not self._normalize:
|
||||
+ return self._original_numerator
|
||||
+ return super().numerator
|
||||
+
|
||||
+ @property
|
||||
+ def denominator(self):
|
||||
+ if not self._normalize:
|
||||
+ return self._original_denominator
|
||||
+ return super().denominator
|
||||
+
|
||||
+
|
||||
def sentence_bleu(
|
||||
references,
|
||||
hypothesis,
|
||||
--- a/setup.py
|
||||
+++ b/setup.py
|
||||
@@ -67,7 +67,7 @@ setup(
|
||||
},
|
||||
long_description="""\
|
||||
The Natural Language Toolkit (NLTK) is a Python package for
|
||||
-natural language processing. NLTK requires Python 3.7, 3.8, 3.9, 3.10 or 3.11.""",
|
||||
+natural language processing. NLTK requires Python 3.7, 3.8, 3.9, 3.10, 3.11 or 3.12.""",
|
||||
license="Apache License, Version 2.0",
|
||||
keywords=[
|
||||
"NLP",
|
||||
@@ -100,6 +100,7 @@ natural language processing. NLTK requi
|
||||
"Programming Language :: Python :: 3.9",
|
||||
"Programming Language :: Python :: 3.10",
|
||||
"Programming Language :: Python :: 3.11",
|
||||
+ "Programming Language :: Python :: 3.12",
|
||||
"Topic :: Scientific/Engineering",
|
||||
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
||||
"Topic :: Scientific/Engineering :: Human Machine Interfaces",
|
3
nltk_data.tar.xz
Normal file
3
nltk_data.tar.xz
Normal file
@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:f79462ac99f414b4850943720bed4a59c1bb15bfc8f1ce16b26165da6db07680
|
||||
size 393271816
|
417
python-nltk.changes
Normal file
417
python-nltk.changes
Normal file
@ -0,0 +1,417 @@
|
||||
-------------------------------------------------------------------
|
||||
Fri Jul 26 07:14:33 UTC 2024 - Daniel Garcia <daniel.garcia@suse.com>
|
||||
|
||||
- Add CVE-2024-39705.patch upstream patch to fix unsafe pickle usage.
|
||||
(CVE-2024-39705, gh#nltk/nltk#3266, bsc#1227174).
|
||||
- Drop CVE-2024-39705-disable-download.patch as it's not needed
|
||||
anymore.
|
||||
|
||||
-------------------------------------------------------------------
|
||||
Mon Jul 1 21:02:45 UTC 2024 - Matej Cepl <mcepl@cepl.eu>
|
||||
|
||||
- Use tarball from GitHub instead of the Zip archive from PyPI,
|
||||
the latter has very messy combination of CRLF and LF EOLs,
|
||||
which are hard to patch.
|
||||
- Refresh all patches from the original locations.
|
||||
- Add CVE-2024-39705-disable-download.patch to make a crude
|
||||
workaround around CVE-2024-39705 (gh#nltk/nltk#3266,
|
||||
bsc#1227174).
|
||||
|
||||
-------------------------------------------------------------------
|
||||
Thu Mar 21 17:41:52 UTC 2024 - Ben Greiner <code@bnavigator.de>
|
||||
|
||||
- Update to 3.8.1
|
||||
* Resolve RCE & XSS vulnerabilities in localhost WordNet Browser
|
||||
* Add Python 3.11 support
|
||||
- Update nltk_data archive
|
||||
- Drop port-2to3.patch
|
||||
- Add nltk-pr3207-py312.patch for Python 3.12 support
|
||||
* gh#nltk/nltk#3207
|
||||
|
||||
-------------------------------------------------------------------
|
||||
Tue Mar 28 08:36:04 UTC 2023 - pgajdos@suse.com
|
||||
|
||||
- python-six is not required
|
||||
|
||||
-------------------------------------------------------------------
|
||||
Fri Jan 6 15:32:43 UTC 2023 - Yogalakshmi Arunachalam <yarunachalam@suse.com>
|
||||
|
||||
- Update to 3.8
|
||||
|
||||
* Refactor dispersion plot (#3082)
|
||||
* Provide type hints for LazyCorpusLoader variables (#3081)
|
||||
* Throw warning when LanguageModel is initialized with incorrect vocabulary (#3080)
|
||||
* Fix WordNet's all_synsets() function (#3078)
|
||||
* Resolve TreebankWordDetokenizer inconsistency with end-of-string contractions (#3070)
|
||||
* Support both iso639-3 codes and BCP-47 language tags (#3060)
|
||||
* Avoid DeprecationWarning in Regexp tokenizer (#3055)
|
||||
* Fix many doctests, add doctests to CI (#3054, #3050, #3048)
|
||||
* Fix bool field not being read in VerbNet (#3044)
|
||||
* Greatly improve time efficiency of SyllableTokenizer when tokenizing numbers (#3042)
|
||||
* Fix encodings of Polish udhr corpus reader (#3038)
|
||||
* Allow TweetTokenizer to tokenize emoji flag sequences (#3034)
|
||||
* Prevent LazyModule from increasing the size of nltk.__dict__ (#3033)
|
||||
* Fix CoreNLPServer non-default port issue (#3031)
|
||||
* Add "acion" suffix to the Spanish SnowballStemmer (#3030)
|
||||
* Allow loading WordNet without OMW (#3026)
|
||||
* Use input() in nltk.chat.chatbot() for Jupyter support (#3022)
|
||||
* Fix edit_distance_align() in distance.py (#3017)
|
||||
* Tackle performance and accuracy regression of sentence tokenizer since NLTK 3.6.6 (#3014)
|
||||
* Add the Iota operator to semantic logic (#3010)
|
||||
* Resolve critical errors in WordNet app (#3008)
|
||||
* Resolve critical error in CHILDES Corpus (#2998)
|
||||
* Make WordNet information_content() accept adjective satellites (#2995)
|
||||
* Add "strict=True" parameter to CoreNLP (#2993, #3043)
|
||||
* Resolve issue with WordNet's synset_from_sense_key (#2988)
|
||||
* Handle WordNet synsets that were lost in mapping (#2985)
|
||||
* Resolve TypeError in Boxer (#2979)
|
||||
* Add function to retrieve WordNet synonyms (#2978)
|
||||
* Warn about nonexistent OMW offsets instead of raising an error (#2974)
|
||||
* Fix missing ic argument in res, jcn and lin similarity functions of WordNet (#2970)
|
||||
* Add support for the extended OMW (#2946)
|
||||
* Fix LC cutoff policy of text tiling (#2936)
|
||||
* Optimize ConditionalFreqDist.__add__ performance (#2939)
|
||||
* Add Markdown corpus reader (#2902)
|
||||
|
||||
-------------------------------------------------------------------
|
||||
Mon Dec 26 10:41:22 UTC 2022 - Matej Cepl <mcepl@suse.com>
|
||||
|
||||
- Complete nltk_data.tar.xz for offline testing
|
||||
- Fix failing tests (gh#nltk/nltk#2969) by adding patches:
|
||||
- port-2to3.patch
|
||||
- skip-networked-test.patch
|
||||
- Clean up the SPEC to get rid of rpmlint warnings.
|
||||
|
||||
-------------------------------------------------------------------
|
||||
Tue Mar 22 07:48:14 UTC 2022 - Matej Cepl <mcepl@suse.com>
|
||||
|
||||
- Update to 3.7
|
||||
- Improve and update the NLTK team page on nltk.org (#2855,
|
||||
#2941)
|
||||
- Drop support for Python 3.6, support Python 3.10 (#2920)
|
||||
- Update to 3.6.7
|
||||
- Resolve IndexError in `sent_tokenize` and `word_tokenize`
|
||||
(#2922)
|
||||
- Update to 3.6.6
|
||||
- Refactor `gensim.doctest` to work for gensim 4.0.0 and up
|
||||
(#2914)
|
||||
- Add Precision, Recall, F-measure, Confusion Matrix to Taggers
|
||||
(#2862)
|
||||
- Added warnings if .zip files exist without any corresponding
|
||||
.csv files. (#2908)
|
||||
- Fix `FileNotFoundError` when the `download_dir` is
|
||||
a non-existing nested folder (#2910)
|
||||
- Rename omw to omw-1.4 (#2907)
|
||||
- Resolve ReDoS opportunity by fixing incorrectly specified
|
||||
regex (#2906, bsc#1191030, CVE-2021-3828).
|
||||
- Support OMW 1.4 (#2899)
|
||||
- Deprecate Tree get and set node methods (#2900)
|
||||
- Fix broken inaugural test case (#2903)
|
||||
- Use Multilingual Wordnet Data from OMW with newer Wordnet
|
||||
versions (#2889)
|
||||
- Keep NLTKs "tokenize" module working with pathlib (#2896)
|
||||
- Make prettyprinter to be more readable (#2893)
|
||||
- Update links to the nltk book (#2895)
|
||||
- Add `CITATION.cff` to nltk (#2880)
|
||||
- Resolve serious ReDoS in PunktSentenceTokenizer (#2869)
|
||||
- Delete old CI config files (#2881)
|
||||
- Improve Tokenize documentation + add TokenizerI as superclass
|
||||
for TweetTokenizer (#2878)
|
||||
- Fix expected value for BLEU score doctest after changes from
|
||||
#2572
|
||||
- Add multi Bleu functionality and tests (#2793)
|
||||
- Deprecate 'return_str' parameter in NLTKWordTokenizer and
|
||||
TreebankWordTokenizer (#2883)
|
||||
- Allow empty string in CFG's + more (#2888)
|
||||
- Partition `tree.py` module into `tree` package + pickle fix
|
||||
(#2863)
|
||||
- Fix several TreebankWordTokenizer and NLTKWordTokenizer bugs
|
||||
(#2877)
|
||||
- Rewind Wordnet data file after each lookup (#2868)
|
||||
- Correct __init__ call for SyntaxCorpusReader subclasses
|
||||
(#2872)
|
||||
- Documentation fixes (#2873)
|
||||
- Fix levenstein distance for duplicated letters (#2849)
|
||||
- Support alternative Wordnet versions (#2860)
|
||||
- Remove hundreds of formatting warnings for nltk.org (#2859)
|
||||
- Modernize `nltk.org/howto` pages (#2856)
|
||||
- Fix Bleu Score smoothing function from taking log(0) (#2839)
|
||||
- Update third party tools to newer versions and removing
|
||||
MaltParser fixed version (#2832)
|
||||
- Fix TypeError: _pretty() takes 1 positional argument but 2
|
||||
were given in sem/drt.py (#2854)
|
||||
- Replace `http` with `https` in most URLs (#2852)
|
||||
- Update to 3.6.5
|
||||
- modernised nltk.org website
|
||||
- addressed LGTM.com issues
|
||||
- support ZWJ sequences emoji and skin tone modifier emoji in
|
||||
TweetTokenizer
|
||||
- METEOR evaluation now requires pre-tokenized input
|
||||
- Code linting and type hinting
|
||||
- implement get_refs function for DrtLambdaExpression
|
||||
- Enable automated CoreNLP, Senna, Prover9/Mace4, Megam,
|
||||
MaltParser CI tests
|
||||
- specify minimum regex version that supports regex.Pattern
|
||||
- avoid re.Pattern and regex.Pattern which fail for Python 3.6,
|
||||
3.7
|
||||
- Update to 3.6.4
|
||||
- deprecate `nltk.usage(obj)` in favor of `help(obj)`
|
||||
- resolve ReDoS vulnerability in Corpus Reader
|
||||
- solidify performance tests
|
||||
- improve phone number recognition in tweet tokenizer
|
||||
- refactored CISTEM stemmer for German
|
||||
- identify NLTK Team as the author
|
||||
- replace travis badge with github actions badge
|
||||
- add SECURITY.md
|
||||
- Update to 3.6.3
|
||||
- Dropped support for Python 3.5
|
||||
- Run CI tests on Windows, too
|
||||
- Moved from Travis CI to GitHub Actions
|
||||
- Code and comment cleanups
|
||||
- Visualize WordNet relation graphs using Graphviz
|
||||
- Fixed large error in METEOR score
|
||||
- Apply isort, pyupgrade, black, added as pre-commit hooks
|
||||
- Prevent debug_decisions in Punkt from throwing IndexError
|
||||
- Resolved ZeroDivisionError in RIBES with dissimilar sentences
|
||||
- Initialize WordNet IC total counts with smoothing value
|
||||
- Fixed AttributeError for Arabic ARLSTem2 stemmer
|
||||
- Many fixes and improvements to lm language model package
|
||||
- Fix bug in nltk.metrics.aline, C_skip = -10
|
||||
- Improvements to TweetTokenizer
|
||||
- Optional show arg for FreqDist.plot, ConditionalFreqDist.plot
|
||||
- edit_distance now computes Damerau-Levenshtein edit-distance
|
||||
- Update to 3.6.2
|
||||
- move test code to nltk/test
|
||||
- fix bug in NgramAssocMeasures (order preserving fix)
|
||||
- Update to 3.6
|
||||
- add support for Python 3.9
|
||||
- add Tree.fromlist
|
||||
- compute Minimum Spanning Tree of unweighted graph using BFS
|
||||
- fix bug with infinite loop in Wordnet closure and tree
|
||||
- fix bug in calculating BLEU using smoothing method 4
|
||||
- Wordnet synset similarities work for all pos
|
||||
- new Arabic light stemmer (ARLSTem2)
|
||||
- new syllable tokenizer (LegalitySyllableTokenizer)
|
||||
- remove nose in favor of pytest
|
||||
|
||||
-------------------------------------------------------------------
|
||||
Thu Apr 23 13:54:08 UTC 2020 - John Vandenberg <jayvdb@gmail.com>
|
||||
|
||||
- Update to v3.5
|
||||
* add support for Python 3.8
|
||||
* drop support for Python 2
|
||||
* create NLTK's own Tokenizer class distinct from the Treebank
|
||||
reference tokeniser
|
||||
* update Vader sentiment analyser
|
||||
* fix JSON serialization of some PoS taggers
|
||||
* minor improvements in grammar.CFG, Vader, pl196x corpus reader,
|
||||
StringTokenizer
|
||||
* change implementation <= and >= for FreqDist so they are partial
|
||||
orders
|
||||
* make FreqDist iterable
|
||||
* correctly handle Penn Treebank trees with a unlabeled branching
|
||||
top node
|
||||
|
||||
-------------------------------------------------------------------
|
||||
Sat Mar 14 09:07:16 UTC 2020 - Tomáš Chvátal <tchvatal@suse.com>
|
||||
|
||||
- Fix build without python2
|
||||
|
||||
-------------------------------------------------------------------
|
||||
Mon Oct 14 14:00:43 UTC 2019 - Matej Cepl <mcepl@suse.com>
|
||||
|
||||
- Replace %fdupes -s with plain %fdupes; hardlinks are better.
|
||||
|
||||
-------------------------------------------------------------------
|
||||
Wed Sep 11 11:05:01 UTC 2019 - Tomáš Chvátal <tchvatal@suse.com>
|
||||
|
||||
- Update to 3.4.5 (bsc#1146427, CVE-2019-14751):
|
||||
* Fixed security bug in downloader: Zip slip vulnerability - for the
|
||||
unlikely situation where a user configures their downloader to use
|
||||
a compromised server CVE-2019-14751
|
||||
|
||||
-------------------------------------------------------------------
|
||||
Tue Jul 23 13:52:24 UTC 2019 - Tomáš Chvátal <tchvatal@suse.com>
|
||||
|
||||
- Update to 3.4.4:
|
||||
* fix bug in plot function (probability.py)
|
||||
* add improved PanLex Swadesh corpus reader
|
||||
* add Text.generate()
|
||||
* add QuadgramAssocMeasures
|
||||
* add SSP to tokenizers
|
||||
* return confidence of best tag from AveragedPerceptron
|
||||
* make plot methods return Axes objects
|
||||
* don't require list arguments to PositiveNaiveBayesClassifier.train
|
||||
* fix Tree classes to work with native Python copy library
|
||||
* fix inconsistency for NomBank
|
||||
* fix random seeding in LanguageModel.generate
|
||||
* fix ConditionalFreqDist mutation on tabulate/plot call
|
||||
* fix broken links in documentation
|
||||
* fix misc Wordnet issues
|
||||
* update installation instructions
|
||||
|
||||
-------------------------------------------------------------------
|
||||
Thu May 23 12:41:31 UTC 2019 - pgajdos@suse.com
|
||||
|
||||
- version update to 3.4.1
|
||||
* add chomsky_normal_form for CFGs
|
||||
* add meteor score
|
||||
* add minimum edit/Levenshtein distance based alignment function
|
||||
* allow access to collocation list via text.collocation_list()
|
||||
* support corenlp server options
|
||||
* drop support for Python 3.4
|
||||
* other minor fixes
|
||||
|
||||
-------------------------------------------------------------------
|
||||
Sun Feb 10 16:19:17 UTC 2019 - John Vandenberg <jayvdb@gmail.com>
|
||||
|
||||
- Remove Python 3 dependency on singledispatch
|
||||
|
||||
-------------------------------------------------------------------
|
||||
Sat Feb 9 16:16:11 UTC 2019 - John Vandenberg <jayvdb@gmail.com>
|
||||
|
||||
- Update to v3.4
|
||||
+ Support Python 3.7
|
||||
+ New Language Modeling package
|
||||
+ Cistem Stemmer for German
|
||||
+ Support Russian National Corpus incl POS tag model
|
||||
+ Krippendorf Alpha inter-rater reliability test
|
||||
+ Comprehensive code clean-ups
|
||||
+ Switch continuous integration from Jenkins to Travis
|
||||
- from v3.3
|
||||
+ Support Python 3.6
|
||||
+ New interface to CoreNLP
|
||||
+ Support synset retrieval by sense key
|
||||
+ Minor fixes to CoNLL Corpus Reader
|
||||
+ AlignedSent
|
||||
+ Fixed minor inconsistencies in APIs and API documentation
|
||||
+ Better conformance to PEP8
|
||||
+ Drop Moses Tokenizer (incompatible license)
|
||||
|
||||
-------------------------------------------------------------------
|
||||
Wed Feb 6 09:44:56 UTC 2019 - John Vandenberg <jayvdb@gmail.com>
|
||||
|
||||
- Add missing dependency six
|
||||
- Remove unnecessary build dependency six
|
||||
- Recommend all optional dependencies
|
||||
|
||||
-------------------------------------------------------------------
|
||||
Tue Mar 6 20:35:00 UTC 2018 - jengelh@inai.de
|
||||
|
||||
- Trim redundant wording from description.
|
||||
|
||||
-------------------------------------------------------------------
|
||||
Mon Mar 5 15:02:00 UTC 2018 - badshah400@gmail.com
|
||||
|
||||
- Use \%license instead of \%doc to install License.txt.
|
||||
|
||||
-------------------------------------------------------------------
|
||||
Tue Jan 30 17:16:13 UTC 2018 - guigo.lourenco@gmail.com
|
||||
|
||||
- Depend on the full python interpreter to fix sqlite3 import
|
||||
during %check
|
||||
|
||||
-------------------------------------------------------------------
|
||||
Tue Jan 16 11:02:13 UTC 2018 - guigo.lourenco@gmail.com
|
||||
|
||||
- Depend on python-rpm-macros
|
||||
- Build for both Python2 and Python3
|
||||
|
||||
-------------------------------------------------------------------
|
||||
Tue Dec 19 15:50:13 UTC 2017 - badshah400@gmail.com
|
||||
|
||||
- Update to version 3.2.5:
|
||||
* Arabic stemmers (ARLSTem, Snowball)
|
||||
* NIST MT evaluation metric and added NIST
|
||||
international_tokenize
|
||||
* Moses tokenizer
|
||||
* Document Russian tagger
|
||||
* Fix to Stanford segmenter
|
||||
* Improve treebank detokenizer, VerbNet, Vader
|
||||
* Misc code and documentation cleanups
|
||||
* Implement fixes suggested by LGTM
|
||||
- Convert specfile to python single-spec style.
|
||||
- Drop unneeded BuildRequires: python-PyYAML, python-xml,
|
||||
python-devel; not required for building.
|
||||
- Change existing Requires to Recommends: these are really needed
|
||||
for additional features, and not required for basic nltk usage.
|
||||
- Add new Recommends: python-scipy, python-matplotlib,
|
||||
python-pyparsing, and python-gensim; enables other optional
|
||||
features.
|
||||
- Run fdupes to link-up duplicate files.
|
||||
- Remove exec permissions for a file not intended to be executed
|
||||
(not in exec path, no hashbang, etc.)
|
||||
- Remove hashbangs from non-executable files.
|
||||
- Run tests following the suggestion from
|
||||
http://www.nltk.org/install.html.
|
||||
|
||||
-------------------------------------------------------------------
|
||||
Tue Feb 21 13:11:31 UTC 2017 - stephan.barth@suse.com
|
||||
|
||||
- update to version 3.2.2
|
||||
Upstream changelog:
|
||||
Support for Aline, ChrF and GLEU MT evaluation metrics, Russian POS tagger
|
||||
model, Moses detokenizer, rewrite Porter Stemmer and FrameNet corpus reader,
|
||||
update FrameNet Corpus to version 1.7, fixes: stanford_segmenter.py,
|
||||
SentiText, CoNLL Corpus Reader, BLEU, naivebayes, Krippendorff’s alpha,
|
||||
Punkt, Moses tokenizer, TweetTokenizer, ToktokTokenizer; improvements to
|
||||
testing framework
|
||||
|
||||
-------------------------------------------------------------------
|
||||
Fri Oct 14 00:31:15 UTC 2016 - toddrme2178@gmail.com
|
||||
|
||||
- Update to version 3.2.1
|
||||
+ No changelog available
|
||||
|
||||
-------------------------------------------------------------------
|
||||
Thu May 21 14:53:43 UTC 2015 - toddrme2178@gmail.com
|
||||
|
||||
- Remove upstreamed nltk-2.0.4-dont-use-python-distribute.patch
|
||||
- Update to version 3.0.2
|
||||
+ No changelog available
|
||||
|
||||
-------------------------------------------------------------------
|
||||
Sun Dec 8 13:33:14 UTC 2013 - p.drouand@gmail.com
|
||||
|
||||
- Update to version 2.0.4
|
||||
+ No changelog available
|
||||
- Add nltk-2.0.4-dont-use-python-distribute.patch ; force use of
|
||||
python-setuptools instead of python-distribute
|
||||
|
||||
-------------------------------------------------------------------
|
||||
Thu Oct 24 11:09:19 UTC 2013 - speilicke@suse.com
|
||||
|
||||
- Require python-setuptools instead of distribute (upstreams merged)
|
||||
|
||||
-------------------------------------------------------------------
|
||||
Fri Sep 23 12:29:05 UTC 2011 - saschpe@suse.de
|
||||
|
||||
- Update to version 2.0.1rc1
|
||||
|
||||
-------------------------------------------------------------------
|
||||
Sun Feb 7 18:51:07 CST 2010 - oddrationale@gmail.com
|
||||
|
||||
- fixed copyright and license statements
|
||||
- removed PyYAML, and added dependency to installers and download
|
||||
instructions
|
||||
- updated to LogicParser, DRT (Dan Garrette)
|
||||
- WordNet similarity metrics return None instead of -1 when
|
||||
they fail to find a path (Steve Bethard)
|
||||
- shortest_path_distance uses instance hypernyms (Jordan
|
||||
Boyd-Graber)
|
||||
- clean_html improved (Bjorn Maeland)
|
||||
- batch_parse, batch_interpret and batch_evaluate functions allow
|
||||
grammar or grammar filename as argument
|
||||
- more Portuguese examples (portuguese_en.doctest, examples/pt.py)
|
||||
|
||||
-------------------------------------------------------------------
|
||||
Thu Dec 10 17:23:51 CST 2009 - oddrationale@gmail.com
|
||||
|
||||
- added python-nltk-remove-yaml.patch to prevent conflict with
|
||||
python-yaml
|
||||
- added Requires: python-yaml
|
||||
|
||||
-------------------------------------------------------------------
|
||||
Wed Dec 9 15:39:35 CST 2009 - oddrationale@gmail.com
|
||||
|
||||
- Initial Release (Version 2.0b7): Sun Feb 7 18:50:18 CST 2010
|
2
python-nltk.rpmlintrc
Normal file
2
python-nltk.rpmlintrc
Normal file
@ -0,0 +1,2 @@
|
||||
addFilter("E: zero-length /usr/lib/python3\.\d+/site-packages/nltk/tbl/api\.py")
|
||||
addFilter("explicit-lib-dependency python3\d*-joblib")
|
192
python-nltk.spec
Normal file
192
python-nltk.spec
Normal file
@ -0,0 +1,192 @@
|
||||
#
|
||||
# spec file for package python-nltk
|
||||
#
|
||||
# Copyright (c) 2024 SUSE LLC
|
||||
#
|
||||
# All modifications and additions to the file contributed by third parties
|
||||
# remain the property of their copyright owners, unless otherwise agreed
|
||||
# upon. The license for this file, and modifications and additions to the
|
||||
# file, is the same license as for the pristine package itself (unless the
|
||||
# license for the pristine package is not an Open Source License, in which
|
||||
# case the license is the MIT License). An "Open Source License" is a
|
||||
# license that conforms to the Open Source Definition (Version 1.9)
|
||||
# published by the Open Source Initiative.
|
||||
|
||||
# Please submit bugfixes or comments via https://bugs.opensuse.org/
|
||||
#
|
||||
|
||||
|
||||
%define modname nltk
|
||||
Name: python-nltk
|
||||
Version: 3.8.1
|
||||
Release: 0
|
||||
Summary: Natural Language Toolkit
|
||||
License: Apache-2.0
|
||||
URL: http://nltk.org/
|
||||
# SourceRepository: https://github.com/nltk/nltk
|
||||
Source0: https://github.com/nltk/%{modname}/archive/refs/tags/%{version}.tar.gz#/%{modname}-%{version}.tar.gz
|
||||
# Download/Update NLTK data:
|
||||
# quilt setup python-nltk.spec
|
||||
# pushd nltk-?.?.?
|
||||
# python3 -m nltk.downloader -d nltk_data tests \
|
||||
# averaged_perceptron_tagger_ru \
|
||||
# brown \
|
||||
# cess_cat \
|
||||
# cess_esp \
|
||||
# conll2007 \
|
||||
# floresta \
|
||||
# gutenberg \
|
||||
# inaugural \
|
||||
# indian \
|
||||
# large_grammars \
|
||||
# nombank.1.0 \
|
||||
# omw-1.4 \
|
||||
# pl196x \
|
||||
# ptb \
|
||||
# punkt \
|
||||
# rte \
|
||||
# sinica_treebank \
|
||||
# stopwords \
|
||||
# treebank \
|
||||
# udhr \
|
||||
# universal_tagset \
|
||||
# wordnet \
|
||||
# wordnet_ic \
|
||||
# words
|
||||
# tar -cJf ../nltk_data.tar.xz nltk_data
|
||||
# popd
|
||||
# see https://www.nltk.org/data.html for more details
|
||||
Source1: nltk_data.tar.xz
|
||||
Source99: python-nltk.rpmlintrc
|
||||
# PATCH-FIX-UPSTREAM skip-networked-test.patch gh#nltk/nltk#2969 mcepl@suse.com
|
||||
# skip tests requiring network connection
|
||||
Patch0: skip-networked-test.patch
|
||||
# PATCH-FIX-UPSTREAM nltk-pr3207-py312.patch gh#nltk/nltk#3207
|
||||
Patch1: nltk-pr3207-py312.patch
|
||||
# PATCH-FIX-UPSTREAM CVE-2024-39705.patch bsc#1227174 gh#nltk/nltk#3290
|
||||
Patch2: CVE-2024-39705.patch
|
||||
BuildRequires: %{python_module base >= 3.7}
|
||||
BuildRequires: %{python_module pip}
|
||||
BuildRequires: %{python_module setuptools}
|
||||
BuildRequires: %{python_module wheel}
|
||||
BuildRequires: %{pythons}
|
||||
BuildRequires: fdupes
|
||||
BuildRequires: python-rpm-macros
|
||||
BuildRequires: unzip
|
||||
# SECTION runtime
|
||||
BuildRequires: %{python_module regex >= 2021.8.3}
|
||||
BuildRequires: %{python_module click}
|
||||
BuildRequires: %{python_module joblib}
|
||||
BuildRequires: %{python_module tqdm}
|
||||
# /SECTION
|
||||
# SECTION test
|
||||
BuildRequires: %{python_module tk}
|
||||
BuildRequires: %{python_module Jinja2}
|
||||
BuildRequires: %{python_module matplotlib}
|
||||
BuildRequires: %{python_module numpy}
|
||||
BuildRequires: %{python_module pyparsing}
|
||||
BuildRequires: %{python_module pytest-cov}
|
||||
BuildRequires: %{python_module pytest-mock}
|
||||
BuildRequires: %{python_module pytest}
|
||||
BuildRequires: %{python_module python-crfsuite}
|
||||
BuildRequires: %{python_module requests}
|
||||
BuildRequires: %{python_module scikit-learn}
|
||||
BuildRequires: %{python_module scipy}
|
||||
BuildRequires: %{python_module text-unidecode}
|
||||
BuildRequires: %{python_module twython}
|
||||
# /SECTION
|
||||
Requires: python-regex >= 2021.8.3
|
||||
Requires: python-click
|
||||
Requires: python-joblib
|
||||
Requires: python-tqdm
|
||||
Recommends: python-gensim
|
||||
Recommends: python-matplotlib
|
||||
Recommends: python-numpy
|
||||
Recommends: python-pyparsing
|
||||
Recommends: python-python-crfsuite
|
||||
Recommends: python-requests
|
||||
Recommends: python-scikit-learn
|
||||
Recommends: python-scipy
|
||||
Recommends: python-twython
|
||||
Requires(post): update-alternatives
|
||||
Requires(postun): update-alternatives
|
||||
BuildArch: noarch
|
||||
%python_subpackages
|
||||
|
||||
# changedir = nltk/test
|
||||
|
||||
%description
|
||||
NLTK -- the Natural Language Toolkit -- is a suite of
|
||||
Python modules, data sets and tutorials supporting research and
|
||||
development in Natural Language Processing.
|
||||
|
||||
%prep
|
||||
%setup -q -a1 -n %{modname}-%{version}
|
||||
|
||||
# Fix EOL
|
||||
sed -i 's/\r/\n/g; s/\n$//' \
|
||||
README.md \
|
||||
nltk/corpus/reader/knbc.py \
|
||||
nltk/test/unit/test_tgrep.py \
|
||||
nltk/tgrep.py \
|
||||
nltk/tokenize/stanford_segmenter.py \
|
||||
nltk/corpus/reader/knbc.py \
|
||||
nltk/test/unit/test_tgrep.py \
|
||||
nltk/tgrep.py \
|
||||
nltk/tokenize/stanford_segmenter.py \
|
||||
nltk/corpus/reader/knbc.py \
|
||||
nltk/test/unit/test_tgrep.py \
|
||||
nltk/tgrep.py \
|
||||
nltk/tokenize/stanford_segmenter.py
|
||||
|
||||
# Remove unrequired shebangs
|
||||
sed -E -i "/#![[:space:]]*\/usr\/bin\/env python/d" \
|
||||
nltk/tgrep.py \
|
||||
nltk/tokenize/stanford_segmenter.py \
|
||||
nltk/test/unit/test_tgrep.py \
|
||||
nltk/corpus/reader/knbc.py
|
||||
|
||||
# Switch shebangs to the standard Python interpreter
|
||||
sed -E -i "s|#![[:space:]]*%{_bindir}/env python|#!%{_bindir}/python3|" \
|
||||
setup.py \
|
||||
tools/global_replace.py \
|
||||
nltk_data/corpora/pl196x/splitter.py \
|
||||
tools/find_deprecated.py
|
||||
|
||||
%autopatch -p1
|
||||
|
||||
%build
|
||||
%pyproject_wheel
|
||||
|
||||
%install
|
||||
%pyproject_install
|
||||
%python_clone -a %{buildroot}%{_bindir}/nltk
|
||||
|
||||
%{python_expand %fdupes %{buildroot}%{$python_sitelib}/
|
||||
chmod -x %{buildroot}%{$python_sitelib}/nltk/test/dependency.doctest
|
||||
}
|
||||
|
||||
%check
|
||||
export NLTK_DATA=$(readlink -f ./nltk_data/)
|
||||
# export PYTEST_ADDOPTS="--doctest-modules"
|
||||
# Skip tests requiring pickle.load gh#nltk/nltk#3266 (CVE-2024-39705)
|
||||
skip_tests=" or test_basic or test_increment or test_pad_asterisk or test_pad_dotdot"
|
||||
skip_tests+=" or test_pos_tag_eng or test_pos_tag_eng_universal or test_pos_tag_rus"
|
||||
skip_tests+=" or test_pos_tag_rus_universal or test_pos_tag_unknown_lang"
|
||||
skip_tests+=" or test_sent_tokenize or test_unspecified_lang or test_word_tokenize"
|
||||
%pytest -k "not (network ${skip_tests})"
|
||||
|
||||
%post
|
||||
%python_install_alternative nltk
|
||||
|
||||
%postun
|
||||
%python_uninstall_alternative nltk
|
||||
|
||||
%files %{python_files}
|
||||
%doc README.md
|
||||
%license LICENSE.txt
|
||||
%{python_sitelib}/nltk/
|
||||
%{python_sitelib}/nltk-%{version}.dist-info/
|
||||
%python_alternative %{_bindir}/nltk
|
||||
|
||||
%changelog
|
35
skip-networked-test.patch
Normal file
35
skip-networked-test.patch
Normal file
@ -0,0 +1,35 @@
|
||||
---
|
||||
nltk/test/unit/test_downloader.py | 4 ++++
|
||||
setup.cfg | 4 ++++
|
||||
2 files changed, 8 insertions(+)
|
||||
|
||||
--- a/nltk/test/unit/test_downloader.py
|
||||
+++ b/nltk/test/unit/test_downloader.py
|
||||
@@ -1,6 +1,9 @@
|
||||
from nltk import download
|
||||
|
||||
+import pytest
|
||||
|
||||
+
|
||||
+@pytest.mark.network
|
||||
def test_downloader_using_existing_parent_download_dir(tmp_path):
|
||||
"""Test that download works properly when the parent folder of the download_dir exists"""
|
||||
|
||||
@@ -9,6 +12,7 @@ def test_downloader_using_existing_paren
|
||||
assert download_status is True
|
||||
|
||||
|
||||
+@pytest.mark.network
|
||||
def test_downloader_using_non_existing_parent_download_dir(tmp_path):
|
||||
"""Test that download works properly when the parent folder of the download_dir does not exist"""
|
||||
|
||||
--- a/setup.cfg
|
||||
+++ b/setup.cfg
|
||||
@@ -1,3 +1,7 @@
|
||||
+[tool:pytest]
|
||||
+markers =
|
||||
+ network: test case requires network connection
|
||||
+
|
||||
[metadata]
|
||||
license_files =
|
||||
LICENSE.txt
|
Loading…
Reference in New Issue
Block a user