forked from pool/python-nltk
- Add CVE-2024-39705.patch upstream patch to fix unsafe pickle usage.
(CVE-2024-39705, gh#nltk/nltk#3266, bsc#1227174). - Drop CVE-2024-39705-disable-download.patch as it's not needed anymore. OBS-URL: https://build.opensuse.org/package/show/devel:languages:python/python-nltk?expand=0&rev=49
This commit is contained in:
commit
9adfbd9e5d
23
.gitattributes
vendored
Normal file
23
.gitattributes
vendored
Normal file
@ -0,0 +1,23 @@
|
||||
## Default LFS
|
||||
*.7z filter=lfs diff=lfs merge=lfs -text
|
||||
*.bsp filter=lfs diff=lfs merge=lfs -text
|
||||
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
||||
*.gem filter=lfs diff=lfs merge=lfs -text
|
||||
*.gz filter=lfs diff=lfs merge=lfs -text
|
||||
*.jar filter=lfs diff=lfs merge=lfs -text
|
||||
*.lz filter=lfs diff=lfs merge=lfs -text
|
||||
*.lzma filter=lfs diff=lfs merge=lfs -text
|
||||
*.obscpio filter=lfs diff=lfs merge=lfs -text
|
||||
*.oxt filter=lfs diff=lfs merge=lfs -text
|
||||
*.pdf filter=lfs diff=lfs merge=lfs -text
|
||||
*.png filter=lfs diff=lfs merge=lfs -text
|
||||
*.rpm filter=lfs diff=lfs merge=lfs -text
|
||||
*.tbz filter=lfs diff=lfs merge=lfs -text
|
||||
*.tbz2 filter=lfs diff=lfs merge=lfs -text
|
||||
*.tgz filter=lfs diff=lfs merge=lfs -text
|
||||
*.ttf filter=lfs diff=lfs merge=lfs -text
|
||||
*.txz filter=lfs diff=lfs merge=lfs -text
|
||||
*.whl filter=lfs diff=lfs merge=lfs -text
|
||||
*.xz filter=lfs diff=lfs merge=lfs -text
|
||||
*.zip filter=lfs diff=lfs merge=lfs -text
|
||||
*.zst filter=lfs diff=lfs merge=lfs -text
|
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
||||
.osc
|
104
CVE-2024-39705-disable-download.patch
Normal file
104
CVE-2024-39705-disable-download.patch
Normal file
@ -0,0 +1,104 @@
|
||||
---
|
||||
nltk/app/chartparser_app.py | 13 +++++++++++++
|
||||
nltk/corpus/reader/util.py | 2 ++
|
||||
nltk/data.py | 2 ++
|
||||
nltk/parse/transitionparser.py | 2 ++
|
||||
nltk/tbl/demo.py | 4 +++-
|
||||
5 files changed, 22 insertions(+), 1 deletion(-)
|
||||
|
||||
--- a/nltk/app/chartparser_app.py
|
||||
+++ b/nltk/app/chartparser_app.py
|
||||
@@ -800,6 +800,10 @@ class ChartComparer:
|
||||
showerror("Error Saving Chart", f"Unable to open file: {filename!r}\n{e}")
|
||||
|
||||
def load_chart_dialog(self, *args):
|
||||
+ showerror("Security Error",
|
||||
+ "Due to gh#nltk/nltk#3266, deserializing from " +
|
||||
+ "a pickle is forbidden.")
|
||||
+ return
|
||||
filename = askopenfilename(
|
||||
filetypes=self.CHART_FILE_TYPES, defaultextension=".pickle"
|
||||
)
|
||||
@@ -811,6 +815,8 @@ class ChartComparer:
|
||||
showerror("Error Loading Chart", f"Unable to open file: {filename!r}\n{e}")
|
||||
|
||||
def load_chart(self, filename):
|
||||
+ raise RuntimeError("Due to gh#nltk/nltk#3266, deserializing from " +
|
||||
+ "a pickle is forbidden.")
|
||||
with open(filename, "rb") as infile:
|
||||
chart = pickle.load(infile)
|
||||
name = os.path.basename(filename)
|
||||
@@ -2268,6 +2274,10 @@ class ChartParserApp:
|
||||
if not filename:
|
||||
return
|
||||
try:
|
||||
+ showerror("Security Error",
|
||||
+ "Due to gh#nltk/nltk#3266, deserializing from " +
|
||||
+ "a pickle is forbidden.")
|
||||
+ return
|
||||
with open(filename, "rb") as infile:
|
||||
chart = pickle.load(infile)
|
||||
self._chart = chart
|
||||
@@ -2306,6 +2316,9 @@ class ChartParserApp:
|
||||
return
|
||||
try:
|
||||
if filename.endswith(".pickle"):
|
||||
+ showerror("Due to gh#nltk/nltk#3266, deserializing from " +
|
||||
+ "a pickle is forbidden.")
|
||||
+ return
|
||||
with open(filename, "rb") as infile:
|
||||
grammar = pickle.load(infile)
|
||||
else:
|
||||
--- a/nltk/corpus/reader/util.py
|
||||
+++ b/nltk/corpus/reader/util.py
|
||||
@@ -521,6 +521,8 @@ class PickleCorpusView(StreamBackedCorpu
|
||||
|
||||
def read_block(self, stream):
|
||||
result = []
|
||||
+ raise RuntimeError("Due to gh#nltk/nltk#3266, deserializing from " +
|
||||
+ "a pickle is forbidden.")
|
||||
for i in range(self.BLOCK_SIZE):
|
||||
try:
|
||||
result.append(pickle.load(stream))
|
||||
--- a/nltk/data.py
|
||||
+++ b/nltk/data.py
|
||||
@@ -752,6 +752,8 @@ def load(
|
||||
if format == "raw":
|
||||
resource_val = opened_resource.read()
|
||||
elif format == "pickle":
|
||||
+ raise RuntimeError("Due to gh#nltk/nltk#3266, deserializing from " +
|
||||
+ "a pickle is forbidden.")
|
||||
resource_val = pickle.load(opened_resource)
|
||||
elif format == "json":
|
||||
import json
|
||||
--- a/nltk/parse/transitionparser.py
|
||||
+++ b/nltk/parse/transitionparser.py
|
||||
@@ -553,6 +553,8 @@ class TransitionParser(ParserI):
|
||||
"""
|
||||
result = []
|
||||
# First load the model
|
||||
+ raise RuntimeError("Due to gh#nltk/nltk#3266, deserializing from " +
|
||||
+ "a pickle is forbidden.")
|
||||
model = pickle.load(open(modelFile, "rb"))
|
||||
operation = Transition(self._algorithm)
|
||||
|
||||
--- a/nltk/tbl/demo.py
|
||||
+++ b/nltk/tbl/demo.py
|
||||
@@ -253,6 +253,8 @@ def postag(
|
||||
)
|
||||
)
|
||||
with open(cache_baseline_tagger) as print_rules:
|
||||
+ raise RuntimeError("Due to gh#nltk/nltk#3266, deserializing from " +
|
||||
+ "a pickle is forbidden.")
|
||||
baseline_tagger = pickle.load(print_rules)
|
||||
print(f"Reloaded pickled tagger from {cache_baseline_tagger}")
|
||||
else:
|
||||
@@ -327,7 +329,7 @@ def postag(
|
||||
with open(serialize_output) as print_rules:
|
||||
brill_tagger_reloaded = pickle.load(print_rules)
|
||||
print(f"Reloaded pickled tagger from {serialize_output}")
|
||||
- taggedtest_reloaded = brill_tagger.tag_sents(testing_data)
|
||||
+ taggedtest_reloaded = brill_tagger_reloaded.tag_sents(testing_data)
|
||||
if taggedtest == taggedtest_reloaded:
|
||||
print("Reloaded tagger tried on test set, results identical")
|
||||
else:
|
38
CVE-2024-39705.patch
Normal file
38
CVE-2024-39705.patch
Normal file
@ -0,0 +1,38 @@
|
||||
From a12d0a6a8cdba58d5e4e5f92ac62bb80fc26c624 Mon Sep 17 00:00:00 2001
|
||||
From: Eric Kafe <kafe.eric@gmail.com>
|
||||
Date: Tue, 23 Jul 2024 09:09:09 +0200
|
||||
Subject: [PATCH] Prevent data.load from unpickling classes or functions
|
||||
|
||||
---
|
||||
nltk/data.py | 11 ++++++++++-
|
||||
1 file changed, 10 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/nltk/data.py b/nltk/data.py
|
||||
index cc9229b0a2..fb242721c5 100644
|
||||
--- a/nltk/data.py
|
||||
+++ b/nltk/data.py
|
||||
@@ -658,6 +658,15 @@ def retrieve(resource_url, filename=None, verbose=True):
|
||||
}
|
||||
|
||||
|
||||
+def restricted_pickle_load(string):
|
||||
+ """
|
||||
+ Prevents any class or function from loading.
|
||||
+ """
|
||||
+ from nltk.app.wordnet_app import RestrictedUnpickler
|
||||
+
|
||||
+ return RestrictedUnpickler(BytesIO(string)).load()
|
||||
+
|
||||
+
|
||||
def load(
|
||||
resource_url,
|
||||
format="auto",
|
||||
@@ -751,7 +760,7 @@ def load(
|
||||
if format == "raw":
|
||||
resource_val = opened_resource.read()
|
||||
elif format == "pickle":
|
||||
- resource_val = pickle.load(opened_resource)
|
||||
+ resource_val = restricted_pickle_load(opened_resource.read())
|
||||
elif format == "json":
|
||||
import json
|
||||
|
3
nltk-3.8.1.tar.gz
Normal file
3
nltk-3.8.1.tar.gz
Normal file
@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:676970e2b7aa0a7184e68f76e0c4f2756fd1b82559a509d5656a23117faeb658
|
||||
size 2867926
|
110
nltk-pr3207-py312.patch
Normal file
110
nltk-pr3207-py312.patch
Normal file
@ -0,0 +1,110 @@
|
||||
From 25d35fc4283dedd2053ec6d821f4b707fff8d72c Mon Sep 17 00:00:00 2001
|
||||
From: Konstantin Chernyshev <k4black@ya.ru>
|
||||
Date: Thu, 16 Nov 2023 19:00:15 +0100
|
||||
Subject: [PATCH 1/8] ci: enable 3.12 in ci tests
|
||||
|
||||
---
|
||||
.github/workflows/ci.yaml | 2 +-
|
||||
README.md | 2 +-
|
||||
nltk/test/unit/translate/test_bleu.py | 1 -
|
||||
nltk/translate/bleu_score.py | 29 +++++++++++++++++++++++++++--
|
||||
setup.py | 3 ++-
|
||||
5 files changed, 31 insertions(+), 6 deletions(-)
|
||||
|
||||
--- a/.github/workflows/ci.yaml
|
||||
+++ b/.github/workflows/ci.yaml
|
||||
@@ -76,7 +76,7 @@ jobs:
|
||||
needs: [cache_nltk_data, cache_third_party]
|
||||
strategy:
|
||||
matrix:
|
||||
- python-version: ['3.7', '3.8', '3.9', '3.10', '3.11']
|
||||
+ python-version: ['3.7', '3.8', '3.9', '3.10', '3.11', '3.12']
|
||||
os: [ubuntu-latest, macos-latest, windows-latest]
|
||||
fail-fast: false
|
||||
runs-on: ${{ matrix.os }}
|
||||
--- a/README.md
|
||||
+++ b/README.md
|
||||
@@ -4,7 +4,7 @@
|
||||
|
||||
NLTK -- the Natural Language Toolkit -- is a suite of open source Python
|
||||
modules, data sets, and tutorials supporting research and development in Natural
|
||||
-Language Processing. NLTK requires Python version 3.7, 3.8, 3.9, 3.10 or 3.11.
|
||||
+Language Processing. NLTK requires Python version 3.7, 3.8, 3.9, 3.10, 3.11 or 3.12.
|
||||
|
||||
For documentation, please visit [nltk.org](https://www.nltk.org/).
|
||||
|
||||
--- a/nltk/test/unit/translate/test_bleu.py
|
||||
+++ b/nltk/test/unit/translate/test_bleu.py
|
||||
@@ -2,7 +2,6 @@
|
||||
Tests for BLEU translation evaluation metric
|
||||
"""
|
||||
|
||||
-import io
|
||||
import unittest
|
||||
|
||||
from nltk.data import find
|
||||
--- a/nltk/translate/bleu_score.py
|
||||
+++ b/nltk/translate/bleu_score.py
|
||||
@@ -7,16 +7,41 @@
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""BLEU score implementation."""
|
||||
-
|
||||
import math
|
||||
import sys
|
||||
import warnings
|
||||
from collections import Counter
|
||||
-from fractions import Fraction
|
||||
+from fractions import Fraction as _Fraction
|
||||
|
||||
from nltk.util import ngrams
|
||||
|
||||
|
||||
+class Fraction(_Fraction):
|
||||
+ """Fraction with _normalize=False support for 3.12"""
|
||||
+
|
||||
+ def __new__(cls, numerator=0, denominator=None, _normalize=False):
|
||||
+ if sys.version_info >= (3, 12):
|
||||
+ self = super().__new__(cls, numerator, denominator)
|
||||
+ else:
|
||||
+ self = super().__new__(cls, numerator, denominator, _normalize=_normalize)
|
||||
+ self._normalize = _normalize
|
||||
+ self._original_numerator = numerator
|
||||
+ self._original_denominator = denominator
|
||||
+ return self
|
||||
+
|
||||
+ @property
|
||||
+ def numerator(self):
|
||||
+ if not self._normalize:
|
||||
+ return self._original_numerator
|
||||
+ return super().numerator
|
||||
+
|
||||
+ @property
|
||||
+ def denominator(self):
|
||||
+ if not self._normalize:
|
||||
+ return self._original_denominator
|
||||
+ return super().denominator
|
||||
+
|
||||
+
|
||||
def sentence_bleu(
|
||||
references,
|
||||
hypothesis,
|
||||
--- a/setup.py
|
||||
+++ b/setup.py
|
||||
@@ -67,7 +67,7 @@ setup(
|
||||
},
|
||||
long_description="""\
|
||||
The Natural Language Toolkit (NLTK) is a Python package for
|
||||
-natural language processing. NLTK requires Python 3.7, 3.8, 3.9, 3.10 or 3.11.""",
|
||||
+natural language processing. NLTK requires Python 3.7, 3.8, 3.9, 3.10, 3.11 or 3.12.""",
|
||||
license="Apache License, Version 2.0",
|
||||
keywords=[
|
||||
"NLP",
|
||||
@@ -100,6 +100,7 @@ natural language processing. NLTK requi
|
||||
"Programming Language :: Python :: 3.9",
|
||||
"Programming Language :: Python :: 3.10",
|
||||
"Programming Language :: Python :: 3.11",
|
||||
+ "Programming Language :: Python :: 3.12",
|
||||
"Topic :: Scientific/Engineering",
|
||||
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
||||
"Topic :: Scientific/Engineering :: Human Machine Interfaces",
|
3
nltk_data.tar.xz
Normal file
3
nltk_data.tar.xz
Normal file
@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:f79462ac99f414b4850943720bed4a59c1bb15bfc8f1ce16b26165da6db07680
|
||||
size 393271816
|
417
python-nltk.changes
Normal file
417
python-nltk.changes
Normal file
@ -0,0 +1,417 @@
|
||||
-------------------------------------------------------------------
|
||||
Fri Jul 26 07:14:33 UTC 2024 - Daniel Garcia <daniel.garcia@suse.com>
|
||||
|
||||
- Add CVE-2024-39705.patch upstream patch to fix unsafe pickle usage.
|
||||
(CVE-2024-39705, gh#nltk/nltk#3266, bsc#1227174).
|
||||
- Drop CVE-2024-39705-disable-download.patch as it's not needed
|
||||
anymore.
|
||||
|
||||
-------------------------------------------------------------------
|
||||
Mon Jul 1 21:02:45 UTC 2024 - Matej Cepl <mcepl@cepl.eu>
|
||||
|
||||
- Use tarball from GitHub instead of the Zip archive from PyPI,
|
||||
the latter has very messy combination of CRLF and LF EOLs,
|
||||
which are hard to patch.
|
||||
- Refresh all patches from the original locations.
|
||||
- Add CVE-2024-39705-disable-download.patch to make a crude
|
||||
workaround around CVE-2024-39705 (gh#nltk/nltk#3266,
|
||||
bsc#1227174).
|
||||
|
||||
-------------------------------------------------------------------
|
||||
Thu Mar 21 17:41:52 UTC 2024 - Ben Greiner <code@bnavigator.de>
|
||||
|
||||
- Update to 3.8.1
|
||||
* Resolve RCE & XSS vulnerabilities in localhost WordNet Browser
|
||||
* Add Python 3.11 support
|
||||
- Update nltk_data archive
|
||||
- Drop port-2to3.patch
|
||||
- Add nltk-pr3207-py312.patch for Python 3.12 support
|
||||
* gh#nltk/nltk#3207
|
||||
|
||||
-------------------------------------------------------------------
|
||||
Tue Mar 28 08:36:04 UTC 2023 - pgajdos@suse.com
|
||||
|
||||
- python-six is not required
|
||||
|
||||
-------------------------------------------------------------------
|
||||
Fri Jan 6 15:32:43 UTC 2023 - Yogalakshmi Arunachalam <yarunachalam@suse.com>
|
||||
|
||||
- Update to 3.8
|
||||
|
||||
* Refactor dispersion plot (#3082)
|
||||
* Provide type hints for LazyCorpusLoader variables (#3081)
|
||||
* Throw warning when LanguageModel is initialized with incorrect vocabulary (#3080)
|
||||
* Fix WordNet's all_synsets() function (#3078)
|
||||
* Resolve TreebankWordDetokenizer inconsistency with end-of-string contractions (#3070)
|
||||
* Support both iso639-3 codes and BCP-47 language tags (#3060)
|
||||
* Avoid DeprecationWarning in Regexp tokenizer (#3055)
|
||||
* Fix many doctests, add doctests to CI (#3054, #3050, #3048)
|
||||
* Fix bool field not being read in VerbNet (#3044)
|
||||
* Greatly improve time efficiency of SyllableTokenizer when tokenizing numbers (#3042)
|
||||
* Fix encodings of Polish udhr corpus reader (#3038)
|
||||
* Allow TweetTokenizer to tokenize emoji flag sequences (#3034)
|
||||
* Prevent LazyModule from increasing the size of nltk.__dict__ (#3033)
|
||||
* Fix CoreNLPServer non-default port issue (#3031)
|
||||
* Add "acion" suffix to the Spanish SnowballStemmer (#3030)
|
||||
* Allow loading WordNet without OMW (#3026)
|
||||
* Use input() in nltk.chat.chatbot() for Jupyter support (#3022)
|
||||
* Fix edit_distance_align() in distance.py (#3017)
|
||||
* Tackle performance and accuracy regression of sentence tokenizer since NLTK 3.6.6 (#3014)
|
||||
* Add the Iota operator to semantic logic (#3010)
|
||||
* Resolve critical errors in WordNet app (#3008)
|
||||
* Resolve critical error in CHILDES Corpus (#2998)
|
||||
* Make WordNet information_content() accept adjective satellites (#2995)
|
||||
* Add "strict=True" parameter to CoreNLP (#2993, #3043)
|
||||
* Resolve issue with WordNet's synset_from_sense_key (#2988)
|
||||
* Handle WordNet synsets that were lost in mapping (#2985)
|
||||
* Resolve TypeError in Boxer (#2979)
|
||||
* Add function to retrieve WordNet synonyms (#2978)
|
||||
* Warn about nonexistent OMW offsets instead of raising an error (#2974)
|
||||
* Fix missing ic argument in res, jcn and lin similarity functions of WordNet (#2970)
|
||||
* Add support for the extended OMW (#2946)
|
||||
* Fix LC cutoff policy of text tiling (#2936)
|
||||
* Optimize ConditionalFreqDist.__add__ performance (#2939)
|
||||
* Add Markdown corpus reader (#2902)
|
||||
|
||||
-------------------------------------------------------------------
|
||||
Mon Dec 26 10:41:22 UTC 2022 - Matej Cepl <mcepl@suse.com>
|
||||
|
||||
- Complete nltk_data.tar.xz for offline testing
|
||||
- Fix failing tests (gh#nltk/nltk#2969) by adding patches:
|
||||
- port-2to3.patch
|
||||
- skip-networked-test.patch
|
||||
- Clean up the SPEC to get rid of rpmlint warnings.
|
||||
|
||||
-------------------------------------------------------------------
|
||||
Tue Mar 22 07:48:14 UTC 2022 - Matej Cepl <mcepl@suse.com>
|
||||
|
||||
- Update to 3.7
|
||||
- Improve and update the NLTK team page on nltk.org (#2855,
|
||||
#2941)
|
||||
- Drop support for Python 3.6, support Python 3.10 (#2920)
|
||||
- Update to 3.6.7
|
||||
- Resolve IndexError in `sent_tokenize` and `word_tokenize`
|
||||
(#2922)
|
||||
- Update to 3.6.6
|
||||
- Refactor `gensim.doctest` to work for gensim 4.0.0 and up
|
||||
(#2914)
|
||||
- Add Precision, Recall, F-measure, Confusion Matrix to Taggers
|
||||
(#2862)
|
||||
- Added warnings if .zip files exist without any corresponding
|
||||
.csv files. (#2908)
|
||||
- Fix `FileNotFoundError` when the `download_dir` is
|
||||
a non-existing nested folder (#2910)
|
||||
- Rename omw to omw-1.4 (#2907)
|
||||
- Resolve ReDoS opportunity by fixing incorrectly specified
|
||||
regex (#2906, bsc#1191030, CVE-2021-3828).
|
||||
- Support OMW 1.4 (#2899)
|
||||
- Deprecate Tree get and set node methods (#2900)
|
||||
- Fix broken inaugural test case (#2903)
|
||||
- Use Multilingual Wordnet Data from OMW with newer Wordnet
|
||||
versions (#2889)
|
||||
- Keep NLTKs "tokenize" module working with pathlib (#2896)
|
||||
- Make prettyprinter to be more readable (#2893)
|
||||
- Update links to the nltk book (#2895)
|
||||
- Add `CITATION.cff` to nltk (#2880)
|
||||
- Resolve serious ReDoS in PunktSentenceTokenizer (#2869)
|
||||
- Delete old CI config files (#2881)
|
||||
- Improve Tokenize documentation + add TokenizerI as superclass
|
||||
for TweetTokenizer (#2878)
|
||||
- Fix expected value for BLEU score doctest after changes from
|
||||
#2572
|
||||
- Add multi Bleu functionality and tests (#2793)
|
||||
- Deprecate 'return_str' parameter in NLTKWordTokenizer and
|
||||
TreebankWordTokenizer (#2883)
|
||||
- Allow empty string in CFG's + more (#2888)
|
||||
- Partition `tree.py` module into `tree` package + pickle fix
|
||||
(#2863)
|
||||
- Fix several TreebankWordTokenizer and NLTKWordTokenizer bugs
|
||||
(#2877)
|
||||
- Rewind Wordnet data file after each lookup (#2868)
|
||||
- Correct __init__ call for SyntaxCorpusReader subclasses
|
||||
(#2872)
|
||||
- Documentation fixes (#2873)
|
||||
- Fix levenstein distance for duplicated letters (#2849)
|
||||
- Support alternative Wordnet versions (#2860)
|
||||
- Remove hundreds of formatting warnings for nltk.org (#2859)
|
||||
- Modernize `nltk.org/howto` pages (#2856)
|
||||
- Fix Bleu Score smoothing function from taking log(0) (#2839)
|
||||
- Update third party tools to newer versions and removing
|
||||
MaltParser fixed version (#2832)
|
||||
- Fix TypeError: _pretty() takes 1 positional argument but 2
|
||||
were given in sem/drt.py (#2854)
|
||||
- Replace `http` with `https` in most URLs (#2852)
|
||||
- Update to 3.6.5
|
||||
- modernised nltk.org website
|
||||
- addressed LGTM.com issues
|
||||
- support ZWJ sequences emoji and skin tone modifier emoji in
|
||||
TweetTokenizer
|
||||
- METEOR evaluation now requires pre-tokenized input
|
||||
- Code linting and type hinting
|
||||
- implement get_refs function for DrtLambdaExpression
|
||||
- Enable automated CoreNLP, Senna, Prover9/Mace4, Megam,
|
||||
MaltParser CI tests
|
||||
- specify minimum regex version that supports regex.Pattern
|
||||
- avoid re.Pattern and regex.Pattern which fail for Python 3.6,
|
||||
3.7
|
||||
- Update to 3.6.4
|
||||
- deprecate `nltk.usage(obj)` in favor of `help(obj)`
|
||||
- resolve ReDoS vulnerability in Corpus Reader
|
||||
- solidify performance tests
|
||||
- improve phone number recognition in tweet tokenizer
|
||||
- refactored CISTEM stemmer for German
|
||||
- identify NLTK Team as the author
|
||||
- replace travis badge with github actions badge
|
||||
- add SECURITY.md
|
||||
- Update to 3.6.3
|
||||
- Dropped support for Python 3.5
|
||||
- Run CI tests on Windows, too
|
||||
- Moved from Travis CI to GitHub Actions
|
||||
- Code and comment cleanups
|
||||
- Visualize WordNet relation graphs using Graphviz
|
||||
- Fixed large error in METEOR score
|
||||
- Apply isort, pyupgrade, black, added as pre-commit hooks
|
||||
- Prevent debug_decisions in Punkt from throwing IndexError
|
||||
- Resolved ZeroDivisionError in RIBES with dissimilar sentences
|
||||
- Initialize WordNet IC total counts with smoothing value
|
||||
- Fixed AttributeError for Arabic ARLSTem2 stemmer
|
||||
- Many fixes and improvements to lm language model package
|
||||
- Fix bug in nltk.metrics.aline, C_skip = -10
|
||||
- Improvements to TweetTokenizer
|
||||
- Optional show arg for FreqDist.plot, ConditionalFreqDist.plot
|
||||
- edit_distance now computes Damerau-Levenshtein edit-distance
|
||||
- Update to 3.6.2
|
||||
- move test code to nltk/test
|
||||
- fix bug in NgramAssocMeasures (order preserving fix)
|
||||
- Update to 3.6
|
||||
- add support for Python 3.9
|
||||
- add Tree.fromlist
|
||||
- compute Minimum Spanning Tree of unweighted graph using BFS
|
||||
- fix bug with infinite loop in Wordnet closure and tree
|
||||
- fix bug in calculating BLEU using smoothing method 4
|
||||
- Wordnet synset similarities work for all pos
|
||||
- new Arabic light stemmer (ARLSTem2)
|
||||
- new syllable tokenizer (LegalitySyllableTokenizer)
|
||||
- remove nose in favor of pytest
|
||||
|
||||
-------------------------------------------------------------------
|
||||
Thu Apr 23 13:54:08 UTC 2020 - John Vandenberg <jayvdb@gmail.com>
|
||||
|
||||
- Update to v3.5
|
||||
* add support for Python 3.8
|
||||
* drop support for Python 2
|
||||
* create NLTK's own Tokenizer class distinct from the Treebank
|
||||
reference tokeniser
|
||||
* update Vader sentiment analyser
|
||||
* fix JSON serialization of some PoS taggers
|
||||
* minor improvements in grammar.CFG, Vader, pl196x corpus reader,
|
||||
StringTokenizer
|
||||
* change implementation <= and >= for FreqDist so they are partial
|
||||
orders
|
||||
* make FreqDist iterable
|
||||
* correctly handle Penn Treebank trees with a unlabeled branching
|
||||
top node
|
||||
|
||||
-------------------------------------------------------------------
|
||||
Sat Mar 14 09:07:16 UTC 2020 - Tomáš Chvátal <tchvatal@suse.com>
|
||||
|
||||
- Fix build without python2
|
||||
|
||||
-------------------------------------------------------------------
|
||||
Mon Oct 14 14:00:43 UTC 2019 - Matej Cepl <mcepl@suse.com>
|
||||
|
||||
- Replace %fdupes -s with plain %fdupes; hardlinks are better.
|
||||
|
||||
-------------------------------------------------------------------
|
||||
Wed Sep 11 11:05:01 UTC 2019 - Tomáš Chvátal <tchvatal@suse.com>
|
||||
|
||||
- Update to 3.4.5 (bsc#1146427, CVE-2019-14751):
|
||||
* Fixed security bug in downloader: Zip slip vulnerability - for the
|
||||
unlikely situation where a user configures their downloader to use
|
||||
a compromised server CVE-2019-14751
|
||||
|
||||
-------------------------------------------------------------------
|
||||
Tue Jul 23 13:52:24 UTC 2019 - Tomáš Chvátal <tchvatal@suse.com>
|
||||
|
||||
- Update to 3.4.4:
|
||||
* fix bug in plot function (probability.py)
|
||||
* add improved PanLex Swadesh corpus reader
|
||||
* add Text.generate()
|
||||
* add QuadgramAssocMeasures
|
||||
* add SSP to tokenizers
|
||||
* return confidence of best tag from AveragedPerceptron
|
||||
* make plot methods return Axes objects
|
||||
* don't require list arguments to PositiveNaiveBayesClassifier.train
|
||||
* fix Tree classes to work with native Python copy library
|
||||
* fix inconsistency for NomBank
|
||||
* fix random seeding in LanguageModel.generate
|
||||
* fix ConditionalFreqDist mutation on tabulate/plot call
|
||||
* fix broken links in documentation
|
||||
* fix misc Wordnet issues
|
||||
* update installation instructions
|
||||
|
||||
-------------------------------------------------------------------
|
||||
Thu May 23 12:41:31 UTC 2019 - pgajdos@suse.com
|
||||
|
||||
- version update to 3.4.1
|
||||
* add chomsky_normal_form for CFGs
|
||||
* add meteor score
|
||||
* add minimum edit/Levenshtein distance based alignment function
|
||||
* allow access to collocation list via text.collocation_list()
|
||||
* support corenlp server options
|
||||
* drop support for Python 3.4
|
||||
* other minor fixes
|
||||
|
||||
-------------------------------------------------------------------
|
||||
Sun Feb 10 16:19:17 UTC 2019 - John Vandenberg <jayvdb@gmail.com>
|
||||
|
||||
- Remove Python 3 dependency on singledispatch
|
||||
|
||||
-------------------------------------------------------------------
|
||||
Sat Feb 9 16:16:11 UTC 2019 - John Vandenberg <jayvdb@gmail.com>
|
||||
|
||||
- Update to v3.4
|
||||
+ Support Python 3.7
|
||||
+ New Language Modeling package
|
||||
+ Cistem Stemmer for German
|
||||
+ Support Russian National Corpus incl POS tag model
|
||||
+ Krippendorf Alpha inter-rater reliability test
|
||||
+ Comprehensive code clean-ups
|
||||
+ Switch continuous integration from Jenkins to Travis
|
||||
- from v3.3
|
||||
+ Support Python 3.6
|
||||
+ New interface to CoreNLP
|
||||
+ Support synset retrieval by sense key
|
||||
+ Minor fixes to CoNLL Corpus Reader
|
||||
+ AlignedSent
|
||||
+ Fixed minor inconsistencies in APIs and API documentation
|
||||
+ Better conformance to PEP8
|
||||
+ Drop Moses Tokenizer (incompatible license)
|
||||
|
||||
-------------------------------------------------------------------
|
||||
Wed Feb 6 09:44:56 UTC 2019 - John Vandenberg <jayvdb@gmail.com>
|
||||
|
||||
- Add missing dependency six
|
||||
- Remove unnecessary build dependency six
|
||||
- Recommend all optional dependencies
|
||||
|
||||
-------------------------------------------------------------------
|
||||
Tue Mar 6 20:35:00 UTC 2018 - jengelh@inai.de
|
||||
|
||||
- Trim redundant wording from description.
|
||||
|
||||
-------------------------------------------------------------------
|
||||
Mon Mar 5 15:02:00 UTC 2018 - badshah400@gmail.com
|
||||
|
||||
- Use \%license instead of \%doc to install License.txt.
|
||||
|
||||
-------------------------------------------------------------------
|
||||
Tue Jan 30 17:16:13 UTC 2018 - guigo.lourenco@gmail.com
|
||||
|
||||
- Depend on the full python interpreter to fix sqlite3 import
|
||||
during %check
|
||||
|
||||
-------------------------------------------------------------------
|
||||
Tue Jan 16 11:02:13 UTC 2018 - guigo.lourenco@gmail.com
|
||||
|
||||
- Depend on python-rpm-macros
|
||||
- Build for both Python2 and Python3
|
||||
|
||||
-------------------------------------------------------------------
|
||||
Tue Dec 19 15:50:13 UTC 2017 - badshah400@gmail.com
|
||||
|
||||
- Update to version 3.2.5:
|
||||
* Arabic stemmers (ARLSTem, Snowball)
|
||||
* NIST MT evaluation metric and added NIST
|
||||
international_tokenize
|
||||
* Moses tokenizer
|
||||
* Document Russian tagger
|
||||
* Fix to Stanford segmenter
|
||||
* Improve treebank detokenizer, VerbNet, Vader
|
||||
* Misc code and documentation cleanups
|
||||
* Implement fixes suggested by LGTM
|
||||
- Convert specfile to python single-spec style.
|
||||
- Drop unneeded BuildRequires: python-PyYAML, python-xml,
|
||||
python-devel; not required for building.
|
||||
- Change existing Requires to Recommends: these are really needed
|
||||
for additional features, and not required for basic nltk usage.
|
||||
- Add new Recommends: python-scipy, python-matplotlib,
|
||||
python-pyparsing, and python-gensim; enables other optional
|
||||
features.
|
||||
- Run fdupes to link-up duplicate files.
|
||||
- Remove exec permissions for a file not intended to be executed
|
||||
(not in exec path, no hashbang, etc.)
|
||||
- Remove hashbangs from non-executable files.
|
||||
- Run tests following the suggestion from
|
||||
http://www.nltk.org/install.html.
|
||||
|
||||
-------------------------------------------------------------------
|
||||
Tue Feb 21 13:11:31 UTC 2017 - stephan.barth@suse.com
|
||||
|
||||
- update to version 3.2.2
|
||||
Upstream changelog:
|
||||
Support for Aline, ChrF and GLEU MT evaluation metrics, Russian POS tagger
|
||||
model, Moses detokenizer, rewrite Porter Stemmer and FrameNet corpus reader,
|
||||
update FrameNet Corpus to version 1.7, fixes: stanford_segmenter.py,
|
||||
SentiText, CoNLL Corpus Reader, BLEU, naivebayes, Krippendorff’s alpha,
|
||||
Punkt, Moses tokenizer, TweetTokenizer, ToktokTokenizer; improvements to
|
||||
testing framework
|
||||
|
||||
-------------------------------------------------------------------
|
||||
Fri Oct 14 00:31:15 UTC 2016 - toddrme2178@gmail.com
|
||||
|
||||
- Update to version 3.2.1
|
||||
+ No changelog available
|
||||
|
||||
-------------------------------------------------------------------
|
||||
Thu May 21 14:53:43 UTC 2015 - toddrme2178@gmail.com
|
||||
|
||||
- Remove upstreamed nltk-2.0.4-dont-use-python-distribute.patch
|
||||
- Update to version 3.0.2
|
||||
+ No changelog available
|
||||
|
||||
-------------------------------------------------------------------
|
||||
Sun Dec 8 13:33:14 UTC 2013 - p.drouand@gmail.com
|
||||
|
||||
- Update to version 2.0.4
|
||||
+ No changelog available
|
||||
- Add nltk-2.0.4-dont-use-python-distribute.patch ; force use of
|
||||
python-setuptools instead of python-distribute
|
||||
|
||||
-------------------------------------------------------------------
|
||||
Thu Oct 24 11:09:19 UTC 2013 - speilicke@suse.com
|
||||
|
||||
- Require python-setuptools instead of distribute (upstreams merged)
|
||||
|
||||
-------------------------------------------------------------------
|
||||
Fri Sep 23 12:29:05 UTC 2011 - saschpe@suse.de
|
||||
|
||||
- Update to version 2.0.1rc1
|
||||
|
||||
-------------------------------------------------------------------
|
||||
Sun Feb 7 18:51:07 CST 2010 - oddrationale@gmail.com
|
||||
|
||||
- fixed copyright and license statements
|
||||
- removed PyYAML, and added dependency to installers and download
|
||||
instructions
|
||||
- updated to LogicParser, DRT (Dan Garrette)
|
||||
- WordNet similarity metrics return None instead of -1 when
|
||||
they fail to find a path (Steve Bethard)
|
||||
- shortest_path_distance uses instance hypernyms (Jordan
|
||||
Boyd-Graber)
|
||||
- clean_html improved (Bjorn Maeland)
|
||||
- batch_parse, batch_interpret and batch_evaluate functions allow
|
||||
grammar or grammar filename as argument
|
||||
- more Portuguese examples (portuguese_en.doctest, examples/pt.py)
|
||||
|
||||
-------------------------------------------------------------------
|
||||
Thu Dec 10 17:23:51 CST 2009 - oddrationale@gmail.com
|
||||
|
||||
- added python-nltk-remove-yaml.patch to prevent conflict with
|
||||
python-yaml
|
||||
- added Requires: python-yaml
|
||||
|
||||
-------------------------------------------------------------------
|
||||
Wed Dec 9 15:39:35 CST 2009 - oddrationale@gmail.com
|
||||
|
||||
- Initial Release (Version 2.0b7): Sun Feb 7 18:50:18 CST 2010
|
2
python-nltk.rpmlintrc
Normal file
2
python-nltk.rpmlintrc
Normal file
@ -0,0 +1,2 @@
|
||||
addFilter("E: zero-length /usr/lib/python3\.\d+/site-packages/nltk/tbl/api\.py")
|
||||
addFilter("explicit-lib-dependency python3\d*-joblib")
|
192
python-nltk.spec
Normal file
192
python-nltk.spec
Normal file
@ -0,0 +1,192 @@
|
||||
#
|
||||
# spec file for package python-nltk
|
||||
#
|
||||
# Copyright (c) 2024 SUSE LLC
|
||||
#
|
||||
# All modifications and additions to the file contributed by third parties
|
||||
# remain the property of their copyright owners, unless otherwise agreed
|
||||
# upon. The license for this file, and modifications and additions to the
|
||||
# file, is the same license as for the pristine package itself (unless the
|
||||
# license for the pristine package is not an Open Source License, in which
|
||||
# case the license is the MIT License). An "Open Source License" is a
|
||||
# license that conforms to the Open Source Definition (Version 1.9)
|
||||
# published by the Open Source Initiative.
|
||||
|
||||
# Please submit bugfixes or comments via https://bugs.opensuse.org/
|
||||
#
|
||||
|
||||
|
||||
%define modname nltk
|
||||
Name: python-nltk
|
||||
Version: 3.8.1
|
||||
Release: 0
|
||||
Summary: Natural Language Toolkit
|
||||
License: Apache-2.0
|
||||
URL: http://nltk.org/
|
||||
# SourceRepository: https://github.com/nltk/nltk
|
||||
Source0: https://github.com/nltk/%{modname}/archive/refs/tags/%{version}.tar.gz#/%{modname}-%{version}.tar.gz
|
||||
# Download/Update NLTK data:
|
||||
# quilt setup python-nltk.spec
|
||||
# pushd nltk-?.?.?
|
||||
# python3 -m nltk.downloader -d nltk_data tests \
|
||||
# averaged_perceptron_tagger_ru \
|
||||
# brown \
|
||||
# cess_cat \
|
||||
# cess_esp \
|
||||
# conll2007 \
|
||||
# floresta \
|
||||
# gutenberg \
|
||||
# inaugural \
|
||||
# indian \
|
||||
# large_grammars \
|
||||
# nombank.1.0 \
|
||||
# omw-1.4 \
|
||||
# pl196x \
|
||||
# ptb \
|
||||
# punkt \
|
||||
# rte \
|
||||
# sinica_treebank \
|
||||
# stopwords \
|
||||
# treebank \
|
||||
# udhr \
|
||||
# universal_tagset \
|
||||
# wordnet \
|
||||
# wordnet_ic \
|
||||
# words
|
||||
# tar -cJf ../nltk_data.tar.xz nltk_data
|
||||
# popd
|
||||
# see https://www.nltk.org/data.html for more details
|
||||
Source1: nltk_data.tar.xz
|
||||
Source99: python-nltk.rpmlintrc
|
||||
# PATCH-FIX-UPSTREAM skip-networked-test.patch gh#nltk/nltk#2969 mcepl@suse.com
|
||||
# skip tests requiring network connection
|
||||
Patch0: skip-networked-test.patch
|
||||
# PATCH-FIX-UPSTREAM nltk-pr3207-py312.patch gh#nltk/nltk#3207
|
||||
Patch1: nltk-pr3207-py312.patch
|
||||
# PATCH-FIX-UPSTREAM CVE-2024-39705.patch bsc#1227174 gh#nltk/nltk#3290
|
||||
Patch2: CVE-2024-39705.patch
|
||||
BuildRequires: %{python_module base >= 3.7}
|
||||
BuildRequires: %{python_module pip}
|
||||
BuildRequires: %{python_module setuptools}
|
||||
BuildRequires: %{python_module wheel}
|
||||
BuildRequires: %{pythons}
|
||||
BuildRequires: fdupes
|
||||
BuildRequires: python-rpm-macros
|
||||
BuildRequires: unzip
|
||||
# SECTION runtime
|
||||
BuildRequires: %{python_module regex >= 2021.8.3}
|
||||
BuildRequires: %{python_module click}
|
||||
BuildRequires: %{python_module joblib}
|
||||
BuildRequires: %{python_module tqdm}
|
||||
# /SECTION
|
||||
# SECTION test
|
||||
BuildRequires: %{python_module tk}
|
||||
BuildRequires: %{python_module Jinja2}
|
||||
BuildRequires: %{python_module matplotlib}
|
||||
BuildRequires: %{python_module numpy}
|
||||
BuildRequires: %{python_module pyparsing}
|
||||
BuildRequires: %{python_module pytest-cov}
|
||||
BuildRequires: %{python_module pytest-mock}
|
||||
BuildRequires: %{python_module pytest}
|
||||
BuildRequires: %{python_module python-crfsuite}
|
||||
BuildRequires: %{python_module requests}
|
||||
BuildRequires: %{python_module scikit-learn}
|
||||
BuildRequires: %{python_module scipy}
|
||||
BuildRequires: %{python_module text-unidecode}
|
||||
BuildRequires: %{python_module twython}
|
||||
# /SECTION
|
||||
Requires: python-regex >= 2021.8.3
|
||||
Requires: python-click
|
||||
Requires: python-joblib
|
||||
Requires: python-tqdm
|
||||
Recommends: python-gensim
|
||||
Recommends: python-matplotlib
|
||||
Recommends: python-numpy
|
||||
Recommends: python-pyparsing
|
||||
Recommends: python-python-crfsuite
|
||||
Recommends: python-requests
|
||||
Recommends: python-scikit-learn
|
||||
Recommends: python-scipy
|
||||
Recommends: python-twython
|
||||
Requires(post): update-alternatives
|
||||
Requires(postun): update-alternatives
|
||||
BuildArch: noarch
|
||||
%python_subpackages
|
||||
|
||||
# changedir = nltk/test
|
||||
|
||||
%description
|
||||
NLTK -- the Natural Language Toolkit -- is a suite of
|
||||
Python modules, data sets and tutorials supporting research and
|
||||
development in Natural Language Processing.
|
||||
|
||||
%prep
|
||||
%setup -q -a1 -n %{modname}-%{version}
|
||||
|
||||
# Fix EOL
|
||||
sed -i 's/\r/\n/g; s/\n$//' \
|
||||
README.md \
|
||||
nltk/corpus/reader/knbc.py \
|
||||
nltk/test/unit/test_tgrep.py \
|
||||
nltk/tgrep.py \
|
||||
nltk/tokenize/stanford_segmenter.py \
|
||||
nltk/corpus/reader/knbc.py \
|
||||
nltk/test/unit/test_tgrep.py \
|
||||
nltk/tgrep.py \
|
||||
nltk/tokenize/stanford_segmenter.py \
|
||||
nltk/corpus/reader/knbc.py \
|
||||
nltk/test/unit/test_tgrep.py \
|
||||
nltk/tgrep.py \
|
||||
nltk/tokenize/stanford_segmenter.py
|
||||
|
||||
# Remove unrequired shebangs
|
||||
sed -E -i "/#![[:space:]]*\/usr\/bin\/env python/d" \
|
||||
nltk/tgrep.py \
|
||||
nltk/tokenize/stanford_segmenter.py \
|
||||
nltk/test/unit/test_tgrep.py \
|
||||
nltk/corpus/reader/knbc.py
|
||||
|
||||
# Switch shebangs to the standard Python interpreter
|
||||
sed -E -i "s|#![[:space:]]*%{_bindir}/env python|#!%{_bindir}/python3|" \
|
||||
setup.py \
|
||||
tools/global_replace.py \
|
||||
nltk_data/corpora/pl196x/splitter.py \
|
||||
tools/find_deprecated.py
|
||||
|
||||
%autopatch -p1
|
||||
|
||||
%build
|
||||
%pyproject_wheel
|
||||
|
||||
%install
|
||||
%pyproject_install
|
||||
%python_clone -a %{buildroot}%{_bindir}/nltk
|
||||
|
||||
%{python_expand %fdupes %{buildroot}%{$python_sitelib}/
|
||||
chmod -x %{buildroot}%{$python_sitelib}/nltk/test/dependency.doctest
|
||||
}
|
||||
|
||||
%check
|
||||
export NLTK_DATA=$(readlink -f ./nltk_data/)
|
||||
# export PYTEST_ADDOPTS="--doctest-modules"
|
||||
# Skip tests requiring pickle.load gh#nltk/nltk#3266 (CVE-2024-39705)
|
||||
skip_tests=" or test_basic or test_increment or test_pad_asterisk or test_pad_dotdot"
|
||||
skip_tests+=" or test_pos_tag_eng or test_pos_tag_eng_universal or test_pos_tag_rus"
|
||||
skip_tests+=" or test_pos_tag_rus_universal or test_pos_tag_unknown_lang"
|
||||
skip_tests+=" or test_sent_tokenize or test_unspecified_lang or test_word_tokenize"
|
||||
%pytest -k "not (network ${skip_tests})"
|
||||
|
||||
%post
|
||||
%python_install_alternative nltk
|
||||
|
||||
%postun
|
||||
%python_uninstall_alternative nltk
|
||||
|
||||
%files %{python_files}
|
||||
%doc README.md
|
||||
%license LICENSE.txt
|
||||
%{python_sitelib}/nltk/
|
||||
%{python_sitelib}/nltk-%{version}.dist-info/
|
||||
%python_alternative %{_bindir}/nltk
|
||||
|
||||
%changelog
|
35
skip-networked-test.patch
Normal file
35
skip-networked-test.patch
Normal file
@ -0,0 +1,35 @@
|
||||
---
|
||||
nltk/test/unit/test_downloader.py | 4 ++++
|
||||
setup.cfg | 4 ++++
|
||||
2 files changed, 8 insertions(+)
|
||||
|
||||
--- a/nltk/test/unit/test_downloader.py
|
||||
+++ b/nltk/test/unit/test_downloader.py
|
||||
@@ -1,6 +1,9 @@
|
||||
from nltk import download
|
||||
|
||||
+import pytest
|
||||
|
||||
+
|
||||
+@pytest.mark.network
|
||||
def test_downloader_using_existing_parent_download_dir(tmp_path):
|
||||
"""Test that download works properly when the parent folder of the download_dir exists"""
|
||||
|
||||
@@ -9,6 +12,7 @@ def test_downloader_using_existing_paren
|
||||
assert download_status is True
|
||||
|
||||
|
||||
+@pytest.mark.network
|
||||
def test_downloader_using_non_existing_parent_download_dir(tmp_path):
|
||||
"""Test that download works properly when the parent folder of the download_dir does not exist"""
|
||||
|
||||
--- a/setup.cfg
|
||||
+++ b/setup.cfg
|
||||
@@ -1,3 +1,7 @@
|
||||
+[tool:pytest]
|
||||
+markers =
|
||||
+ network: test case requires network connection
|
||||
+
|
||||
[metadata]
|
||||
license_files =
|
||||
LICENSE.txt
|
Loading…
Reference in New Issue
Block a user