diff --git a/CVE-2024-39705-disable-download.patch b/CVE-2024-39705-disable-download.patch new file mode 100644 index 0000000..917d451 --- /dev/null +++ b/CVE-2024-39705-disable-download.patch @@ -0,0 +1,104 @@ +--- + nltk/app/chartparser_app.py | 13 +++++++++++++ + nltk/corpus/reader/util.py | 2 ++ + nltk/data.py | 2 ++ + nltk/parse/transitionparser.py | 2 ++ + nltk/tbl/demo.py | 4 +++- + 5 files changed, 22 insertions(+), 1 deletion(-) + +--- a/nltk/app/chartparser_app.py ++++ b/nltk/app/chartparser_app.py +@@ -800,6 +800,10 @@ class ChartComparer: + showerror("Error Saving Chart", f"Unable to open file: {filename!r}\n{e}") + + def load_chart_dialog(self, *args): ++ showerror("Security Error", ++ "Due to gh#nltk/nltk#3266, deserializing from " + ++ "a pickle is forbidden.") ++ return + filename = askopenfilename( + filetypes=self.CHART_FILE_TYPES, defaultextension=".pickle" + ) +@@ -811,6 +815,8 @@ class ChartComparer: + showerror("Error Loading Chart", f"Unable to open file: {filename!r}\n{e}") + + def load_chart(self, filename): ++ raise RuntimeError("Due to gh#nltk/nltk#3266, deserializing from " + ++ "a pickle is forbidden.") + with open(filename, "rb") as infile: + chart = pickle.load(infile) + name = os.path.basename(filename) +@@ -2268,6 +2274,10 @@ class ChartParserApp: + if not filename: + return + try: ++ showerror("Security Error", ++ "Due to gh#nltk/nltk#3266, deserializing from " + ++ "a pickle is forbidden.") ++ return + with open(filename, "rb") as infile: + chart = pickle.load(infile) + self._chart = chart +@@ -2306,6 +2316,9 @@ class ChartParserApp: + return + try: + if filename.endswith(".pickle"): ++ showerror("Due to gh#nltk/nltk#3266, deserializing from " + ++ "a pickle is forbidden.") ++ return + with open(filename, "rb") as infile: + grammar = pickle.load(infile) + else: +--- a/nltk/corpus/reader/util.py ++++ b/nltk/corpus/reader/util.py +@@ -521,6 +521,8 @@ class PickleCorpusView(StreamBackedCorpu + + def read_block(self, stream): + result = [] ++ raise RuntimeError("Due to gh#nltk/nltk#3266, deserializing from " + ++ "a pickle is forbidden.") + for i in range(self.BLOCK_SIZE): + try: + result.append(pickle.load(stream)) +--- a/nltk/data.py ++++ b/nltk/data.py +@@ -752,6 +752,8 @@ def load( + if format == "raw": + resource_val = opened_resource.read() + elif format == "pickle": ++ raise RuntimeError("Due to gh#nltk/nltk#3266, deserializing from " + ++ "a pickle is forbidden.") + resource_val = pickle.load(opened_resource) + elif format == "json": + import json +--- a/nltk/parse/transitionparser.py ++++ b/nltk/parse/transitionparser.py +@@ -553,6 +553,8 @@ class TransitionParser(ParserI): + """ + result = [] + # First load the model ++ raise RuntimeError("Due to gh#nltk/nltk#3266, deserializing from " + ++ "a pickle is forbidden.") + model = pickle.load(open(modelFile, "rb")) + operation = Transition(self._algorithm) + +--- a/nltk/tbl/demo.py ++++ b/nltk/tbl/demo.py +@@ -253,6 +253,8 @@ def postag( + ) + ) + with open(cache_baseline_tagger) as print_rules: ++ raise RuntimeError("Due to gh#nltk/nltk#3266, deserializing from " + ++ "a pickle is forbidden.") + baseline_tagger = pickle.load(print_rules) + print(f"Reloaded pickled tagger from {cache_baseline_tagger}") + else: +@@ -327,7 +329,7 @@ def postag( + with open(serialize_output) as print_rules: + brill_tagger_reloaded = pickle.load(print_rules) + print(f"Reloaded pickled tagger from {serialize_output}") +- taggedtest_reloaded = brill_tagger.tag_sents(testing_data) ++ taggedtest_reloaded = 
brill_tagger_reloaded.tag_sents(testing_data) + if taggedtest == taggedtest_reloaded: + print("Reloaded tagger tried on test set, results identical") + else: diff --git a/nltk-3.8.1.tar.gz b/nltk-3.8.1.tar.gz new file mode 100644 index 0000000..47e3c7c --- /dev/null +++ b/nltk-3.8.1.tar.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:676970e2b7aa0a7184e68f76e0c4f2756fd1b82559a509d5656a23117faeb658 +size 2867926 diff --git a/nltk-3.8.1.zip b/nltk-3.8.1.zip deleted file mode 100644 index 79456a9..0000000 --- a/nltk-3.8.1.zip +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1834da3d0682cba4f2cede2f9aad6b0fafb6461ba451db0efb6f9c39798d64d3 -size 4620388 diff --git a/nltk-pr3207-py312.patch b/nltk-pr3207-py312.patch index 75f2bff..8524834 100644 --- a/nltk-pr3207-py312.patch +++ b/nltk-pr3207-py312.patch @@ -4,729 +4,62 @@ Date: Thu, 16 Nov 2023 19:00:15 +0100 Subject: [PATCH 1/8] ci: enable 3.12 in ci tests --- - .github/workflows/ci.yaml | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) + .github/workflows/ci.yaml | 2 +- + README.md | 2 +- + nltk/test/unit/translate/test_bleu.py | 1 - + nltk/translate/bleu_score.py | 29 +++++++++++++++++++++++++++-- + setup.py | 3 ++- + 5 files changed, 31 insertions(+), 6 deletions(-) -Index: nltk-3.8.1/nltk/test/unit/translate/test_bleu.py -=================================================================== ---- nltk-3.8.1.orig/nltk/test/unit/translate/test_bleu.py -+++ nltk-3.8.1/nltk/test/unit/translate/test_bleu.py +--- a/.github/workflows/ci.yaml ++++ b/.github/workflows/ci.yaml +@@ -76,7 +76,7 @@ jobs: + needs: [cache_nltk_data, cache_third_party] + strategy: + matrix: +- python-version: ['3.7', '3.8', '3.9', '3.10', '3.11'] ++ python-version: ['3.7', '3.8', '3.9', '3.10', '3.11', '3.12'] + os: [ubuntu-latest, macos-latest, windows-latest] + fail-fast: false + runs-on: ${{ matrix.os }} +--- a/README.md ++++ b/README.md +@@ -4,7 +4,7 @@ + + NLTK -- the Natural Language Toolkit -- is a suite of open source Python + modules, data sets, and tutorials supporting research and development in Natural +-Language Processing. NLTK requires Python version 3.7, 3.8, 3.9, 3.10 or 3.11. ++Language Processing. NLTK requires Python version 3.7, 3.8, 3.9, 3.10, 3.11 or 3.12. + + For documentation, please visit [nltk.org](https://www.nltk.org/). 
+ +--- a/nltk/test/unit/translate/test_bleu.py ++++ b/nltk/test/unit/translate/test_bleu.py @@ -2,7 +2,6 @@ - Tests for BLEU translation evaluation metric - """ - --import io - import unittest - - from nltk.data import find -Index: nltk-3.8.1/nltk/translate/bleu_score.py -=================================================================== ---- nltk-3.8.1.orig/nltk/translate/bleu_score.py -+++ nltk-3.8.1/nltk/translate/bleu_score.py -@@ -1,685 +1,710 @@ --# Natural Language Toolkit: BLEU Score --# --# Copyright (C) 2001-2023 NLTK Project --# Authors: Chin Yee Lee, Hengfeng Li, Ruxin Hou, Calvin Tanujaya Lim --# Contributors: Björn Mattsson, Dmitrijs Milajevs, Liling Tan --# URL: --# For license information, see LICENSE.TXT -- --"""BLEU score implementation.""" -- --import math --import sys --import warnings --from collections import Counter --from fractions import Fraction -- --from nltk.util import ngrams -- -- --def sentence_bleu( -- references, -- hypothesis, -- weights=(0.25, 0.25, 0.25, 0.25), -- smoothing_function=None, -- auto_reweigh=False, --): -- """ -- Calculate BLEU score (Bilingual Evaluation Understudy) from -- Papineni, Kishore, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002. -- "BLEU: a method for automatic evaluation of machine translation." -- In Proceedings of ACL. https://www.aclweb.org/anthology/P02-1040.pdf -- -- >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', -- ... 'ensures', 'that', 'the', 'military', 'always', -- ... 'obeys', 'the', 'commands', 'of', 'the', 'party'] -- -- >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops', -- ... 'forever', 'hearing', 'the', 'activity', 'guidebook', -- ... 'that', 'party', 'direct'] -- -- >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', -- ... 'ensures', 'that', 'the', 'military', 'will', 'forever', -- ... 'heed', 'Party', 'commands'] -- -- >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which', -- ... 'guarantees', 'the', 'military', 'forces', 'always', -- ... 'being', 'under', 'the', 'command', 'of', 'the', -- ... 'Party'] -- -- >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', -- ... 'army', 'always', 'to', 'heed', 'the', 'directions', -- ... 'of', 'the', 'party'] -- -- >>> sentence_bleu([reference1, reference2, reference3], hypothesis1) # doctest: +ELLIPSIS -- 0.5045... -- -- If there is no ngrams overlap for any order of n-grams, BLEU returns the -- value 0. This is because the precision for the order of n-grams without -- overlap is 0, and the geometric mean in the final BLEU score computation -- multiplies the 0 with the precision of other n-grams. This results in 0 -- (independently of the precision of the other n-gram orders). The following -- example has zero 3-gram and 4-gram overlaps: -- -- >>> round(sentence_bleu([reference1, reference2, reference3], hypothesis2),4) # doctest: +ELLIPSIS -- 0.0 -- -- To avoid this harsh behaviour when no ngram overlaps are found a smoothing -- function can be used. -- -- >>> chencherry = SmoothingFunction() -- >>> sentence_bleu([reference1, reference2, reference3], hypothesis2, -- ... smoothing_function=chencherry.method1) # doctest: +ELLIPSIS -- 0.0370... -- -- The default BLEU calculates a score for up to 4-grams using uniform -- weights (this is called BLEU-4). To evaluate your translations with -- higher/lower order ngrams, use customized weights. E.g. when accounting -- for up to 5-grams with uniform weights (this is called BLEU-5) use: -- -- >>> weights = (1./5., 1./5., 1./5., 1./5., 1./5.) 
-- >>> sentence_bleu([reference1, reference2, reference3], hypothesis1, weights) # doctest: +ELLIPSIS -- 0.3920... -- -- Multiple BLEU scores can be computed at once, by supplying a list of weights. -- E.g. for computing BLEU-2, BLEU-3 *and* BLEU-4 in one computation, use: -- >>> weights = [ -- ... (1./2., 1./2.), -- ... (1./3., 1./3., 1./3.), -- ... (1./4., 1./4., 1./4., 1./4.) -- ... ] -- >>> sentence_bleu([reference1, reference2, reference3], hypothesis1, weights) # doctest: +ELLIPSIS -- [0.7453..., 0.6240..., 0.5045...] -- -- :param references: reference sentences -- :type references: list(list(str)) -- :param hypothesis: a hypothesis sentence -- :type hypothesis: list(str) -- :param weights: weights for unigrams, bigrams, trigrams and so on (one or a list of weights) -- :type weights: tuple(float) / list(tuple(float)) -- :param smoothing_function: -- :type smoothing_function: SmoothingFunction -- :param auto_reweigh: Option to re-normalize the weights uniformly. -- :type auto_reweigh: bool -- :return: The sentence-level BLEU score. Returns a list if multiple weights were supplied. -- :rtype: float / list(float) -- """ -- return corpus_bleu( -- [references], [hypothesis], weights, smoothing_function, auto_reweigh -- ) -- -- --def corpus_bleu( -- list_of_references, -- hypotheses, -- weights=(0.25, 0.25, 0.25, 0.25), -- smoothing_function=None, -- auto_reweigh=False, --): -- """ -- Calculate a single corpus-level BLEU score (aka. system-level BLEU) for all -- the hypotheses and their respective references. -- -- Instead of averaging the sentence level BLEU scores (i.e. macro-average -- precision), the original BLEU metric (Papineni et al. 2002) accounts for -- the micro-average precision (i.e. summing the numerators and denominators -- for each hypothesis-reference(s) pairs before the division). -- -- >>> hyp1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', -- ... 'ensures', 'that', 'the', 'military', 'always', -- ... 'obeys', 'the', 'commands', 'of', 'the', 'party'] -- >>> ref1a = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', -- ... 'ensures', 'that', 'the', 'military', 'will', 'forever', -- ... 'heed', 'Party', 'commands'] -- >>> ref1b = ['It', 'is', 'the', 'guiding', 'principle', 'which', -- ... 'guarantees', 'the', 'military', 'forces', 'always', -- ... 'being', 'under', 'the', 'command', 'of', 'the', 'Party'] -- >>> ref1c = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', -- ... 'army', 'always', 'to', 'heed', 'the', 'directions', -- ... 'of', 'the', 'party'] -- -- >>> hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was', -- ... 'interested', 'in', 'world', 'history'] -- >>> ref2a = ['he', 'was', 'interested', 'in', 'world', 'history', -- ... 'because', 'he', 'read', 'the', 'book'] -- -- >>> list_of_references = [[ref1a, ref1b, ref1c], [ref2a]] -- >>> hypotheses = [hyp1, hyp2] -- >>> corpus_bleu(list_of_references, hypotheses) # doctest: +ELLIPSIS -- 0.5920... -- -- The example below show that corpus_bleu() is different from averaging -- sentence_bleu() for hypotheses -- -- >>> score1 = sentence_bleu([ref1a, ref1b, ref1c], hyp1) -- >>> score2 = sentence_bleu([ref2a], hyp2) -- >>> (score1 + score2) / 2 # doctest: +ELLIPSIS -- 0.6223... -- -- Custom weights may be supplied to fine-tune the BLEU score further. -- A tuple of float weights for unigrams, bigrams, trigrams and so on can be given. -- >>> weights = (0.1, 0.3, 0.5, 0.1) -- >>> corpus_bleu(list_of_references, hypotheses, weights=weights) # doctest: +ELLIPSIS -- 0.5818... 
-- -- This particular weight gave extra value to trigrams. -- Furthermore, multiple weights can be given, resulting in multiple BLEU scores. -- >>> weights = [ -- ... (0.5, 0.5), -- ... (0.333, 0.333, 0.334), -- ... (0.25, 0.25, 0.25, 0.25), -- ... (0.2, 0.2, 0.2, 0.2, 0.2) -- ... ] -- >>> corpus_bleu(list_of_references, hypotheses, weights=weights) # doctest: +ELLIPSIS -- [0.8242..., 0.7067..., 0.5920..., 0.4719...] -- -- :param list_of_references: a corpus of lists of reference sentences, w.r.t. hypotheses -- :type list_of_references: list(list(list(str))) -- :param hypotheses: a list of hypothesis sentences -- :type hypotheses: list(list(str)) -- :param weights: weights for unigrams, bigrams, trigrams and so on (one or a list of weights) -- :type weights: tuple(float) / list(tuple(float)) -- :param smoothing_function: -- :type smoothing_function: SmoothingFunction -- :param auto_reweigh: Option to re-normalize the weights uniformly. -- :type auto_reweigh: bool -- :return: The corpus-level BLEU score. -- :rtype: float -- """ -- # Before proceeding to compute BLEU, perform sanity checks. -- -- p_numerators = Counter() # Key = ngram order, and value = no. of ngram matches. -- p_denominators = Counter() # Key = ngram order, and value = no. of ngram in ref. -- hyp_lengths, ref_lengths = 0, 0 -- -- assert len(list_of_references) == len(hypotheses), ( -- "The number of hypotheses and their reference(s) should be the " "same " -- ) -- -- try: -- weights[0][0] -- except TypeError: -- weights = [weights] -- max_weight_length = max(len(weight) for weight in weights) -- -- # Iterate through each hypothesis and their corresponding references. -- for references, hypothesis in zip(list_of_references, hypotheses): -- # For each order of ngram, calculate the numerator and -- # denominator for the corpus-level modified precision. -- for i in range(1, max_weight_length + 1): -- p_i = modified_precision(references, hypothesis, i) -- p_numerators[i] += p_i.numerator -- p_denominators[i] += p_i.denominator -- -- # Calculate the hypothesis length and the closest reference length. -- # Adds them to the corpus-level hypothesis and reference counts. -- hyp_len = len(hypothesis) -- hyp_lengths += hyp_len -- ref_lengths += closest_ref_length(references, hyp_len) -- -- # Calculate corpus-level brevity penalty. -- bp = brevity_penalty(ref_lengths, hyp_lengths) -- -- # Collects the various precision values for the different ngram orders. -- p_n = [ -- Fraction(p_numerators[i], p_denominators[i], _normalize=False) -- for i in range(1, max_weight_length + 1) -- ] -- -- # Returns 0 if there's no matching n-grams -- # We only need to check for p_numerators[1] == 0, since if there's -- # no unigrams, there won't be any higher order ngrams. -- if p_numerators[1] == 0: -- return 0 if len(weights) == 1 else [0] * len(weights) -- -- # If there's no smoothing, set use method0 from SmoothinFunction class. -- if not smoothing_function: -- smoothing_function = SmoothingFunction().method0 -- # Smoothen the modified precision. -- # Note: smoothing_function() may convert values into floats; -- # it tries to retain the Fraction object as much as the -- # smoothing method allows. -- p_n = smoothing_function( -- p_n, references=references, hypothesis=hypothesis, hyp_len=hyp_lengths -- ) -- -- bleu_scores = [] -- for weight in weights: -- # Uniformly re-weighting based on maximum hypothesis lengths if largest -- # order of n-grams < 4 and weights is set at default. 
-- if auto_reweigh: -- if hyp_lengths < 4 and weight == (0.25, 0.25, 0.25, 0.25): -- weight = (1 / hyp_lengths,) * hyp_lengths -- -- s = (w_i * math.log(p_i) for w_i, p_i in zip(weight, p_n) if p_i > 0) -- s = bp * math.exp(math.fsum(s)) -- bleu_scores.append(s) -- return bleu_scores[0] if len(weights) == 1 else bleu_scores -- -- --def modified_precision(references, hypothesis, n): -- """ -- Calculate modified ngram precision. -- -- The normal precision method may lead to some wrong translations with -- high-precision, e.g., the translation, in which a word of reference -- repeats several times, has very high precision. -- -- This function only returns the Fraction object that contains the numerator -- and denominator necessary to calculate the corpus-level precision. -- To calculate the modified precision for a single pair of hypothesis and -- references, cast the Fraction object into a float. -- -- The famous "the the the ... " example shows that you can get BLEU precision -- by duplicating high frequency words. -- -- >>> reference1 = 'the cat is on the mat'.split() -- >>> reference2 = 'there is a cat on the mat'.split() -- >>> hypothesis1 = 'the the the the the the the'.split() -- >>> references = [reference1, reference2] -- >>> float(modified_precision(references, hypothesis1, n=1)) # doctest: +ELLIPSIS -- 0.2857... -- -- In the modified n-gram precision, a reference word will be considered -- exhausted after a matching hypothesis word is identified, e.g. -- -- >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', -- ... 'ensures', 'that', 'the', 'military', 'will', -- ... 'forever', 'heed', 'Party', 'commands'] -- >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which', -- ... 'guarantees', 'the', 'military', 'forces', 'always', -- ... 'being', 'under', 'the', 'command', 'of', 'the', -- ... 'Party'] -- >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', -- ... 'army', 'always', 'to', 'heed', 'the', 'directions', -- ... 'of', 'the', 'party'] -- >>> hypothesis = 'of the'.split() -- >>> references = [reference1, reference2, reference3] -- >>> float(modified_precision(references, hypothesis, n=1)) -- 1.0 -- >>> float(modified_precision(references, hypothesis, n=2)) -- 1.0 -- -- An example of a normal machine translation hypothesis: -- -- >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', -- ... 'ensures', 'that', 'the', 'military', 'always', -- ... 'obeys', 'the', 'commands', 'of', 'the', 'party'] -- -- >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops', -- ... 'forever', 'hearing', 'the', 'activity', 'guidebook', -- ... 'that', 'party', 'direct'] -- -- >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', -- ... 'ensures', 'that', 'the', 'military', 'will', -- ... 'forever', 'heed', 'Party', 'commands'] -- -- >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which', -- ... 'guarantees', 'the', 'military', 'forces', 'always', -- ... 'being', 'under', 'the', 'command', 'of', 'the', -- ... 'Party'] -- -- >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', -- ... 'army', 'always', 'to', 'heed', 'the', 'directions', -- ... 'of', 'the', 'party'] -- >>> references = [reference1, reference2, reference3] -- >>> float(modified_precision(references, hypothesis1, n=1)) # doctest: +ELLIPSIS -- 0.9444... -- >>> float(modified_precision(references, hypothesis2, n=1)) # doctest: +ELLIPSIS -- 0.5714... 
-- >>> float(modified_precision(references, hypothesis1, n=2)) # doctest: +ELLIPSIS -- 0.5882352941176471 -- >>> float(modified_precision(references, hypothesis2, n=2)) # doctest: +ELLIPSIS -- 0.07692... -- -- -- :param references: A list of reference translations. -- :type references: list(list(str)) -- :param hypothesis: A hypothesis translation. -- :type hypothesis: list(str) -- :param n: The ngram order. -- :type n: int -- :return: BLEU's modified precision for the nth order ngram. -- :rtype: Fraction -- """ -- # Extracts all ngrams in hypothesis -- # Set an empty Counter if hypothesis is empty. -- counts = Counter(ngrams(hypothesis, n)) if len(hypothesis) >= n else Counter() -- # Extract a union of references' counts. -- # max_counts = reduce(or_, [Counter(ngrams(ref, n)) for ref in references]) -- max_counts = {} -- for reference in references: -- reference_counts = ( -- Counter(ngrams(reference, n)) if len(reference) >= n else Counter() -- ) -- for ngram in counts: -- max_counts[ngram] = max(max_counts.get(ngram, 0), reference_counts[ngram]) -- -- # Assigns the intersection between hypothesis and references' counts. -- clipped_counts = { -- ngram: min(count, max_counts[ngram]) for ngram, count in counts.items() -- } -- -- numerator = sum(clipped_counts.values()) -- # Ensures that denominator is minimum 1 to avoid ZeroDivisionError. -- # Usually this happens when the ngram order is > len(reference). -- denominator = max(1, sum(counts.values())) -- -- return Fraction(numerator, denominator, _normalize=False) -- -- --def closest_ref_length(references, hyp_len): -- """ -- This function finds the reference that is the closest length to the -- hypothesis. The closest reference length is referred to as *r* variable -- from the brevity penalty formula in Papineni et. al. (2002) -- -- :param references: A list of reference translations. -- :type references: list(list(str)) -- :param hyp_len: The length of the hypothesis. -- :type hyp_len: int -- :return: The length of the reference that's closest to the hypothesis. -- :rtype: int -- """ -- ref_lens = (len(reference) for reference in references) -- closest_ref_len = min( -- ref_lens, key=lambda ref_len: (abs(ref_len - hyp_len), ref_len) -- ) -- return closest_ref_len -- -- --def brevity_penalty(closest_ref_len, hyp_len): -- """ -- Calculate brevity penalty. -- -- As the modified n-gram precision still has the problem from the short -- length sentence, brevity penalty is used to modify the overall BLEU -- score according to length. -- -- An example from the paper. There are three references with length 12, 15 -- and 17. And a concise hypothesis of the length 12. The brevity penalty is 1. -- -- >>> reference1 = list('aaaaaaaaaaaa') # i.e. ['a'] * 12 -- >>> reference2 = list('aaaaaaaaaaaaaaa') # i.e. ['a'] * 15 -- >>> reference3 = list('aaaaaaaaaaaaaaaaa') # i.e. ['a'] * 17 -- >>> hypothesis = list('aaaaaaaaaaaa') # i.e. ['a'] * 12 -- >>> references = [reference1, reference2, reference3] -- >>> hyp_len = len(hypothesis) -- >>> closest_ref_len = closest_ref_length(references, hyp_len) -- >>> brevity_penalty(closest_ref_len, hyp_len) -- 1.0 -- -- In case a hypothesis translation is shorter than the references, penalty is -- applied. 
-- -- >>> references = [['a'] * 28, ['a'] * 28] -- >>> hypothesis = ['a'] * 12 -- >>> hyp_len = len(hypothesis) -- >>> closest_ref_len = closest_ref_length(references, hyp_len) -- >>> brevity_penalty(closest_ref_len, hyp_len) -- 0.2635971381157267 -- -- The length of the closest reference is used to compute the penalty. If the -- length of a hypothesis is 12, and the reference lengths are 13 and 2, the -- penalty is applied because the hypothesis length (12) is less then the -- closest reference length (13). -- -- >>> references = [['a'] * 13, ['a'] * 2] -- >>> hypothesis = ['a'] * 12 -- >>> hyp_len = len(hypothesis) -- >>> closest_ref_len = closest_ref_length(references, hyp_len) -- >>> brevity_penalty(closest_ref_len, hyp_len) # doctest: +ELLIPSIS -- 0.9200... -- -- The brevity penalty doesn't depend on reference order. More importantly, -- when two reference sentences are at the same distance, the shortest -- reference sentence length is used. -- -- >>> references = [['a'] * 13, ['a'] * 11] -- >>> hypothesis = ['a'] * 12 -- >>> hyp_len = len(hypothesis) -- >>> closest_ref_len = closest_ref_length(references, hyp_len) -- >>> bp1 = brevity_penalty(closest_ref_len, hyp_len) -- >>> hyp_len = len(hypothesis) -- >>> closest_ref_len = closest_ref_length(reversed(references), hyp_len) -- >>> bp2 = brevity_penalty(closest_ref_len, hyp_len) -- >>> bp1 == bp2 == 1 -- True -- -- A test example from mteval-v13a.pl (starting from the line 705): -- -- >>> references = [['a'] * 11, ['a'] * 8] -- >>> hypothesis = ['a'] * 7 -- >>> hyp_len = len(hypothesis) -- >>> closest_ref_len = closest_ref_length(references, hyp_len) -- >>> brevity_penalty(closest_ref_len, hyp_len) # doctest: +ELLIPSIS -- 0.8668... -- -- >>> references = [['a'] * 11, ['a'] * 8, ['a'] * 6, ['a'] * 7] -- >>> hypothesis = ['a'] * 7 -- >>> hyp_len = len(hypothesis) -- >>> closest_ref_len = closest_ref_length(references, hyp_len) -- >>> brevity_penalty(closest_ref_len, hyp_len) -- 1.0 -- -- :param hyp_len: The length of the hypothesis for a single sentence OR the -- sum of all the hypotheses' lengths for a corpus -- :type hyp_len: int -- :param closest_ref_len: The length of the closest reference for a single -- hypothesis OR the sum of all the closest references for every hypotheses. -- :type closest_ref_len: int -- :return: BLEU's brevity penalty. -- :rtype: float -- """ -- if hyp_len > closest_ref_len: -- return 1 -- # If hypothesis is empty, brevity penalty = 0 should result in BLEU = 0.0 -- elif hyp_len == 0: -- return 0 -- else: -- return math.exp(1 - closest_ref_len / hyp_len) -- -- --class SmoothingFunction: -- """ -- This is an implementation of the smoothing techniques -- for segment-level BLEU scores that was presented in -- Boxing Chen and Collin Cherry (2014) A Systematic Comparison of -- Smoothing Techniques for Sentence-Level BLEU. In WMT14. -- http://acl2014.org/acl2014/W14-33/pdf/W14-3346.pdf -- """ -- -- def __init__(self, epsilon=0.1, alpha=5, k=5): -- """ -- This will initialize the parameters required for the various smoothing -- techniques, the default values are set to the numbers used in the -- experiments from Chen and Cherry (2014). -- -- >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', 'ensures', -- ... 'that', 'the', 'military', 'always', 'obeys', 'the', -- ... 'commands', 'of', 'the', 'party'] -- >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', 'ensures', -- ... 'that', 'the', 'military', 'will', 'forever', 'heed', -- ... 
'Party', 'commands'] -- -- >>> chencherry = SmoothingFunction() -- >>> print(sentence_bleu([reference1], hypothesis1)) # doctest: +ELLIPSIS -- 0.4118... -- >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method0)) # doctest: +ELLIPSIS -- 0.4118... -- >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method1)) # doctest: +ELLIPSIS -- 0.4118... -- >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method2)) # doctest: +ELLIPSIS -- 0.4452... -- >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method3)) # doctest: +ELLIPSIS -- 0.4118... -- >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method4)) # doctest: +ELLIPSIS -- 0.4118... -- >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method5)) # doctest: +ELLIPSIS -- 0.4905... -- >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method6)) # doctest: +ELLIPSIS -- 0.4135... -- >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method7)) # doctest: +ELLIPSIS -- 0.4905... -- -- :param epsilon: the epsilon value use in method 1 -- :type epsilon: float -- :param alpha: the alpha value use in method 6 -- :type alpha: int -- :param k: the k value use in method 4 -- :type k: int -- """ -- self.epsilon = epsilon -- self.alpha = alpha -- self.k = k -- -- def method0(self, p_n, *args, **kwargs): -- """ -- No smoothing. -- """ -- p_n_new = [] -- for i, p_i in enumerate(p_n): -- if p_i.numerator != 0: -- p_n_new.append(p_i) -- else: -- _msg = str( -- "\nThe hypothesis contains 0 counts of {}-gram overlaps.\n" -- "Therefore the BLEU score evaluates to 0, independently of\n" -- "how many N-gram overlaps of lower order it contains.\n" -- "Consider using lower n-gram order or use " -- "SmoothingFunction()" -- ).format(i + 1) -- warnings.warn(_msg) -- # When numerator==0 where denonminator==0 or !=0, the result -- # for the precision score should be equal to 0 or undefined. -- # Due to BLEU geometric mean computation in logarithm space, -- # we we need to take the return sys.float_info.min such that -- # math.log(sys.float_info.min) returns a 0 precision score. -- p_n_new.append(sys.float_info.min) -- return p_n_new -- -- def method1(self, p_n, *args, **kwargs): -- """ -- Smoothing method 1: Add *epsilon* counts to precision with 0 counts. -- """ -- return [ -- (p_i.numerator + self.epsilon) / p_i.denominator -- if p_i.numerator == 0 -- else p_i -- for p_i in p_n -- ] -- -- def method2(self, p_n, *args, **kwargs): -- """ -- Smoothing method 2: Add 1 to both numerator and denominator from -- Chin-Yew Lin and Franz Josef Och (2004) ORANGE: a Method for -- Evaluating Automatic Evaluation Metrics for Machine Translation. -- In COLING 2004. -- """ -- return [ -- Fraction(p_n[i].numerator + 1, p_n[i].denominator + 1, _normalize=False) -- if i != 0 -- else p_n[0] -- for i in range(len(p_n)) -- ] -- -- def method3(self, p_n, *args, **kwargs): -- """ -- Smoothing method 3: NIST geometric sequence smoothing -- The smoothing is computed by taking 1 / ( 2^k ), instead of 0, for each -- precision score whose matching n-gram count is null. 
-- k is 1 for the first 'n' value for which the n-gram match count is null/ -- -- For example, if the text contains: -- -- - one 2-gram match -- - and (consequently) two 1-gram matches -- -- the n-gram count for each individual precision score would be: -- -- - n=1 => prec_count = 2 (two unigrams) -- - n=2 => prec_count = 1 (one bigram) -- - n=3 => prec_count = 1/2 (no trigram, taking 'smoothed' value of 1 / ( 2^k ), with k=1) -- - n=4 => prec_count = 1/4 (no fourgram, taking 'smoothed' value of 1 / ( 2^k ), with k=2) -- """ -- incvnt = 1 # From the mteval-v13a.pl, it's referred to as k. -- for i, p_i in enumerate(p_n): -- if p_i.numerator == 0: -- p_n[i] = 1 / (2**incvnt * p_i.denominator) -- incvnt += 1 -- return p_n -- -- def method4(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs): -- """ -- Smoothing method 4: -- Shorter translations may have inflated precision values due to having -- smaller denominators; therefore, we give them proportionally -- smaller smoothed counts. Instead of scaling to 1/(2^k), Chen and Cherry -- suggests dividing by 1/ln(len(T)), where T is the length of the translation. -- """ -- incvnt = 1 -- hyp_len = hyp_len if hyp_len else len(hypothesis) -- for i, p_i in enumerate(p_n): -- if p_i.numerator == 0 and hyp_len > 1: -- # incvnt = i + 1 * self.k / math.log( -- # hyp_len -- # ) # Note that this K is different from the K from NIST. -- # p_n[i] = incvnt / p_i.denominator\ -- numerator = 1 / (2**incvnt * self.k / math.log(hyp_len)) -- p_n[i] = numerator / p_i.denominator -- incvnt += 1 -- return p_n -- -- def method5(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs): -- """ -- Smoothing method 5: -- The matched counts for similar values of n should be similar. To a -- calculate the n-gram matched count, it averages the n−1, n and n+1 gram -- matched counts. -- """ -- hyp_len = hyp_len if hyp_len else len(hypothesis) -- m = {} -- # Requires an precision value for an addition ngram order. -- p_n_plus1 = p_n + [modified_precision(references, hypothesis, 5)] -- m[-1] = p_n[0] + 1 -- for i, p_i in enumerate(p_n): -- p_n[i] = (m[i - 1] + p_i + p_n_plus1[i + 1]) / 3 -- m[i] = p_n[i] -- return p_n -- -- def method6(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs): -- """ -- Smoothing method 6: -- Interpolates the maximum likelihood estimate of the precision *p_n* with -- a prior estimate *pi0*. The prior is estimated by assuming that the ratio -- between pn and pn−1 will be the same as that between pn−1 and pn−2; from -- Gao and He (2013) Training MRF-Based Phrase Translation Models using -- Gradient Ascent. In NAACL. -- """ -- hyp_len = hyp_len if hyp_len else len(hypothesis) -- # This smoothing only works when p_1 and p_2 is non-zero. -- # Raise an error with an appropriate message when the input is too short -- # to use this smoothing technique. -- assert p_n[2], "This smoothing method requires non-zero precision for bigrams." -- for i, p_i in enumerate(p_n): -- if i in [0, 1]: # Skips the first 2 orders of ngrams. -- continue -- else: -- pi0 = 0 if p_n[i - 2] == 0 else p_n[i - 1] ** 2 / p_n[i - 2] -- # No. of ngrams in translation that matches the reference. -- m = p_i.numerator -- # No. of ngrams in translation. -- l = sum(1 for _ in ngrams(hypothesis, i + 1)) -- # Calculates the interpolated precision. -- p_n[i] = (m + self.alpha * pi0) / (l + self.alpha) -- return p_n -- -- def method7(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs): -- """ -- Smoothing method 7: -- Interpolates methods 4 and 5. 
-- """ -- hyp_len = hyp_len if hyp_len else len(hypothesis) -- p_n = self.method4(p_n, references, hypothesis, hyp_len) -- p_n = self.method5(p_n, references, hypothesis, hyp_len) -- return p_n -+# Natural Language Toolkit: BLEU Score -+# -+# Copyright (C) 2001-2023 NLTK Project -+# Authors: Chin Yee Lee, Hengfeng Li, Ruxin Hou, Calvin Tanujaya Lim -+# Contributors: Björn Mattsson, Dmitrijs Milajevs, Liling Tan -+# URL: -+# For license information, see LICENSE.TXT -+ -+"""BLEU score implementation.""" -+import math -+import sys -+import warnings -+from collections import Counter + Tests for BLEU translation evaluation metric + """ + +-import io + import unittest + + from nltk.data import find +--- a/nltk/translate/bleu_score.py ++++ b/nltk/translate/bleu_score.py +@@ -7,16 +7,41 @@ + # For license information, see LICENSE.TXT + + """BLEU score implementation.""" +- + import math + import sys + import warnings + from collections import Counter +-from fractions import Fraction +from fractions import Fraction as _Fraction -+ -+from nltk.util import ngrams -+ -+ + + from nltk.util import ngrams + + +class Fraction(_Fraction): + """Fraction with _normalize=False support for 3.12""" + @@ -753,1030 +86,25 @@ Index: nltk-3.8.1/nltk/translate/bleu_score.py + return super().denominator + + -+def sentence_bleu( -+ references, -+ hypothesis, -+ weights=(0.25, 0.25, 0.25, 0.25), -+ smoothing_function=None, -+ auto_reweigh=False, -+): -+ """ -+ Calculate BLEU score (Bilingual Evaluation Understudy) from -+ Papineni, Kishore, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002. -+ "BLEU: a method for automatic evaluation of machine translation." -+ In Proceedings of ACL. https://www.aclweb.org/anthology/P02-1040.pdf -+ -+ >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', -+ ... 'ensures', 'that', 'the', 'military', 'always', -+ ... 'obeys', 'the', 'commands', 'of', 'the', 'party'] -+ -+ >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops', -+ ... 'forever', 'hearing', 'the', 'activity', 'guidebook', -+ ... 'that', 'party', 'direct'] -+ -+ >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', -+ ... 'ensures', 'that', 'the', 'military', 'will', 'forever', -+ ... 'heed', 'Party', 'commands'] -+ -+ >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which', -+ ... 'guarantees', 'the', 'military', 'forces', 'always', -+ ... 'being', 'under', 'the', 'command', 'of', 'the', -+ ... 'Party'] -+ -+ >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', -+ ... 'army', 'always', 'to', 'heed', 'the', 'directions', -+ ... 'of', 'the', 'party'] -+ -+ >>> sentence_bleu([reference1, reference2, reference3], hypothesis1) # doctest: +ELLIPSIS -+ 0.5045... -+ -+ If there is no ngrams overlap for any order of n-grams, BLEU returns the -+ value 0. This is because the precision for the order of n-grams without -+ overlap is 0, and the geometric mean in the final BLEU score computation -+ multiplies the 0 with the precision of other n-grams. This results in 0 -+ (independently of the precision of the other n-gram orders). The following -+ example has zero 3-gram and 4-gram overlaps: -+ -+ >>> round(sentence_bleu([reference1, reference2, reference3], hypothesis2),4) # doctest: +ELLIPSIS -+ 0.0 -+ -+ To avoid this harsh behaviour when no ngram overlaps are found a smoothing -+ function can be used. -+ -+ >>> chencherry = SmoothingFunction() -+ >>> sentence_bleu([reference1, reference2, reference3], hypothesis2, -+ ... 
smoothing_function=chencherry.method1) # doctest: +ELLIPSIS -+ 0.0370... -+ -+ The default BLEU calculates a score for up to 4-grams using uniform -+ weights (this is called BLEU-4). To evaluate your translations with -+ higher/lower order ngrams, use customized weights. E.g. when accounting -+ for up to 5-grams with uniform weights (this is called BLEU-5) use: -+ -+ >>> weights = (1./5., 1./5., 1./5., 1./5., 1./5.) -+ >>> sentence_bleu([reference1, reference2, reference3], hypothesis1, weights) # doctest: +ELLIPSIS -+ 0.3920... -+ -+ Multiple BLEU scores can be computed at once, by supplying a list of weights. -+ E.g. for computing BLEU-2, BLEU-3 *and* BLEU-4 in one computation, use: -+ >>> weights = [ -+ ... (1./2., 1./2.), -+ ... (1./3., 1./3., 1./3.), -+ ... (1./4., 1./4., 1./4., 1./4.) -+ ... ] -+ >>> sentence_bleu([reference1, reference2, reference3], hypothesis1, weights) # doctest: +ELLIPSIS -+ [0.7453..., 0.6240..., 0.5045...] -+ -+ :param references: reference sentences -+ :type references: list(list(str)) -+ :param hypothesis: a hypothesis sentence -+ :type hypothesis: list(str) -+ :param weights: weights for unigrams, bigrams, trigrams and so on (one or a list of weights) -+ :type weights: tuple(float) / list(tuple(float)) -+ :param smoothing_function: -+ :type smoothing_function: SmoothingFunction -+ :param auto_reweigh: Option to re-normalize the weights uniformly. -+ :type auto_reweigh: bool -+ :return: The sentence-level BLEU score. Returns a list if multiple weights were supplied. -+ :rtype: float / list(float) -+ """ -+ return corpus_bleu( -+ [references], [hypothesis], weights, smoothing_function, auto_reweigh -+ ) -+ -+ -+def corpus_bleu( -+ list_of_references, -+ hypotheses, -+ weights=(0.25, 0.25, 0.25, 0.25), -+ smoothing_function=None, -+ auto_reweigh=False, -+): -+ """ -+ Calculate a single corpus-level BLEU score (aka. system-level BLEU) for all -+ the hypotheses and their respective references. -+ -+ Instead of averaging the sentence level BLEU scores (i.e. macro-average -+ precision), the original BLEU metric (Papineni et al. 2002) accounts for -+ the micro-average precision (i.e. summing the numerators and denominators -+ for each hypothesis-reference(s) pairs before the division). -+ -+ >>> hyp1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', -+ ... 'ensures', 'that', 'the', 'military', 'always', -+ ... 'obeys', 'the', 'commands', 'of', 'the', 'party'] -+ >>> ref1a = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', -+ ... 'ensures', 'that', 'the', 'military', 'will', 'forever', -+ ... 'heed', 'Party', 'commands'] -+ >>> ref1b = ['It', 'is', 'the', 'guiding', 'principle', 'which', -+ ... 'guarantees', 'the', 'military', 'forces', 'always', -+ ... 'being', 'under', 'the', 'command', 'of', 'the', 'Party'] -+ >>> ref1c = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', -+ ... 'army', 'always', 'to', 'heed', 'the', 'directions', -+ ... 'of', 'the', 'party'] -+ -+ >>> hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was', -+ ... 'interested', 'in', 'world', 'history'] -+ >>> ref2a = ['he', 'was', 'interested', 'in', 'world', 'history', -+ ... 'because', 'he', 'read', 'the', 'book'] -+ -+ >>> list_of_references = [[ref1a, ref1b, ref1c], [ref2a]] -+ >>> hypotheses = [hyp1, hyp2] -+ >>> corpus_bleu(list_of_references, hypotheses) # doctest: +ELLIPSIS -+ 0.5920... 
-+ -+ The example below show that corpus_bleu() is different from averaging -+ sentence_bleu() for hypotheses -+ -+ >>> score1 = sentence_bleu([ref1a, ref1b, ref1c], hyp1) -+ >>> score2 = sentence_bleu([ref2a], hyp2) -+ >>> (score1 + score2) / 2 # doctest: +ELLIPSIS -+ 0.6223... -+ -+ Custom weights may be supplied to fine-tune the BLEU score further. -+ A tuple of float weights for unigrams, bigrams, trigrams and so on can be given. -+ >>> weights = (0.1, 0.3, 0.5, 0.1) -+ >>> corpus_bleu(list_of_references, hypotheses, weights=weights) # doctest: +ELLIPSIS -+ 0.5818... -+ -+ This particular weight gave extra value to trigrams. -+ Furthermore, multiple weights can be given, resulting in multiple BLEU scores. -+ >>> weights = [ -+ ... (0.5, 0.5), -+ ... (0.333, 0.333, 0.334), -+ ... (0.25, 0.25, 0.25, 0.25), -+ ... (0.2, 0.2, 0.2, 0.2, 0.2) -+ ... ] -+ >>> corpus_bleu(list_of_references, hypotheses, weights=weights) # doctest: +ELLIPSIS -+ [0.8242..., 0.7067..., 0.5920..., 0.4719...] -+ -+ :param list_of_references: a corpus of lists of reference sentences, w.r.t. hypotheses -+ :type list_of_references: list(list(list(str))) -+ :param hypotheses: a list of hypothesis sentences -+ :type hypotheses: list(list(str)) -+ :param weights: weights for unigrams, bigrams, trigrams and so on (one or a list of weights) -+ :type weights: tuple(float) / list(tuple(float)) -+ :param smoothing_function: -+ :type smoothing_function: SmoothingFunction -+ :param auto_reweigh: Option to re-normalize the weights uniformly. -+ :type auto_reweigh: bool -+ :return: The corpus-level BLEU score. -+ :rtype: float -+ """ -+ # Before proceeding to compute BLEU, perform sanity checks. -+ -+ p_numerators = Counter() # Key = ngram order, and value = no. of ngram matches. -+ p_denominators = Counter() # Key = ngram order, and value = no. of ngram in ref. -+ hyp_lengths, ref_lengths = 0, 0 -+ -+ assert len(list_of_references) == len(hypotheses), ( -+ "The number of hypotheses and their reference(s) should be the " "same " -+ ) -+ -+ try: -+ weights[0][0] -+ except TypeError: -+ weights = [weights] -+ max_weight_length = max(len(weight) for weight in weights) -+ -+ # Iterate through each hypothesis and their corresponding references. -+ for references, hypothesis in zip(list_of_references, hypotheses): -+ # For each order of ngram, calculate the numerator and -+ # denominator for the corpus-level modified precision. -+ for i in range(1, max_weight_length + 1): -+ p_i = modified_precision(references, hypothesis, i) -+ p_numerators[i] += p_i.numerator -+ p_denominators[i] += p_i.denominator -+ -+ # Calculate the hypothesis length and the closest reference length. -+ # Adds them to the corpus-level hypothesis and reference counts. -+ hyp_len = len(hypothesis) -+ hyp_lengths += hyp_len -+ ref_lengths += closest_ref_length(references, hyp_len) -+ -+ # Calculate corpus-level brevity penalty. -+ bp = brevity_penalty(ref_lengths, hyp_lengths) -+ -+ # Collects the various precision values for the different ngram orders. -+ p_n = [ -+ Fraction(p_numerators[i], p_denominators[i], _normalize=False) -+ for i in range(1, max_weight_length + 1) -+ ] -+ -+ # Returns 0 if there's no matching n-grams -+ # We only need to check for p_numerators[1] == 0, since if there's -+ # no unigrams, there won't be any higher order ngrams. -+ if p_numerators[1] == 0: -+ return 0 if len(weights) == 1 else [0] * len(weights) -+ -+ # If there's no smoothing, set use method0 from SmoothinFunction class. 
-+ if not smoothing_function: -+ smoothing_function = SmoothingFunction().method0 -+ # Smoothen the modified precision. -+ # Note: smoothing_function() may convert values into floats; -+ # it tries to retain the Fraction object as much as the -+ # smoothing method allows. -+ p_n = smoothing_function( -+ p_n, references=references, hypothesis=hypothesis, hyp_len=hyp_lengths -+ ) -+ -+ bleu_scores = [] -+ for weight in weights: -+ # Uniformly re-weighting based on maximum hypothesis lengths if largest -+ # order of n-grams < 4 and weights is set at default. -+ if auto_reweigh: -+ if hyp_lengths < 4 and weight == (0.25, 0.25, 0.25, 0.25): -+ weight = (1 / hyp_lengths,) * hyp_lengths -+ -+ s = (w_i * math.log(p_i) for w_i, p_i in zip(weight, p_n) if p_i > 0) -+ s = bp * math.exp(math.fsum(s)) -+ bleu_scores.append(s) -+ return bleu_scores[0] if len(weights) == 1 else bleu_scores -+ -+ -+def modified_precision(references, hypothesis, n): -+ """ -+ Calculate modified ngram precision. -+ -+ The normal precision method may lead to some wrong translations with -+ high-precision, e.g., the translation, in which a word of reference -+ repeats several times, has very high precision. -+ -+ This function only returns the Fraction object that contains the numerator -+ and denominator necessary to calculate the corpus-level precision. -+ To calculate the modified precision for a single pair of hypothesis and -+ references, cast the Fraction object into a float. -+ -+ The famous "the the the ... " example shows that you can get BLEU precision -+ by duplicating high frequency words. -+ -+ >>> reference1 = 'the cat is on the mat'.split() -+ >>> reference2 = 'there is a cat on the mat'.split() -+ >>> hypothesis1 = 'the the the the the the the'.split() -+ >>> references = [reference1, reference2] -+ >>> float(modified_precision(references, hypothesis1, n=1)) # doctest: +ELLIPSIS -+ 0.2857... -+ -+ In the modified n-gram precision, a reference word will be considered -+ exhausted after a matching hypothesis word is identified, e.g. -+ -+ >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', -+ ... 'ensures', 'that', 'the', 'military', 'will', -+ ... 'forever', 'heed', 'Party', 'commands'] -+ >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which', -+ ... 'guarantees', 'the', 'military', 'forces', 'always', -+ ... 'being', 'under', 'the', 'command', 'of', 'the', -+ ... 'Party'] -+ >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', -+ ... 'army', 'always', 'to', 'heed', 'the', 'directions', -+ ... 'of', 'the', 'party'] -+ >>> hypothesis = 'of the'.split() -+ >>> references = [reference1, reference2, reference3] -+ >>> float(modified_precision(references, hypothesis, n=1)) -+ 1.0 -+ >>> float(modified_precision(references, hypothesis, n=2)) -+ 1.0 -+ -+ An example of a normal machine translation hypothesis: -+ -+ >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', -+ ... 'ensures', 'that', 'the', 'military', 'always', -+ ... 'obeys', 'the', 'commands', 'of', 'the', 'party'] -+ -+ >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops', -+ ... 'forever', 'hearing', 'the', 'activity', 'guidebook', -+ ... 'that', 'party', 'direct'] -+ -+ >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', -+ ... 'ensures', 'that', 'the', 'military', 'will', -+ ... 'forever', 'heed', 'Party', 'commands'] -+ -+ >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which', -+ ... 'guarantees', 'the', 'military', 'forces', 'always', -+ ... 
'being', 'under', 'the', 'command', 'of', 'the', -+ ... 'Party'] -+ -+ >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', -+ ... 'army', 'always', 'to', 'heed', 'the', 'directions', -+ ... 'of', 'the', 'party'] -+ >>> references = [reference1, reference2, reference3] -+ >>> float(modified_precision(references, hypothesis1, n=1)) # doctest: +ELLIPSIS -+ 0.9444... -+ >>> float(modified_precision(references, hypothesis2, n=1)) # doctest: +ELLIPSIS -+ 0.5714... -+ >>> float(modified_precision(references, hypothesis1, n=2)) # doctest: +ELLIPSIS -+ 0.5882352941176471 -+ >>> float(modified_precision(references, hypothesis2, n=2)) # doctest: +ELLIPSIS -+ 0.07692... -+ -+ -+ :param references: A list of reference translations. -+ :type references: list(list(str)) -+ :param hypothesis: A hypothesis translation. -+ :type hypothesis: list(str) -+ :param n: The ngram order. -+ :type n: int -+ :return: BLEU's modified precision for the nth order ngram. -+ :rtype: Fraction -+ """ -+ # Extracts all ngrams in hypothesis -+ # Set an empty Counter if hypothesis is empty. -+ counts = Counter(ngrams(hypothesis, n)) if len(hypothesis) >= n else Counter() -+ # Extract a union of references' counts. -+ # max_counts = reduce(or_, [Counter(ngrams(ref, n)) for ref in references]) -+ max_counts = {} -+ for reference in references: -+ reference_counts = ( -+ Counter(ngrams(reference, n)) if len(reference) >= n else Counter() -+ ) -+ for ngram in counts: -+ max_counts[ngram] = max(max_counts.get(ngram, 0), reference_counts[ngram]) -+ -+ # Assigns the intersection between hypothesis and references' counts. -+ clipped_counts = { -+ ngram: min(count, max_counts[ngram]) for ngram, count in counts.items() -+ } -+ -+ numerator = sum(clipped_counts.values()) -+ # Ensures that denominator is minimum 1 to avoid ZeroDivisionError. -+ # Usually this happens when the ngram order is > len(reference). -+ denominator = max(1, sum(counts.values())) -+ -+ return Fraction(numerator, denominator, _normalize=False) -+ -+ -+def closest_ref_length(references, hyp_len): -+ """ -+ This function finds the reference that is the closest length to the -+ hypothesis. The closest reference length is referred to as *r* variable -+ from the brevity penalty formula in Papineni et. al. (2002) -+ -+ :param references: A list of reference translations. -+ :type references: list(list(str)) -+ :param hyp_len: The length of the hypothesis. -+ :type hyp_len: int -+ :return: The length of the reference that's closest to the hypothesis. -+ :rtype: int -+ """ -+ ref_lens = (len(reference) for reference in references) -+ closest_ref_len = min( -+ ref_lens, key=lambda ref_len: (abs(ref_len - hyp_len), ref_len) -+ ) -+ return closest_ref_len -+ -+ -+def brevity_penalty(closest_ref_len, hyp_len): -+ """ -+ Calculate brevity penalty. -+ -+ As the modified n-gram precision still has the problem from the short -+ length sentence, brevity penalty is used to modify the overall BLEU -+ score according to length. -+ -+ An example from the paper. There are three references with length 12, 15 -+ and 17. And a concise hypothesis of the length 12. The brevity penalty is 1. -+ -+ >>> reference1 = list('aaaaaaaaaaaa') # i.e. ['a'] * 12 -+ >>> reference2 = list('aaaaaaaaaaaaaaa') # i.e. ['a'] * 15 -+ >>> reference3 = list('aaaaaaaaaaaaaaaaa') # i.e. ['a'] * 17 -+ >>> hypothesis = list('aaaaaaaaaaaa') # i.e. 
['a'] * 12 -+ >>> references = [reference1, reference2, reference3] -+ >>> hyp_len = len(hypothesis) -+ >>> closest_ref_len = closest_ref_length(references, hyp_len) -+ >>> brevity_penalty(closest_ref_len, hyp_len) -+ 1.0 -+ -+ In case a hypothesis translation is shorter than the references, penalty is -+ applied. -+ -+ >>> references = [['a'] * 28, ['a'] * 28] -+ >>> hypothesis = ['a'] * 12 -+ >>> hyp_len = len(hypothesis) -+ >>> closest_ref_len = closest_ref_length(references, hyp_len) -+ >>> brevity_penalty(closest_ref_len, hyp_len) -+ 0.2635971381157267 -+ -+ The length of the closest reference is used to compute the penalty. If the -+ length of a hypothesis is 12, and the reference lengths are 13 and 2, the -+ penalty is applied because the hypothesis length (12) is less then the -+ closest reference length (13). -+ -+ >>> references = [['a'] * 13, ['a'] * 2] -+ >>> hypothesis = ['a'] * 12 -+ >>> hyp_len = len(hypothesis) -+ >>> closest_ref_len = closest_ref_length(references, hyp_len) -+ >>> brevity_penalty(closest_ref_len, hyp_len) # doctest: +ELLIPSIS -+ 0.9200... -+ -+ The brevity penalty doesn't depend on reference order. More importantly, -+ when two reference sentences are at the same distance, the shortest -+ reference sentence length is used. -+ -+ >>> references = [['a'] * 13, ['a'] * 11] -+ >>> hypothesis = ['a'] * 12 -+ >>> hyp_len = len(hypothesis) -+ >>> closest_ref_len = closest_ref_length(references, hyp_len) -+ >>> bp1 = brevity_penalty(closest_ref_len, hyp_len) -+ >>> hyp_len = len(hypothesis) -+ >>> closest_ref_len = closest_ref_length(reversed(references), hyp_len) -+ >>> bp2 = brevity_penalty(closest_ref_len, hyp_len) -+ >>> bp1 == bp2 == 1 -+ True -+ -+ A test example from mteval-v13a.pl (starting from the line 705): -+ -+ >>> references = [['a'] * 11, ['a'] * 8] -+ >>> hypothesis = ['a'] * 7 -+ >>> hyp_len = len(hypothesis) -+ >>> closest_ref_len = closest_ref_length(references, hyp_len) -+ >>> brevity_penalty(closest_ref_len, hyp_len) # doctest: +ELLIPSIS -+ 0.8668... -+ -+ >>> references = [['a'] * 11, ['a'] * 8, ['a'] * 6, ['a'] * 7] -+ >>> hypothesis = ['a'] * 7 -+ >>> hyp_len = len(hypothesis) -+ >>> closest_ref_len = closest_ref_length(references, hyp_len) -+ >>> brevity_penalty(closest_ref_len, hyp_len) -+ 1.0 -+ -+ :param hyp_len: The length of the hypothesis for a single sentence OR the -+ sum of all the hypotheses' lengths for a corpus -+ :type hyp_len: int -+ :param closest_ref_len: The length of the closest reference for a single -+ hypothesis OR the sum of all the closest references for every hypotheses. -+ :type closest_ref_len: int -+ :return: BLEU's brevity penalty. -+ :rtype: float -+ """ -+ if hyp_len > closest_ref_len: -+ return 1 -+ # If hypothesis is empty, brevity penalty = 0 should result in BLEU = 0.0 -+ elif hyp_len == 0: -+ return 0 -+ else: -+ return math.exp(1 - closest_ref_len / hyp_len) -+ -+ -+class SmoothingFunction: -+ """ -+ This is an implementation of the smoothing techniques -+ for segment-level BLEU scores that was presented in -+ Boxing Chen and Collin Cherry (2014) A Systematic Comparison of -+ Smoothing Techniques for Sentence-Level BLEU. In WMT14. -+ http://acl2014.org/acl2014/W14-33/pdf/W14-3346.pdf -+ """ -+ -+ def __init__(self, epsilon=0.1, alpha=5, k=5): -+ """ -+ This will initialize the parameters required for the various smoothing -+ techniques, the default values are set to the numbers used in the -+ experiments from Chen and Cherry (2014). 
-+ -+ >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', 'ensures', -+ ... 'that', 'the', 'military', 'always', 'obeys', 'the', -+ ... 'commands', 'of', 'the', 'party'] -+ >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', 'ensures', -+ ... 'that', 'the', 'military', 'will', 'forever', 'heed', -+ ... 'Party', 'commands'] -+ -+ >>> chencherry = SmoothingFunction() -+ >>> print(sentence_bleu([reference1], hypothesis1)) # doctest: +ELLIPSIS -+ 0.4118... -+ >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method0)) # doctest: +ELLIPSIS -+ 0.4118... -+ >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method1)) # doctest: +ELLIPSIS -+ 0.4118... -+ >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method2)) # doctest: +ELLIPSIS -+ 0.4452... -+ >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method3)) # doctest: +ELLIPSIS -+ 0.4118... -+ >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method4)) # doctest: +ELLIPSIS -+ 0.4118... -+ >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method5)) # doctest: +ELLIPSIS -+ 0.4905... -+ >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method6)) # doctest: +ELLIPSIS -+ 0.4135... -+ >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method7)) # doctest: +ELLIPSIS -+ 0.4905... -+ -+ :param epsilon: the epsilon value use in method 1 -+ :type epsilon: float -+ :param alpha: the alpha value use in method 6 -+ :type alpha: int -+ :param k: the k value use in method 4 -+ :type k: int -+ """ -+ self.epsilon = epsilon -+ self.alpha = alpha -+ self.k = k -+ -+ def method0(self, p_n, *args, **kwargs): -+ """ -+ No smoothing. -+ """ -+ p_n_new = [] -+ for i, p_i in enumerate(p_n): -+ if p_i.numerator != 0: -+ p_n_new.append(p_i) -+ else: -+ _msg = str( -+ "\nThe hypothesis contains 0 counts of {}-gram overlaps.\n" -+ "Therefore the BLEU score evaluates to 0, independently of\n" -+ "how many N-gram overlaps of lower order it contains.\n" -+ "Consider using lower n-gram order or use " -+ "SmoothingFunction()" -+ ).format(i + 1) -+ warnings.warn(_msg) -+ # When numerator==0 where denonminator==0 or !=0, the result -+ # for the precision score should be equal to 0 or undefined. -+ # Due to BLEU geometric mean computation in logarithm space, -+ # we we need to take the return sys.float_info.min such that -+ # math.log(sys.float_info.min) returns a 0 precision score. -+ p_n_new.append(sys.float_info.min) -+ return p_n_new -+ -+ def method1(self, p_n, *args, **kwargs): -+ """ -+ Smoothing method 1: Add *epsilon* counts to precision with 0 counts. -+ """ -+ return [ -+ (p_i.numerator + self.epsilon) / p_i.denominator -+ if p_i.numerator == 0 -+ else p_i -+ for p_i in p_n -+ ] -+ -+ def method2(self, p_n, *args, **kwargs): -+ """ -+ Smoothing method 2: Add 1 to both numerator and denominator from -+ Chin-Yew Lin and Franz Josef Och (2004) ORANGE: a Method for -+ Evaluating Automatic Evaluation Metrics for Machine Translation. -+ In COLING 2004. 
-+ """ -+ return [ -+ Fraction(p_n[i].numerator + 1, p_n[i].denominator + 1, _normalize=False) -+ if i != 0 -+ else p_n[0] -+ for i in range(len(p_n)) -+ ] -+ -+ def method3(self, p_n, *args, **kwargs): -+ """ -+ Smoothing method 3: NIST geometric sequence smoothing -+ The smoothing is computed by taking 1 / ( 2^k ), instead of 0, for each -+ precision score whose matching n-gram count is null. -+ k is 1 for the first 'n' value for which the n-gram match count is null/ -+ -+ For example, if the text contains: -+ -+ - one 2-gram match -+ - and (consequently) two 1-gram matches -+ -+ the n-gram count for each individual precision score would be: -+ -+ - n=1 => prec_count = 2 (two unigrams) -+ - n=2 => prec_count = 1 (one bigram) -+ - n=3 => prec_count = 1/2 (no trigram, taking 'smoothed' value of 1 / ( 2^k ), with k=1) -+ - n=4 => prec_count = 1/4 (no fourgram, taking 'smoothed' value of 1 / ( 2^k ), with k=2) -+ """ -+ incvnt = 1 # From the mteval-v13a.pl, it's referred to as k. -+ for i, p_i in enumerate(p_n): -+ if p_i.numerator == 0: -+ p_n[i] = 1 / (2**incvnt * p_i.denominator) -+ incvnt += 1 -+ return p_n -+ -+ def method4(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs): -+ """ -+ Smoothing method 4: -+ Shorter translations may have inflated precision values due to having -+ smaller denominators; therefore, we give them proportionally -+ smaller smoothed counts. Instead of scaling to 1/(2^k), Chen and Cherry -+ suggests dividing by 1/ln(len(T)), where T is the length of the translation. -+ """ -+ incvnt = 1 -+ hyp_len = hyp_len if hyp_len else len(hypothesis) -+ for i, p_i in enumerate(p_n): -+ if p_i.numerator == 0 and hyp_len > 1: -+ # incvnt = i + 1 * self.k / math.log( -+ # hyp_len -+ # ) # Note that this K is different from the K from NIST. -+ # p_n[i] = incvnt / p_i.denominator\ -+ numerator = 1 / (2**incvnt * self.k / math.log(hyp_len)) -+ p_n[i] = numerator / p_i.denominator -+ incvnt += 1 -+ return p_n -+ -+ def method5(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs): -+ """ -+ Smoothing method 5: -+ The matched counts for similar values of n should be similar. To a -+ calculate the n-gram matched count, it averages the n−1, n and n+1 gram -+ matched counts. -+ """ -+ hyp_len = hyp_len if hyp_len else len(hypothesis) -+ m = {} -+ # Requires an precision value for an addition ngram order. -+ p_n_plus1 = p_n + [modified_precision(references, hypothesis, 5)] -+ m[-1] = p_n[0] + 1 -+ for i, p_i in enumerate(p_n): -+ p_n[i] = (m[i - 1] + p_i + p_n_plus1[i + 1]) / 3 -+ m[i] = p_n[i] -+ return p_n -+ -+ def method6(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs): -+ """ -+ Smoothing method 6: -+ Interpolates the maximum likelihood estimate of the precision *p_n* with -+ a prior estimate *pi0*. The prior is estimated by assuming that the ratio -+ between pn and pn−1 will be the same as that between pn−1 and pn−2; from -+ Gao and He (2013) Training MRF-Based Phrase Translation Models using -+ Gradient Ascent. In NAACL. -+ """ -+ hyp_len = hyp_len if hyp_len else len(hypothesis) -+ # This smoothing only works when p_1 and p_2 is non-zero. -+ # Raise an error with an appropriate message when the input is too short -+ # to use this smoothing technique. -+ assert p_n[2], "This smoothing method requires non-zero precision for bigrams." -+ for i, p_i in enumerate(p_n): -+ if i in [0, 1]: # Skips the first 2 orders of ngrams. -+ continue -+ else: -+ pi0 = 0 if p_n[i - 2] == 0 else p_n[i - 1] ** 2 / p_n[i - 2] -+ # No. 
of ngrams in translation that matches the reference. -+ m = p_i.numerator -+ # No. of ngrams in translation. -+ l = sum(1 for _ in ngrams(hypothesis, i + 1)) -+ # Calculates the interpolated precision. -+ p_n[i] = (m + self.alpha * pi0) / (l + self.alpha) -+ return p_n -+ -+ def method7(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs): -+ """ -+ Smoothing method 7: -+ Interpolates methods 4 and 5. -+ """ -+ hyp_len = hyp_len if hyp_len else len(hypothesis) -+ p_n = self.method4(p_n, references, hypothesis, hyp_len) -+ p_n = self.method5(p_n, references, hypothesis, hyp_len) -+ return p_n -Index: nltk-3.8.1/README.md -=================================================================== ---- nltk-3.8.1.orig/README.md -+++ nltk-3.8.1/README.md -@@ -1,50 +1,50 @@ --# Natural Language Toolkit (NLTK) --[![PyPI](https://img.shields.io/pypi/v/nltk.svg)](https://pypi.python.org/pypi/nltk) --![CI](https://github.com/nltk/nltk/actions/workflows/ci.yaml/badge.svg?branch=develop) -- --NLTK -- the Natural Language Toolkit -- is a suite of open source Python --modules, data sets, and tutorials supporting research and development in Natural --Language Processing. NLTK requires Python version 3.7, 3.8, 3.9, 3.10 or 3.11. -- --For documentation, please visit [nltk.org](https://www.nltk.org/). -- -- --## Contributing -- --Do you want to contribute to NLTK development? Great! --Please read [CONTRIBUTING.md](CONTRIBUTING.md) for more details. -- --See also [how to contribute to NLTK](https://www.nltk.org/contribute.html). -- -- --## Donate -- --Have you found the toolkit helpful? Please support NLTK development by donating --to the project via PayPal, using the link on the NLTK homepage. -- -- --## Citing -- --If you publish work that uses NLTK, please cite the NLTK book, as follows: -- -- Bird, Steven, Edward Loper and Ewan Klein (2009). -- Natural Language Processing with Python. O'Reilly Media Inc. -- -- --## Copyright -- --Copyright (C) 2001-2023 NLTK Project -- --For license information, see [LICENSE.txt](LICENSE.txt). -- --[AUTHORS.md](AUTHORS.md) contains a list of everyone who has contributed to NLTK. -- -- --### Redistributing -- --- NLTK source code is distributed under the Apache 2.0 License. --- NLTK documentation is distributed under the Creative Commons -- Attribution-Noncommercial-No Derivative Works 3.0 United States license. --- NLTK corpora are provided under the terms given in the README file for each -- corpus; all are redistributable and available for non-commercial use. --- NLTK may be freely redistributed, subject to the provisions of these licenses. -+# Natural Language Toolkit (NLTK) -+[![PyPI](https://img.shields.io/pypi/v/nltk.svg)](https://pypi.python.org/pypi/nltk) -+![CI](https://github.com/nltk/nltk/actions/workflows/ci.yaml/badge.svg?branch=develop) -+ -+NLTK -- the Natural Language Toolkit -- is a suite of open source Python -+modules, data sets, and tutorials supporting research and development in Natural -+Language Processing. NLTK requires Python version 3.7, 3.8, 3.9, 3.10, 3.11 or 3.12. -+ -+For documentation, please visit [nltk.org](https://www.nltk.org/). -+ -+ -+## Contributing -+ -+Do you want to contribute to NLTK development? Great! -+Please read [CONTRIBUTING.md](CONTRIBUTING.md) for more details. -+ -+See also [how to contribute to NLTK](https://www.nltk.org/contribute.html). -+ -+ -+## Donate -+ -+Have you found the toolkit helpful? Please support NLTK development by donating -+to the project via PayPal, using the link on the NLTK homepage. 
-+ -+ -+## Citing -+ -+If you publish work that uses NLTK, please cite the NLTK book, as follows: -+ -+ Bird, Steven, Edward Loper and Ewan Klein (2009). -+ Natural Language Processing with Python. O'Reilly Media Inc. -+ -+ -+## Copyright -+ -+Copyright (C) 2001-2023 NLTK Project -+ -+For license information, see [LICENSE.txt](LICENSE.txt). -+ -+[AUTHORS.md](AUTHORS.md) contains a list of everyone who has contributed to NLTK. -+ -+ -+### Redistributing -+ -+- NLTK source code is distributed under the Apache 2.0 License. -+- NLTK documentation is distributed under the Creative Commons -+ Attribution-Noncommercial-No Derivative Works 3.0 United States license. -+- NLTK corpora are provided under the terms given in the README file for each -+ corpus; all are redistributable and available for non-commercial use. -+- NLTK may be freely redistributed, subject to the provisions of these licenses. -Index: nltk-3.8.1/setup.py -=================================================================== ---- nltk-3.8.1.orig/setup.py -+++ nltk-3.8.1/setup.py -@@ -1,125 +1,126 @@ --#!/usr/bin/env python --# --# Setup script for the Natural Language Toolkit --# --# Copyright (C) 2001-2023 NLTK Project --# Author: NLTK Team --# URL: --# For license information, see LICENSE.TXT -- --# Work around mbcs bug in distutils. --# https://bugs.python.org/issue10945 --import codecs -- --try: -- codecs.lookup("mbcs") --except LookupError: -- ascii = codecs.lookup("ascii") -- func = lambda name, enc=ascii: {True: enc}.get(name == "mbcs") -- codecs.register(func) -- --import os -- --# Use the VERSION file to get NLTK version --version_file = os.path.join(os.path.dirname(__file__), "nltk", "VERSION") --with open(version_file) as fh: -- nltk_version = fh.read().strip() -- --# setuptools --from setuptools import find_packages, setup -- --# Specify groups of optional dependencies --extras_require = { -- "machine_learning": [ -- "numpy", -- "python-crfsuite", -- "scikit-learn", -- "scipy", -- ], -- "plot": ["matplotlib"], -- "tgrep": ["pyparsing"], -- "twitter": ["twython"], -- "corenlp": ["requests"], --} -- --# Add a group made up of all optional dependencies --extras_require["all"] = { -- package for group in extras_require.values() for package in group --} -- --# Adds CLI commands --console_scripts = """ --[console_scripts] --nltk=nltk.cli:cli --""" -- --_project_homepage = "https://www.nltk.org/" -- --setup( -- name="nltk", -- description="Natural Language Toolkit", -- version=nltk_version, -- url=_project_homepage, -- project_urls={ -- "Documentation": _project_homepage, -- "Source Code": "https://github.com/nltk/nltk", -- "Issue Tracker": "https://github.com/nltk/nltk/issues", -- }, -- long_description="""\ --The Natural Language Toolkit (NLTK) is a Python package for --natural language processing. 
NLTK requires Python 3.7, 3.8, 3.9, 3.10 or 3.11.""", -- license="Apache License, Version 2.0", -- keywords=[ -- "NLP", -- "CL", -- "natural language processing", -- "computational linguistics", -- "parsing", -- "tagging", -- "tokenizing", -- "syntax", -- "linguistics", -- "language", -- "natural language", -- "text analytics", -- ], -- maintainer="NLTK Team", -- maintainer_email="nltk.team@gmail.com", -- author="NLTK Team", -- author_email="nltk.team@gmail.com", -- classifiers=[ -- "Development Status :: 5 - Production/Stable", -- "Intended Audience :: Developers", -- "Intended Audience :: Education", -- "Intended Audience :: Information Technology", -- "Intended Audience :: Science/Research", -- "License :: OSI Approved :: Apache Software License", -- "Operating System :: OS Independent", -- "Programming Language :: Python :: 3.7", -- "Programming Language :: Python :: 3.8", -- "Programming Language :: Python :: 3.9", -- "Programming Language :: Python :: 3.10", -- "Programming Language :: Python :: 3.11", -- "Topic :: Scientific/Engineering", -- "Topic :: Scientific/Engineering :: Artificial Intelligence", -- "Topic :: Scientific/Engineering :: Human Machine Interfaces", -- "Topic :: Scientific/Engineering :: Information Analysis", -- "Topic :: Text Processing", -- "Topic :: Text Processing :: Filters", -- "Topic :: Text Processing :: General", -- "Topic :: Text Processing :: Indexing", -- "Topic :: Text Processing :: Linguistic", -- ], -- package_data={"nltk": ["test/*.doctest", "VERSION"]}, -- python_requires=">=3.7", -- install_requires=[ -- "click", -- "joblib", -- "regex>=2021.8.3", -- "tqdm", -- ], -- extras_require=extras_require, -- packages=find_packages(), -- zip_safe=False, # since normal files will be present too? -- entry_points=console_scripts, --) -+#!/usr/bin/env python -+# -+# Setup script for the Natural Language Toolkit -+# -+# Copyright (C) 2001-2023 NLTK Project -+# Author: NLTK Team -+# URL: -+# For license information, see LICENSE.TXT -+ -+# Work around mbcs bug in distutils. 
-+# https://bugs.python.org/issue10945 -+import codecs -+ -+try: -+ codecs.lookup("mbcs") -+except LookupError: -+ ascii = codecs.lookup("ascii") -+ func = lambda name, enc=ascii: {True: enc}.get(name == "mbcs") -+ codecs.register(func) -+ -+import os -+ -+# Use the VERSION file to get NLTK version -+version_file = os.path.join(os.path.dirname(__file__), "nltk", "VERSION") -+with open(version_file) as fh: -+ nltk_version = fh.read().strip() -+ -+# setuptools -+from setuptools import find_packages, setup -+ -+# Specify groups of optional dependencies -+extras_require = { -+ "machine_learning": [ -+ "numpy", -+ "python-crfsuite", -+ "scikit-learn", -+ "scipy", -+ ], -+ "plot": ["matplotlib"], -+ "tgrep": ["pyparsing"], -+ "twitter": ["twython"], -+ "corenlp": ["requests"], -+} -+ -+# Add a group made up of all optional dependencies -+extras_require["all"] = { -+ package for group in extras_require.values() for package in group -+} -+ -+# Adds CLI commands -+console_scripts = """ -+[console_scripts] -+nltk=nltk.cli:cli -+""" -+ -+_project_homepage = "https://www.nltk.org/" -+ -+setup( -+ name="nltk", -+ description="Natural Language Toolkit", -+ version=nltk_version, -+ url=_project_homepage, -+ project_urls={ -+ "Documentation": _project_homepage, -+ "Source Code": "https://github.com/nltk/nltk", -+ "Issue Tracker": "https://github.com/nltk/nltk/issues", -+ }, -+ long_description="""\ -+The Natural Language Toolkit (NLTK) is a Python package for + def sentence_bleu( + references, + hypothesis, +--- a/setup.py ++++ b/setup.py +@@ -67,7 +67,7 @@ setup( + }, + long_description="""\ + The Natural Language Toolkit (NLTK) is a Python package for +-natural language processing. NLTK requires Python 3.7, 3.8, 3.9, 3.10 or 3.11.""", +natural language processing. NLTK requires Python 3.7, 3.8, 3.9, 3.10, 3.11 or 3.12.""", -+ license="Apache License, Version 2.0", -+ keywords=[ -+ "NLP", -+ "CL", -+ "natural language processing", -+ "computational linguistics", -+ "parsing", -+ "tagging", -+ "tokenizing", -+ "syntax", -+ "linguistics", -+ "language", -+ "natural language", -+ "text analytics", -+ ], -+ maintainer="NLTK Team", -+ maintainer_email="nltk.team@gmail.com", -+ author="NLTK Team", -+ author_email="nltk.team@gmail.com", -+ classifiers=[ -+ "Development Status :: 5 - Production/Stable", -+ "Intended Audience :: Developers", -+ "Intended Audience :: Education", -+ "Intended Audience :: Information Technology", -+ "Intended Audience :: Science/Research", -+ "License :: OSI Approved :: Apache Software License", -+ "Operating System :: OS Independent", -+ "Programming Language :: Python :: 3.7", -+ "Programming Language :: Python :: 3.8", -+ "Programming Language :: Python :: 3.9", -+ "Programming Language :: Python :: 3.10", -+ "Programming Language :: Python :: 3.11", + license="Apache License, Version 2.0", + keywords=[ + "NLP", +@@ -100,6 +100,7 @@ natural language processing. 
NLTK requi
+ "Programming Language :: Python :: 3.9",
+ "Programming Language :: Python :: 3.10",
+ "Programming Language :: Python :: 3.11",
++ "Programming Language :: Python :: 3.12",
+ "Topic :: Scientific/Engineering",
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
+ "Topic :: Scientific/Engineering :: Human Machine Interfaces",
diff --git a/python-nltk.changes b/python-nltk.changes
index e4d945d..2b401ed 100644
--- a/python-nltk.changes
+++ b/python-nltk.changes
@@ -1,3 +1,14 @@
+-------------------------------------------------------------------
+Mon Jul 1 21:02:45 UTC 2024 - Matej Cepl
+
+- Use the tarball from GitHub instead of the Zip archive from PyPI;
+  the latter has a very messy combination of CRLF and LF EOLs,
+  which makes it hard to patch.
+- Refresh all patches from the original locations.
+- Add CVE-2024-39705-disable-download.patch as a crude
+  workaround for CVE-2024-39705 (gh#nltk/nltk#3266,
+  bsc#1227174).
+
 -------------------------------------------------------------------
 Thu Mar 21 17:41:52 UTC 2024 - Ben Greiner
diff --git a/python-nltk.spec b/python-nltk.spec
index c5ecdeb..21b8297 100644
--- a/python-nltk.spec
+++ b/python-nltk.spec
@@ -16,6 +16,7 @@
 #
+%define modname nltk
 Name: python-nltk
 Version: 3.8.1
 Release: 0
@@ -23,7 +24,7 @@ Summary: Natural Language Toolkit
 License: Apache-2.0
 URL: http://nltk.org/
 # SourceRepository: https://github.com/nltk/nltk
-Source0: https://files.pythonhosted.org/packages/source/n/nltk/nltk-%{version}.zip
+Source0: https://github.com/nltk/%{modname}/archive/refs/tags/%{version}.tar.gz#/%{modname}-%{version}.tar.gz
 # Download/Update NLTK data:
 # quilt setup python-nltk.spec
 # pushd nltk-?.?.?
@@ -62,6 +63,9 @@ Source99: python-nltk.rpmlintrc
 Patch0: skip-networked-test.patch
 # PATCH-FIX-UPSTREAM nltk-pr3207-py312.patch gh#nltk/nltk#3207
 Patch1: nltk-pr3207-py312.patch
+# PATCH-FIX-UPSTREAM CVE-2024-39705-disable-download.patch bsc#1227174 mcepl@suse.com
+# Crude workaround for CVE-2024-39705: refuse to load data from pickles (gh#nltk/nltk#3266)
+Patch2: CVE-2024-39705-disable-download.patch
 BuildRequires: %{python_module base >= 3.7}
 BuildRequires: %{python_module pip}
 BuildRequires: %{python_module setuptools}
@@ -118,7 +122,7 @@ Python modules, data sets and tutorials supporting research and development in Natural Language Processing.
 %prep
-%autosetup -p1 -a1 -n nltk-%{version}
+%setup -q -a1 -n %{modname}-%{version}
 # Fix EOL
 sed -i 's/\r/\n/g; s/\n$//' \
@@ -150,6 +154,8 @@ sed -E -i "s|#![[:space:]]*%{_bindir}/env python|#!%{_bindir}/python3|" \
 nltk_data/corpora/pl196x/splitter.py \
 tools/find_deprecated.py
+%autopatch -p1
+
 %build
 %pyproject_wheel
@@ -164,7 +170,12 @@ chmod -x %{buildroot}%{$python_sitelib}/nltk/test/dependency.doctest
 %check
 export NLTK_DATA=$(readlink -f ./nltk_data/)
 # export PYTEST_ADDOPTS="--doctest-modules"
-%pytest -k 'not network'
+# Skip tests requiring pickle.load gh#nltk/nltk#3266 (CVE-2024-39705)
+skip_tests=" or test_basic or test_increment or test_pad_asterisk or test_pad_dotdot"
+skip_tests+=" or test_pos_tag_eng or test_pos_tag_eng_universal or test_pos_tag_rus"
+skip_tests+=" or test_pos_tag_rus_universal or test_pos_tag_unknown_lang"
+skip_tests+=" or test_sent_tokenize or test_unspecified_lang or test_word_tokenize"
+%pytest -k "not (network ${skip_tests})"
 %post
 %python_install_alternative nltk
diff --git a/skip-networked-test.patch b/skip-networked-test.patch
index 82d62af..f1cd8f7 100644
--- a/skip-networked-test.patch
+++ b/skip-networked-test.patch
@@ -6,30 +6,30 @@
 --- a/nltk/test/unit/test_downloader.py
 +++ b/nltk/test/unit/test_downloader.py
 @@ -1,6 +1,9 @@
- from nltk import download
- 
-+import pytest
- 
-+
-+@pytest.mark.network
- def test_downloader_using_existing_parent_download_dir(tmp_path):
- """Test that download works properly when the parent folder of the download_dir exists"""
- 
+ from nltk import download
+ 
++import pytest
+ 
++
++@pytest.mark.network
+ def test_downloader_using_existing_parent_download_dir(tmp_path):
+ """Test that download works properly when the parent folder of the download_dir exists"""
+ 
 @@ -9,6 +12,7 @@ def test_downloader_using_existing_paren
- assert download_status is True
- 
- 
-+@pytest.mark.network
- def test_downloader_using_non_existing_parent_download_dir(tmp_path):
- """Test that download works properly when the parent folder of the download_dir does not exist"""
- 
+ assert download_status is True
+ 
+ 
++@pytest.mark.network
+ def test_downloader_using_non_existing_parent_download_dir(tmp_path):
+ """Test that download works properly when the parent folder of the download_dir does not exist"""
+ 
 --- a/setup.cfg
 +++ b/setup.cfg
 @@ -1,3 +1,7 @@
-+[tool:pytest]
-+markers =
-+ network: test case requires network connection
-+
- [metadata]
- license_files =
- LICENSE.txt
++[tool:pytest]
++markers =
++ network: test case requires network connection
++
+ [metadata]
+ license_files =
+ LICENSE.txt