From f64d1a206ea63c11ac880c96350251fefafe8f2c86e5fabbafec58da91fa711f Mon Sep 17 00:00:00 2001 From: Daniel Garcia Date: Fri, 22 Mar 2024 06:59:49 +0000 Subject: [PATCH] Accepting request 1160467 from home:bnavigator:branches:devel:languages:python - Update to 3.8.1 * Resolve RCE & XSS vulnerabilities in localhost WordNet Browser * Add Python 3.11 support - Update nltk_data archive - Drop port-2to3.patch - Add nltk-pr3207-py312.patch for Python 3.12 support * gh#nltk/nltk#3207 OBS-URL: https://build.opensuse.org/request/show/1160467 OBS-URL: https://build.opensuse.org/package/show/devel:languages:python/python-nltk?expand=0&rev=45 --- nltk-3.8.1.zip | 3 + nltk-3.8.zip | 3 - nltk-pr3207-py312.patch | 1782 +++++++++++++++++++++++++++++++++++++++ nltk_data.tar.xz | 4 +- port-2to3.patch | 48 -- python-nltk.changes | 11 + python-nltk.rpmlintrc | 3 +- python-nltk.spec | 89 +- 8 files changed, 1858 insertions(+), 85 deletions(-) create mode 100644 nltk-3.8.1.zip delete mode 100644 nltk-3.8.zip create mode 100644 nltk-pr3207-py312.patch delete mode 100644 port-2to3.patch diff --git a/nltk-3.8.1.zip b/nltk-3.8.1.zip new file mode 100644 index 0000000..79456a9 --- /dev/null +++ b/nltk-3.8.1.zip @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1834da3d0682cba4f2cede2f9aad6b0fafb6461ba451db0efb6f9c39798d64d3 +size 4620388 diff --git a/nltk-3.8.zip b/nltk-3.8.zip deleted file mode 100644 index c472fa3..0000000 --- a/nltk-3.8.zip +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:74b30826a37d78d53427105bbd037dd880251be269fca64ee530838a46ed55fc -size 4619825 diff --git a/nltk-pr3207-py312.patch b/nltk-pr3207-py312.patch new file mode 100644 index 0000000..75f2bff --- /dev/null +++ b/nltk-pr3207-py312.patch @@ -0,0 +1,1782 @@ +From 25d35fc4283dedd2053ec6d821f4b707fff8d72c Mon Sep 17 00:00:00 2001 +From: Konstantin Chernyshev +Date: Thu, 16 Nov 2023 19:00:15 +0100 +Subject: [PATCH 1/8] ci: enable 3.12 in ci tests + +--- + .github/workflows/ci.yaml | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +Index: nltk-3.8.1/nltk/test/unit/translate/test_bleu.py +=================================================================== +--- nltk-3.8.1.orig/nltk/test/unit/translate/test_bleu.py ++++ nltk-3.8.1/nltk/test/unit/translate/test_bleu.py +@@ -2,7 +2,6 @@ + Tests for BLEU translation evaluation metric + """ + +-import io + import unittest + + from nltk.data import find +Index: nltk-3.8.1/nltk/translate/bleu_score.py +=================================================================== +--- nltk-3.8.1.orig/nltk/translate/bleu_score.py ++++ nltk-3.8.1/nltk/translate/bleu_score.py +@@ -1,685 +1,710 @@ +-# Natural Language Toolkit: BLEU Score +-# +-# Copyright (C) 2001-2023 NLTK Project +-# Authors: Chin Yee Lee, Hengfeng Li, Ruxin Hou, Calvin Tanujaya Lim +-# Contributors: Björn Mattsson, Dmitrijs Milajevs, Liling Tan +-# URL: +-# For license information, see LICENSE.TXT +- +-"""BLEU score implementation.""" +- +-import math +-import sys +-import warnings +-from collections import Counter +-from fractions import Fraction +- +-from nltk.util import ngrams +- +- +-def sentence_bleu( +- references, +- hypothesis, +- weights=(0.25, 0.25, 0.25, 0.25), +- smoothing_function=None, +- auto_reweigh=False, +-): +- """ +- Calculate BLEU score (Bilingual Evaluation Understudy) from +- Papineni, Kishore, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002. +- "BLEU: a method for automatic evaluation of machine translation." +- In Proceedings of ACL. 
https://www.aclweb.org/anthology/P02-1040.pdf +- +- >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', +- ... 'ensures', 'that', 'the', 'military', 'always', +- ... 'obeys', 'the', 'commands', 'of', 'the', 'party'] +- +- >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops', +- ... 'forever', 'hearing', 'the', 'activity', 'guidebook', +- ... 'that', 'party', 'direct'] +- +- >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', +- ... 'ensures', 'that', 'the', 'military', 'will', 'forever', +- ... 'heed', 'Party', 'commands'] +- +- >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which', +- ... 'guarantees', 'the', 'military', 'forces', 'always', +- ... 'being', 'under', 'the', 'command', 'of', 'the', +- ... 'Party'] +- +- >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', +- ... 'army', 'always', 'to', 'heed', 'the', 'directions', +- ... 'of', 'the', 'party'] +- +- >>> sentence_bleu([reference1, reference2, reference3], hypothesis1) # doctest: +ELLIPSIS +- 0.5045... +- +- If there is no ngrams overlap for any order of n-grams, BLEU returns the +- value 0. This is because the precision for the order of n-grams without +- overlap is 0, and the geometric mean in the final BLEU score computation +- multiplies the 0 with the precision of other n-grams. This results in 0 +- (independently of the precision of the other n-gram orders). The following +- example has zero 3-gram and 4-gram overlaps: +- +- >>> round(sentence_bleu([reference1, reference2, reference3], hypothesis2),4) # doctest: +ELLIPSIS +- 0.0 +- +- To avoid this harsh behaviour when no ngram overlaps are found a smoothing +- function can be used. +- +- >>> chencherry = SmoothingFunction() +- >>> sentence_bleu([reference1, reference2, reference3], hypothesis2, +- ... smoothing_function=chencherry.method1) # doctest: +ELLIPSIS +- 0.0370... +- +- The default BLEU calculates a score for up to 4-grams using uniform +- weights (this is called BLEU-4). To evaluate your translations with +- higher/lower order ngrams, use customized weights. E.g. when accounting +- for up to 5-grams with uniform weights (this is called BLEU-5) use: +- +- >>> weights = (1./5., 1./5., 1./5., 1./5., 1./5.) +- >>> sentence_bleu([reference1, reference2, reference3], hypothesis1, weights) # doctest: +ELLIPSIS +- 0.3920... +- +- Multiple BLEU scores can be computed at once, by supplying a list of weights. +- E.g. for computing BLEU-2, BLEU-3 *and* BLEU-4 in one computation, use: +- >>> weights = [ +- ... (1./2., 1./2.), +- ... (1./3., 1./3., 1./3.), +- ... (1./4., 1./4., 1./4., 1./4.) +- ... ] +- >>> sentence_bleu([reference1, reference2, reference3], hypothesis1, weights) # doctest: +ELLIPSIS +- [0.7453..., 0.6240..., 0.5045...] +- +- :param references: reference sentences +- :type references: list(list(str)) +- :param hypothesis: a hypothesis sentence +- :type hypothesis: list(str) +- :param weights: weights for unigrams, bigrams, trigrams and so on (one or a list of weights) +- :type weights: tuple(float) / list(tuple(float)) +- :param smoothing_function: +- :type smoothing_function: SmoothingFunction +- :param auto_reweigh: Option to re-normalize the weights uniformly. +- :type auto_reweigh: bool +- :return: The sentence-level BLEU score. Returns a list if multiple weights were supplied. 
+- :rtype: float / list(float) +- """ +- return corpus_bleu( +- [references], [hypothesis], weights, smoothing_function, auto_reweigh +- ) +- +- +-def corpus_bleu( +- list_of_references, +- hypotheses, +- weights=(0.25, 0.25, 0.25, 0.25), +- smoothing_function=None, +- auto_reweigh=False, +-): +- """ +- Calculate a single corpus-level BLEU score (aka. system-level BLEU) for all +- the hypotheses and their respective references. +- +- Instead of averaging the sentence level BLEU scores (i.e. macro-average +- precision), the original BLEU metric (Papineni et al. 2002) accounts for +- the micro-average precision (i.e. summing the numerators and denominators +- for each hypothesis-reference(s) pairs before the division). +- +- >>> hyp1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', +- ... 'ensures', 'that', 'the', 'military', 'always', +- ... 'obeys', 'the', 'commands', 'of', 'the', 'party'] +- >>> ref1a = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', +- ... 'ensures', 'that', 'the', 'military', 'will', 'forever', +- ... 'heed', 'Party', 'commands'] +- >>> ref1b = ['It', 'is', 'the', 'guiding', 'principle', 'which', +- ... 'guarantees', 'the', 'military', 'forces', 'always', +- ... 'being', 'under', 'the', 'command', 'of', 'the', 'Party'] +- >>> ref1c = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', +- ... 'army', 'always', 'to', 'heed', 'the', 'directions', +- ... 'of', 'the', 'party'] +- +- >>> hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was', +- ... 'interested', 'in', 'world', 'history'] +- >>> ref2a = ['he', 'was', 'interested', 'in', 'world', 'history', +- ... 'because', 'he', 'read', 'the', 'book'] +- +- >>> list_of_references = [[ref1a, ref1b, ref1c], [ref2a]] +- >>> hypotheses = [hyp1, hyp2] +- >>> corpus_bleu(list_of_references, hypotheses) # doctest: +ELLIPSIS +- 0.5920... +- +- The example below show that corpus_bleu() is different from averaging +- sentence_bleu() for hypotheses +- +- >>> score1 = sentence_bleu([ref1a, ref1b, ref1c], hyp1) +- >>> score2 = sentence_bleu([ref2a], hyp2) +- >>> (score1 + score2) / 2 # doctest: +ELLIPSIS +- 0.6223... +- +- Custom weights may be supplied to fine-tune the BLEU score further. +- A tuple of float weights for unigrams, bigrams, trigrams and so on can be given. +- >>> weights = (0.1, 0.3, 0.5, 0.1) +- >>> corpus_bleu(list_of_references, hypotheses, weights=weights) # doctest: +ELLIPSIS +- 0.5818... +- +- This particular weight gave extra value to trigrams. +- Furthermore, multiple weights can be given, resulting in multiple BLEU scores. +- >>> weights = [ +- ... (0.5, 0.5), +- ... (0.333, 0.333, 0.334), +- ... (0.25, 0.25, 0.25, 0.25), +- ... (0.2, 0.2, 0.2, 0.2, 0.2) +- ... ] +- >>> corpus_bleu(list_of_references, hypotheses, weights=weights) # doctest: +ELLIPSIS +- [0.8242..., 0.7067..., 0.5920..., 0.4719...] +- +- :param list_of_references: a corpus of lists of reference sentences, w.r.t. hypotheses +- :type list_of_references: list(list(list(str))) +- :param hypotheses: a list of hypothesis sentences +- :type hypotheses: list(list(str)) +- :param weights: weights for unigrams, bigrams, trigrams and so on (one or a list of weights) +- :type weights: tuple(float) / list(tuple(float)) +- :param smoothing_function: +- :type smoothing_function: SmoothingFunction +- :param auto_reweigh: Option to re-normalize the weights uniformly. +- :type auto_reweigh: bool +- :return: The corpus-level BLEU score. +- :rtype: float +- """ +- # Before proceeding to compute BLEU, perform sanity checks. 
+- +- p_numerators = Counter() # Key = ngram order, and value = no. of ngram matches. +- p_denominators = Counter() # Key = ngram order, and value = no. of ngram in ref. +- hyp_lengths, ref_lengths = 0, 0 +- +- assert len(list_of_references) == len(hypotheses), ( +- "The number of hypotheses and their reference(s) should be the " "same " +- ) +- +- try: +- weights[0][0] +- except TypeError: +- weights = [weights] +- max_weight_length = max(len(weight) for weight in weights) +- +- # Iterate through each hypothesis and their corresponding references. +- for references, hypothesis in zip(list_of_references, hypotheses): +- # For each order of ngram, calculate the numerator and +- # denominator for the corpus-level modified precision. +- for i in range(1, max_weight_length + 1): +- p_i = modified_precision(references, hypothesis, i) +- p_numerators[i] += p_i.numerator +- p_denominators[i] += p_i.denominator +- +- # Calculate the hypothesis length and the closest reference length. +- # Adds them to the corpus-level hypothesis and reference counts. +- hyp_len = len(hypothesis) +- hyp_lengths += hyp_len +- ref_lengths += closest_ref_length(references, hyp_len) +- +- # Calculate corpus-level brevity penalty. +- bp = brevity_penalty(ref_lengths, hyp_lengths) +- +- # Collects the various precision values for the different ngram orders. +- p_n = [ +- Fraction(p_numerators[i], p_denominators[i], _normalize=False) +- for i in range(1, max_weight_length + 1) +- ] +- +- # Returns 0 if there's no matching n-grams +- # We only need to check for p_numerators[1] == 0, since if there's +- # no unigrams, there won't be any higher order ngrams. +- if p_numerators[1] == 0: +- return 0 if len(weights) == 1 else [0] * len(weights) +- +- # If there's no smoothing, set use method0 from SmoothinFunction class. +- if not smoothing_function: +- smoothing_function = SmoothingFunction().method0 +- # Smoothen the modified precision. +- # Note: smoothing_function() may convert values into floats; +- # it tries to retain the Fraction object as much as the +- # smoothing method allows. +- p_n = smoothing_function( +- p_n, references=references, hypothesis=hypothesis, hyp_len=hyp_lengths +- ) +- +- bleu_scores = [] +- for weight in weights: +- # Uniformly re-weighting based on maximum hypothesis lengths if largest +- # order of n-grams < 4 and weights is set at default. +- if auto_reweigh: +- if hyp_lengths < 4 and weight == (0.25, 0.25, 0.25, 0.25): +- weight = (1 / hyp_lengths,) * hyp_lengths +- +- s = (w_i * math.log(p_i) for w_i, p_i in zip(weight, p_n) if p_i > 0) +- s = bp * math.exp(math.fsum(s)) +- bleu_scores.append(s) +- return bleu_scores[0] if len(weights) == 1 else bleu_scores +- +- +-def modified_precision(references, hypothesis, n): +- """ +- Calculate modified ngram precision. +- +- The normal precision method may lead to some wrong translations with +- high-precision, e.g., the translation, in which a word of reference +- repeats several times, has very high precision. +- +- This function only returns the Fraction object that contains the numerator +- and denominator necessary to calculate the corpus-level precision. +- To calculate the modified precision for a single pair of hypothesis and +- references, cast the Fraction object into a float. +- +- The famous "the the the ... " example shows that you can get BLEU precision +- by duplicating high frequency words. 
+- +- >>> reference1 = 'the cat is on the mat'.split() +- >>> reference2 = 'there is a cat on the mat'.split() +- >>> hypothesis1 = 'the the the the the the the'.split() +- >>> references = [reference1, reference2] +- >>> float(modified_precision(references, hypothesis1, n=1)) # doctest: +ELLIPSIS +- 0.2857... +- +- In the modified n-gram precision, a reference word will be considered +- exhausted after a matching hypothesis word is identified, e.g. +- +- >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', +- ... 'ensures', 'that', 'the', 'military', 'will', +- ... 'forever', 'heed', 'Party', 'commands'] +- >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which', +- ... 'guarantees', 'the', 'military', 'forces', 'always', +- ... 'being', 'under', 'the', 'command', 'of', 'the', +- ... 'Party'] +- >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', +- ... 'army', 'always', 'to', 'heed', 'the', 'directions', +- ... 'of', 'the', 'party'] +- >>> hypothesis = 'of the'.split() +- >>> references = [reference1, reference2, reference3] +- >>> float(modified_precision(references, hypothesis, n=1)) +- 1.0 +- >>> float(modified_precision(references, hypothesis, n=2)) +- 1.0 +- +- An example of a normal machine translation hypothesis: +- +- >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', +- ... 'ensures', 'that', 'the', 'military', 'always', +- ... 'obeys', 'the', 'commands', 'of', 'the', 'party'] +- +- >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops', +- ... 'forever', 'hearing', 'the', 'activity', 'guidebook', +- ... 'that', 'party', 'direct'] +- +- >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', +- ... 'ensures', 'that', 'the', 'military', 'will', +- ... 'forever', 'heed', 'Party', 'commands'] +- +- >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which', +- ... 'guarantees', 'the', 'military', 'forces', 'always', +- ... 'being', 'under', 'the', 'command', 'of', 'the', +- ... 'Party'] +- +- >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', +- ... 'army', 'always', 'to', 'heed', 'the', 'directions', +- ... 'of', 'the', 'party'] +- >>> references = [reference1, reference2, reference3] +- >>> float(modified_precision(references, hypothesis1, n=1)) # doctest: +ELLIPSIS +- 0.9444... +- >>> float(modified_precision(references, hypothesis2, n=1)) # doctest: +ELLIPSIS +- 0.5714... +- >>> float(modified_precision(references, hypothesis1, n=2)) # doctest: +ELLIPSIS +- 0.5882352941176471 +- >>> float(modified_precision(references, hypothesis2, n=2)) # doctest: +ELLIPSIS +- 0.07692... +- +- +- :param references: A list of reference translations. +- :type references: list(list(str)) +- :param hypothesis: A hypothesis translation. +- :type hypothesis: list(str) +- :param n: The ngram order. +- :type n: int +- :return: BLEU's modified precision for the nth order ngram. +- :rtype: Fraction +- """ +- # Extracts all ngrams in hypothesis +- # Set an empty Counter if hypothesis is empty. +- counts = Counter(ngrams(hypothesis, n)) if len(hypothesis) >= n else Counter() +- # Extract a union of references' counts. 
+- # max_counts = reduce(or_, [Counter(ngrams(ref, n)) for ref in references]) +- max_counts = {} +- for reference in references: +- reference_counts = ( +- Counter(ngrams(reference, n)) if len(reference) >= n else Counter() +- ) +- for ngram in counts: +- max_counts[ngram] = max(max_counts.get(ngram, 0), reference_counts[ngram]) +- +- # Assigns the intersection between hypothesis and references' counts. +- clipped_counts = { +- ngram: min(count, max_counts[ngram]) for ngram, count in counts.items() +- } +- +- numerator = sum(clipped_counts.values()) +- # Ensures that denominator is minimum 1 to avoid ZeroDivisionError. +- # Usually this happens when the ngram order is > len(reference). +- denominator = max(1, sum(counts.values())) +- +- return Fraction(numerator, denominator, _normalize=False) +- +- +-def closest_ref_length(references, hyp_len): +- """ +- This function finds the reference that is the closest length to the +- hypothesis. The closest reference length is referred to as *r* variable +- from the brevity penalty formula in Papineni et. al. (2002) +- +- :param references: A list of reference translations. +- :type references: list(list(str)) +- :param hyp_len: The length of the hypothesis. +- :type hyp_len: int +- :return: The length of the reference that's closest to the hypothesis. +- :rtype: int +- """ +- ref_lens = (len(reference) for reference in references) +- closest_ref_len = min( +- ref_lens, key=lambda ref_len: (abs(ref_len - hyp_len), ref_len) +- ) +- return closest_ref_len +- +- +-def brevity_penalty(closest_ref_len, hyp_len): +- """ +- Calculate brevity penalty. +- +- As the modified n-gram precision still has the problem from the short +- length sentence, brevity penalty is used to modify the overall BLEU +- score according to length. +- +- An example from the paper. There are three references with length 12, 15 +- and 17. And a concise hypothesis of the length 12. The brevity penalty is 1. +- +- >>> reference1 = list('aaaaaaaaaaaa') # i.e. ['a'] * 12 +- >>> reference2 = list('aaaaaaaaaaaaaaa') # i.e. ['a'] * 15 +- >>> reference3 = list('aaaaaaaaaaaaaaaaa') # i.e. ['a'] * 17 +- >>> hypothesis = list('aaaaaaaaaaaa') # i.e. ['a'] * 12 +- >>> references = [reference1, reference2, reference3] +- >>> hyp_len = len(hypothesis) +- >>> closest_ref_len = closest_ref_length(references, hyp_len) +- >>> brevity_penalty(closest_ref_len, hyp_len) +- 1.0 +- +- In case a hypothesis translation is shorter than the references, penalty is +- applied. +- +- >>> references = [['a'] * 28, ['a'] * 28] +- >>> hypothesis = ['a'] * 12 +- >>> hyp_len = len(hypothesis) +- >>> closest_ref_len = closest_ref_length(references, hyp_len) +- >>> brevity_penalty(closest_ref_len, hyp_len) +- 0.2635971381157267 +- +- The length of the closest reference is used to compute the penalty. If the +- length of a hypothesis is 12, and the reference lengths are 13 and 2, the +- penalty is applied because the hypothesis length (12) is less then the +- closest reference length (13). +- +- >>> references = [['a'] * 13, ['a'] * 2] +- >>> hypothesis = ['a'] * 12 +- >>> hyp_len = len(hypothesis) +- >>> closest_ref_len = closest_ref_length(references, hyp_len) +- >>> brevity_penalty(closest_ref_len, hyp_len) # doctest: +ELLIPSIS +- 0.9200... +- +- The brevity penalty doesn't depend on reference order. More importantly, +- when two reference sentences are at the same distance, the shortest +- reference sentence length is used. 
+- +- >>> references = [['a'] * 13, ['a'] * 11] +- >>> hypothesis = ['a'] * 12 +- >>> hyp_len = len(hypothesis) +- >>> closest_ref_len = closest_ref_length(references, hyp_len) +- >>> bp1 = brevity_penalty(closest_ref_len, hyp_len) +- >>> hyp_len = len(hypothesis) +- >>> closest_ref_len = closest_ref_length(reversed(references), hyp_len) +- >>> bp2 = brevity_penalty(closest_ref_len, hyp_len) +- >>> bp1 == bp2 == 1 +- True +- +- A test example from mteval-v13a.pl (starting from the line 705): +- +- >>> references = [['a'] * 11, ['a'] * 8] +- >>> hypothesis = ['a'] * 7 +- >>> hyp_len = len(hypothesis) +- >>> closest_ref_len = closest_ref_length(references, hyp_len) +- >>> brevity_penalty(closest_ref_len, hyp_len) # doctest: +ELLIPSIS +- 0.8668... +- +- >>> references = [['a'] * 11, ['a'] * 8, ['a'] * 6, ['a'] * 7] +- >>> hypothesis = ['a'] * 7 +- >>> hyp_len = len(hypothesis) +- >>> closest_ref_len = closest_ref_length(references, hyp_len) +- >>> brevity_penalty(closest_ref_len, hyp_len) +- 1.0 +- +- :param hyp_len: The length of the hypothesis for a single sentence OR the +- sum of all the hypotheses' lengths for a corpus +- :type hyp_len: int +- :param closest_ref_len: The length of the closest reference for a single +- hypothesis OR the sum of all the closest references for every hypotheses. +- :type closest_ref_len: int +- :return: BLEU's brevity penalty. +- :rtype: float +- """ +- if hyp_len > closest_ref_len: +- return 1 +- # If hypothesis is empty, brevity penalty = 0 should result in BLEU = 0.0 +- elif hyp_len == 0: +- return 0 +- else: +- return math.exp(1 - closest_ref_len / hyp_len) +- +- +-class SmoothingFunction: +- """ +- This is an implementation of the smoothing techniques +- for segment-level BLEU scores that was presented in +- Boxing Chen and Collin Cherry (2014) A Systematic Comparison of +- Smoothing Techniques for Sentence-Level BLEU. In WMT14. +- http://acl2014.org/acl2014/W14-33/pdf/W14-3346.pdf +- """ +- +- def __init__(self, epsilon=0.1, alpha=5, k=5): +- """ +- This will initialize the parameters required for the various smoothing +- techniques, the default values are set to the numbers used in the +- experiments from Chen and Cherry (2014). +- +- >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', 'ensures', +- ... 'that', 'the', 'military', 'always', 'obeys', 'the', +- ... 'commands', 'of', 'the', 'party'] +- >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', 'ensures', +- ... 'that', 'the', 'military', 'will', 'forever', 'heed', +- ... 'Party', 'commands'] +- +- >>> chencherry = SmoothingFunction() +- >>> print(sentence_bleu([reference1], hypothesis1)) # doctest: +ELLIPSIS +- 0.4118... +- >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method0)) # doctest: +ELLIPSIS +- 0.4118... +- >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method1)) # doctest: +ELLIPSIS +- 0.4118... +- >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method2)) # doctest: +ELLIPSIS +- 0.4452... +- >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method3)) # doctest: +ELLIPSIS +- 0.4118... +- >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method4)) # doctest: +ELLIPSIS +- 0.4118... +- >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method5)) # doctest: +ELLIPSIS +- 0.4905... 
+- >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method6)) # doctest: +ELLIPSIS +- 0.4135... +- >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method7)) # doctest: +ELLIPSIS +- 0.4905... +- +- :param epsilon: the epsilon value use in method 1 +- :type epsilon: float +- :param alpha: the alpha value use in method 6 +- :type alpha: int +- :param k: the k value use in method 4 +- :type k: int +- """ +- self.epsilon = epsilon +- self.alpha = alpha +- self.k = k +- +- def method0(self, p_n, *args, **kwargs): +- """ +- No smoothing. +- """ +- p_n_new = [] +- for i, p_i in enumerate(p_n): +- if p_i.numerator != 0: +- p_n_new.append(p_i) +- else: +- _msg = str( +- "\nThe hypothesis contains 0 counts of {}-gram overlaps.\n" +- "Therefore the BLEU score evaluates to 0, independently of\n" +- "how many N-gram overlaps of lower order it contains.\n" +- "Consider using lower n-gram order or use " +- "SmoothingFunction()" +- ).format(i + 1) +- warnings.warn(_msg) +- # When numerator==0 where denonminator==0 or !=0, the result +- # for the precision score should be equal to 0 or undefined. +- # Due to BLEU geometric mean computation in logarithm space, +- # we we need to take the return sys.float_info.min such that +- # math.log(sys.float_info.min) returns a 0 precision score. +- p_n_new.append(sys.float_info.min) +- return p_n_new +- +- def method1(self, p_n, *args, **kwargs): +- """ +- Smoothing method 1: Add *epsilon* counts to precision with 0 counts. +- """ +- return [ +- (p_i.numerator + self.epsilon) / p_i.denominator +- if p_i.numerator == 0 +- else p_i +- for p_i in p_n +- ] +- +- def method2(self, p_n, *args, **kwargs): +- """ +- Smoothing method 2: Add 1 to both numerator and denominator from +- Chin-Yew Lin and Franz Josef Och (2004) ORANGE: a Method for +- Evaluating Automatic Evaluation Metrics for Machine Translation. +- In COLING 2004. +- """ +- return [ +- Fraction(p_n[i].numerator + 1, p_n[i].denominator + 1, _normalize=False) +- if i != 0 +- else p_n[0] +- for i in range(len(p_n)) +- ] +- +- def method3(self, p_n, *args, **kwargs): +- """ +- Smoothing method 3: NIST geometric sequence smoothing +- The smoothing is computed by taking 1 / ( 2^k ), instead of 0, for each +- precision score whose matching n-gram count is null. +- k is 1 for the first 'n' value for which the n-gram match count is null/ +- +- For example, if the text contains: +- +- - one 2-gram match +- - and (consequently) two 1-gram matches +- +- the n-gram count for each individual precision score would be: +- +- - n=1 => prec_count = 2 (two unigrams) +- - n=2 => prec_count = 1 (one bigram) +- - n=3 => prec_count = 1/2 (no trigram, taking 'smoothed' value of 1 / ( 2^k ), with k=1) +- - n=4 => prec_count = 1/4 (no fourgram, taking 'smoothed' value of 1 / ( 2^k ), with k=2) +- """ +- incvnt = 1 # From the mteval-v13a.pl, it's referred to as k. +- for i, p_i in enumerate(p_n): +- if p_i.numerator == 0: +- p_n[i] = 1 / (2**incvnt * p_i.denominator) +- incvnt += 1 +- return p_n +- +- def method4(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs): +- """ +- Smoothing method 4: +- Shorter translations may have inflated precision values due to having +- smaller denominators; therefore, we give them proportionally +- smaller smoothed counts. Instead of scaling to 1/(2^k), Chen and Cherry +- suggests dividing by 1/ln(len(T)), where T is the length of the translation. 
+- """ +- incvnt = 1 +- hyp_len = hyp_len if hyp_len else len(hypothesis) +- for i, p_i in enumerate(p_n): +- if p_i.numerator == 0 and hyp_len > 1: +- # incvnt = i + 1 * self.k / math.log( +- # hyp_len +- # ) # Note that this K is different from the K from NIST. +- # p_n[i] = incvnt / p_i.denominator\ +- numerator = 1 / (2**incvnt * self.k / math.log(hyp_len)) +- p_n[i] = numerator / p_i.denominator +- incvnt += 1 +- return p_n +- +- def method5(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs): +- """ +- Smoothing method 5: +- The matched counts for similar values of n should be similar. To a +- calculate the n-gram matched count, it averages the n−1, n and n+1 gram +- matched counts. +- """ +- hyp_len = hyp_len if hyp_len else len(hypothesis) +- m = {} +- # Requires an precision value for an addition ngram order. +- p_n_plus1 = p_n + [modified_precision(references, hypothesis, 5)] +- m[-1] = p_n[0] + 1 +- for i, p_i in enumerate(p_n): +- p_n[i] = (m[i - 1] + p_i + p_n_plus1[i + 1]) / 3 +- m[i] = p_n[i] +- return p_n +- +- def method6(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs): +- """ +- Smoothing method 6: +- Interpolates the maximum likelihood estimate of the precision *p_n* with +- a prior estimate *pi0*. The prior is estimated by assuming that the ratio +- between pn and pn−1 will be the same as that between pn−1 and pn−2; from +- Gao and He (2013) Training MRF-Based Phrase Translation Models using +- Gradient Ascent. In NAACL. +- """ +- hyp_len = hyp_len if hyp_len else len(hypothesis) +- # This smoothing only works when p_1 and p_2 is non-zero. +- # Raise an error with an appropriate message when the input is too short +- # to use this smoothing technique. +- assert p_n[2], "This smoothing method requires non-zero precision for bigrams." +- for i, p_i in enumerate(p_n): +- if i in [0, 1]: # Skips the first 2 orders of ngrams. +- continue +- else: +- pi0 = 0 if p_n[i - 2] == 0 else p_n[i - 1] ** 2 / p_n[i - 2] +- # No. of ngrams in translation that matches the reference. +- m = p_i.numerator +- # No. of ngrams in translation. +- l = sum(1 for _ in ngrams(hypothesis, i + 1)) +- # Calculates the interpolated precision. +- p_n[i] = (m + self.alpha * pi0) / (l + self.alpha) +- return p_n +- +- def method7(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs): +- """ +- Smoothing method 7: +- Interpolates methods 4 and 5. 
+- """ +- hyp_len = hyp_len if hyp_len else len(hypothesis) +- p_n = self.method4(p_n, references, hypothesis, hyp_len) +- p_n = self.method5(p_n, references, hypothesis, hyp_len) +- return p_n ++# Natural Language Toolkit: BLEU Score ++# ++# Copyright (C) 2001-2023 NLTK Project ++# Authors: Chin Yee Lee, Hengfeng Li, Ruxin Hou, Calvin Tanujaya Lim ++# Contributors: Björn Mattsson, Dmitrijs Milajevs, Liling Tan ++# URL: ++# For license information, see LICENSE.TXT ++ ++"""BLEU score implementation.""" ++import math ++import sys ++import warnings ++from collections import Counter ++from fractions import Fraction as _Fraction ++ ++from nltk.util import ngrams ++ ++ ++class Fraction(_Fraction): ++ """Fraction with _normalize=False support for 3.12""" ++ ++ def __new__(cls, numerator=0, denominator=None, _normalize=False): ++ if sys.version_info >= (3, 12): ++ self = super().__new__(cls, numerator, denominator) ++ else: ++ self = super().__new__(cls, numerator, denominator, _normalize=_normalize) ++ self._normalize = _normalize ++ self._original_numerator = numerator ++ self._original_denominator = denominator ++ return self ++ ++ @property ++ def numerator(self): ++ if not self._normalize: ++ return self._original_numerator ++ return super().numerator ++ ++ @property ++ def denominator(self): ++ if not self._normalize: ++ return self._original_denominator ++ return super().denominator ++ ++ ++def sentence_bleu( ++ references, ++ hypothesis, ++ weights=(0.25, 0.25, 0.25, 0.25), ++ smoothing_function=None, ++ auto_reweigh=False, ++): ++ """ ++ Calculate BLEU score (Bilingual Evaluation Understudy) from ++ Papineni, Kishore, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002. ++ "BLEU: a method for automatic evaluation of machine translation." ++ In Proceedings of ACL. https://www.aclweb.org/anthology/P02-1040.pdf ++ ++ >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', ++ ... 'ensures', 'that', 'the', 'military', 'always', ++ ... 'obeys', 'the', 'commands', 'of', 'the', 'party'] ++ ++ >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops', ++ ... 'forever', 'hearing', 'the', 'activity', 'guidebook', ++ ... 'that', 'party', 'direct'] ++ ++ >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', ++ ... 'ensures', 'that', 'the', 'military', 'will', 'forever', ++ ... 'heed', 'Party', 'commands'] ++ ++ >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which', ++ ... 'guarantees', 'the', 'military', 'forces', 'always', ++ ... 'being', 'under', 'the', 'command', 'of', 'the', ++ ... 'Party'] ++ ++ >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', ++ ... 'army', 'always', 'to', 'heed', 'the', 'directions', ++ ... 'of', 'the', 'party'] ++ ++ >>> sentence_bleu([reference1, reference2, reference3], hypothesis1) # doctest: +ELLIPSIS ++ 0.5045... ++ ++ If there is no ngrams overlap for any order of n-grams, BLEU returns the ++ value 0. This is because the precision for the order of n-grams without ++ overlap is 0, and the geometric mean in the final BLEU score computation ++ multiplies the 0 with the precision of other n-grams. This results in 0 ++ (independently of the precision of the other n-gram orders). The following ++ example has zero 3-gram and 4-gram overlaps: ++ ++ >>> round(sentence_bleu([reference1, reference2, reference3], hypothesis2),4) # doctest: +ELLIPSIS ++ 0.0 ++ ++ To avoid this harsh behaviour when no ngram overlaps are found a smoothing ++ function can be used. 
++ ++ >>> chencherry = SmoothingFunction() ++ >>> sentence_bleu([reference1, reference2, reference3], hypothesis2, ++ ... smoothing_function=chencherry.method1) # doctest: +ELLIPSIS ++ 0.0370... ++ ++ The default BLEU calculates a score for up to 4-grams using uniform ++ weights (this is called BLEU-4). To evaluate your translations with ++ higher/lower order ngrams, use customized weights. E.g. when accounting ++ for up to 5-grams with uniform weights (this is called BLEU-5) use: ++ ++ >>> weights = (1./5., 1./5., 1./5., 1./5., 1./5.) ++ >>> sentence_bleu([reference1, reference2, reference3], hypothesis1, weights) # doctest: +ELLIPSIS ++ 0.3920... ++ ++ Multiple BLEU scores can be computed at once, by supplying a list of weights. ++ E.g. for computing BLEU-2, BLEU-3 *and* BLEU-4 in one computation, use: ++ >>> weights = [ ++ ... (1./2., 1./2.), ++ ... (1./3., 1./3., 1./3.), ++ ... (1./4., 1./4., 1./4., 1./4.) ++ ... ] ++ >>> sentence_bleu([reference1, reference2, reference3], hypothesis1, weights) # doctest: +ELLIPSIS ++ [0.7453..., 0.6240..., 0.5045...] ++ ++ :param references: reference sentences ++ :type references: list(list(str)) ++ :param hypothesis: a hypothesis sentence ++ :type hypothesis: list(str) ++ :param weights: weights for unigrams, bigrams, trigrams and so on (one or a list of weights) ++ :type weights: tuple(float) / list(tuple(float)) ++ :param smoothing_function: ++ :type smoothing_function: SmoothingFunction ++ :param auto_reweigh: Option to re-normalize the weights uniformly. ++ :type auto_reweigh: bool ++ :return: The sentence-level BLEU score. Returns a list if multiple weights were supplied. ++ :rtype: float / list(float) ++ """ ++ return corpus_bleu( ++ [references], [hypothesis], weights, smoothing_function, auto_reweigh ++ ) ++ ++ ++def corpus_bleu( ++ list_of_references, ++ hypotheses, ++ weights=(0.25, 0.25, 0.25, 0.25), ++ smoothing_function=None, ++ auto_reweigh=False, ++): ++ """ ++ Calculate a single corpus-level BLEU score (aka. system-level BLEU) for all ++ the hypotheses and their respective references. ++ ++ Instead of averaging the sentence level BLEU scores (i.e. macro-average ++ precision), the original BLEU metric (Papineni et al. 2002) accounts for ++ the micro-average precision (i.e. summing the numerators and denominators ++ for each hypothesis-reference(s) pairs before the division). ++ ++ >>> hyp1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', ++ ... 'ensures', 'that', 'the', 'military', 'always', ++ ... 'obeys', 'the', 'commands', 'of', 'the', 'party'] ++ >>> ref1a = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', ++ ... 'ensures', 'that', 'the', 'military', 'will', 'forever', ++ ... 'heed', 'Party', 'commands'] ++ >>> ref1b = ['It', 'is', 'the', 'guiding', 'principle', 'which', ++ ... 'guarantees', 'the', 'military', 'forces', 'always', ++ ... 'being', 'under', 'the', 'command', 'of', 'the', 'Party'] ++ >>> ref1c = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', ++ ... 'army', 'always', 'to', 'heed', 'the', 'directions', ++ ... 'of', 'the', 'party'] ++ ++ >>> hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was', ++ ... 'interested', 'in', 'world', 'history'] ++ >>> ref2a = ['he', 'was', 'interested', 'in', 'world', 'history', ++ ... 'because', 'he', 'read', 'the', 'book'] ++ ++ >>> list_of_references = [[ref1a, ref1b, ref1c], [ref2a]] ++ >>> hypotheses = [hyp1, hyp2] ++ >>> corpus_bleu(list_of_references, hypotheses) # doctest: +ELLIPSIS ++ 0.5920... 
++ ++ The example below show that corpus_bleu() is different from averaging ++ sentence_bleu() for hypotheses ++ ++ >>> score1 = sentence_bleu([ref1a, ref1b, ref1c], hyp1) ++ >>> score2 = sentence_bleu([ref2a], hyp2) ++ >>> (score1 + score2) / 2 # doctest: +ELLIPSIS ++ 0.6223... ++ ++ Custom weights may be supplied to fine-tune the BLEU score further. ++ A tuple of float weights for unigrams, bigrams, trigrams and so on can be given. ++ >>> weights = (0.1, 0.3, 0.5, 0.1) ++ >>> corpus_bleu(list_of_references, hypotheses, weights=weights) # doctest: +ELLIPSIS ++ 0.5818... ++ ++ This particular weight gave extra value to trigrams. ++ Furthermore, multiple weights can be given, resulting in multiple BLEU scores. ++ >>> weights = [ ++ ... (0.5, 0.5), ++ ... (0.333, 0.333, 0.334), ++ ... (0.25, 0.25, 0.25, 0.25), ++ ... (0.2, 0.2, 0.2, 0.2, 0.2) ++ ... ] ++ >>> corpus_bleu(list_of_references, hypotheses, weights=weights) # doctest: +ELLIPSIS ++ [0.8242..., 0.7067..., 0.5920..., 0.4719...] ++ ++ :param list_of_references: a corpus of lists of reference sentences, w.r.t. hypotheses ++ :type list_of_references: list(list(list(str))) ++ :param hypotheses: a list of hypothesis sentences ++ :type hypotheses: list(list(str)) ++ :param weights: weights for unigrams, bigrams, trigrams and so on (one or a list of weights) ++ :type weights: tuple(float) / list(tuple(float)) ++ :param smoothing_function: ++ :type smoothing_function: SmoothingFunction ++ :param auto_reweigh: Option to re-normalize the weights uniformly. ++ :type auto_reweigh: bool ++ :return: The corpus-level BLEU score. ++ :rtype: float ++ """ ++ # Before proceeding to compute BLEU, perform sanity checks. ++ ++ p_numerators = Counter() # Key = ngram order, and value = no. of ngram matches. ++ p_denominators = Counter() # Key = ngram order, and value = no. of ngram in ref. ++ hyp_lengths, ref_lengths = 0, 0 ++ ++ assert len(list_of_references) == len(hypotheses), ( ++ "The number of hypotheses and their reference(s) should be the " "same " ++ ) ++ ++ try: ++ weights[0][0] ++ except TypeError: ++ weights = [weights] ++ max_weight_length = max(len(weight) for weight in weights) ++ ++ # Iterate through each hypothesis and their corresponding references. ++ for references, hypothesis in zip(list_of_references, hypotheses): ++ # For each order of ngram, calculate the numerator and ++ # denominator for the corpus-level modified precision. ++ for i in range(1, max_weight_length + 1): ++ p_i = modified_precision(references, hypothesis, i) ++ p_numerators[i] += p_i.numerator ++ p_denominators[i] += p_i.denominator ++ ++ # Calculate the hypothesis length and the closest reference length. ++ # Adds them to the corpus-level hypothesis and reference counts. ++ hyp_len = len(hypothesis) ++ hyp_lengths += hyp_len ++ ref_lengths += closest_ref_length(references, hyp_len) ++ ++ # Calculate corpus-level brevity penalty. ++ bp = brevity_penalty(ref_lengths, hyp_lengths) ++ ++ # Collects the various precision values for the different ngram orders. ++ p_n = [ ++ Fraction(p_numerators[i], p_denominators[i], _normalize=False) ++ for i in range(1, max_weight_length + 1) ++ ] ++ ++ # Returns 0 if there's no matching n-grams ++ # We only need to check for p_numerators[1] == 0, since if there's ++ # no unigrams, there won't be any higher order ngrams. ++ if p_numerators[1] == 0: ++ return 0 if len(weights) == 1 else [0] * len(weights) ++ ++ # If there's no smoothing, set use method0 from SmoothinFunction class. 
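++ # (Editor's note, not part of upstream NLTK or of this patch: method0
++ # leaves nonzero precisions untouched and substitutes sys.float_info.min
++ # for zero precisions, so math.log() in the geometric mean below stays
++ # defined and the overall BLEU score evaluates to ~0 instead of raising
++ # a math domain error.)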
++ if not smoothing_function: ++ smoothing_function = SmoothingFunction().method0 ++ # Smoothen the modified precision. ++ # Note: smoothing_function() may convert values into floats; ++ # it tries to retain the Fraction object as much as the ++ # smoothing method allows. ++ p_n = smoothing_function( ++ p_n, references=references, hypothesis=hypothesis, hyp_len=hyp_lengths ++ ) ++ ++ bleu_scores = [] ++ for weight in weights: ++ # Uniformly re-weighting based on maximum hypothesis lengths if largest ++ # order of n-grams < 4 and weights is set at default. ++ if auto_reweigh: ++ if hyp_lengths < 4 and weight == (0.25, 0.25, 0.25, 0.25): ++ weight = (1 / hyp_lengths,) * hyp_lengths ++ ++ s = (w_i * math.log(p_i) for w_i, p_i in zip(weight, p_n) if p_i > 0) ++ s = bp * math.exp(math.fsum(s)) ++ bleu_scores.append(s) ++ return bleu_scores[0] if len(weights) == 1 else bleu_scores ++ ++ ++def modified_precision(references, hypothesis, n): ++ """ ++ Calculate modified ngram precision. ++ ++ The normal precision method may lead to some wrong translations with ++ high-precision, e.g., the translation, in which a word of reference ++ repeats several times, has very high precision. ++ ++ This function only returns the Fraction object that contains the numerator ++ and denominator necessary to calculate the corpus-level precision. ++ To calculate the modified precision for a single pair of hypothesis and ++ references, cast the Fraction object into a float. ++ ++ The famous "the the the ... " example shows that you can get BLEU precision ++ by duplicating high frequency words. ++ ++ >>> reference1 = 'the cat is on the mat'.split() ++ >>> reference2 = 'there is a cat on the mat'.split() ++ >>> hypothesis1 = 'the the the the the the the'.split() ++ >>> references = [reference1, reference2] ++ >>> float(modified_precision(references, hypothesis1, n=1)) # doctest: +ELLIPSIS ++ 0.2857... ++ ++ In the modified n-gram precision, a reference word will be considered ++ exhausted after a matching hypothesis word is identified, e.g. ++ ++ >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', ++ ... 'ensures', 'that', 'the', 'military', 'will', ++ ... 'forever', 'heed', 'Party', 'commands'] ++ >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which', ++ ... 'guarantees', 'the', 'military', 'forces', 'always', ++ ... 'being', 'under', 'the', 'command', 'of', 'the', ++ ... 'Party'] ++ >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', ++ ... 'army', 'always', 'to', 'heed', 'the', 'directions', ++ ... 'of', 'the', 'party'] ++ >>> hypothesis = 'of the'.split() ++ >>> references = [reference1, reference2, reference3] ++ >>> float(modified_precision(references, hypothesis, n=1)) ++ 1.0 ++ >>> float(modified_precision(references, hypothesis, n=2)) ++ 1.0 ++ ++ An example of a normal machine translation hypothesis: ++ ++ >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', ++ ... 'ensures', 'that', 'the', 'military', 'always', ++ ... 'obeys', 'the', 'commands', 'of', 'the', 'party'] ++ ++ >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops', ++ ... 'forever', 'hearing', 'the', 'activity', 'guidebook', ++ ... 'that', 'party', 'direct'] ++ ++ >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', ++ ... 'ensures', 'that', 'the', 'military', 'will', ++ ... 'forever', 'heed', 'Party', 'commands'] ++ ++ >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which', ++ ... 'guarantees', 'the', 'military', 'forces', 'always', ++ ... 
'being', 'under', 'the', 'command', 'of', 'the', ++ ... 'Party'] ++ ++ >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', ++ ... 'army', 'always', 'to', 'heed', 'the', 'directions', ++ ... 'of', 'the', 'party'] ++ >>> references = [reference1, reference2, reference3] ++ >>> float(modified_precision(references, hypothesis1, n=1)) # doctest: +ELLIPSIS ++ 0.9444... ++ >>> float(modified_precision(references, hypothesis2, n=1)) # doctest: +ELLIPSIS ++ 0.5714... ++ >>> float(modified_precision(references, hypothesis1, n=2)) # doctest: +ELLIPSIS ++ 0.5882352941176471 ++ >>> float(modified_precision(references, hypothesis2, n=2)) # doctest: +ELLIPSIS ++ 0.07692... ++ ++ ++ :param references: A list of reference translations. ++ :type references: list(list(str)) ++ :param hypothesis: A hypothesis translation. ++ :type hypothesis: list(str) ++ :param n: The ngram order. ++ :type n: int ++ :return: BLEU's modified precision for the nth order ngram. ++ :rtype: Fraction ++ """ ++ # Extracts all ngrams in hypothesis ++ # Set an empty Counter if hypothesis is empty. ++ counts = Counter(ngrams(hypothesis, n)) if len(hypothesis) >= n else Counter() ++ # Extract a union of references' counts. ++ # max_counts = reduce(or_, [Counter(ngrams(ref, n)) for ref in references]) ++ max_counts = {} ++ for reference in references: ++ reference_counts = ( ++ Counter(ngrams(reference, n)) if len(reference) >= n else Counter() ++ ) ++ for ngram in counts: ++ max_counts[ngram] = max(max_counts.get(ngram, 0), reference_counts[ngram]) ++ ++ # Assigns the intersection between hypothesis and references' counts. ++ clipped_counts = { ++ ngram: min(count, max_counts[ngram]) for ngram, count in counts.items() ++ } ++ ++ numerator = sum(clipped_counts.values()) ++ # Ensures that denominator is minimum 1 to avoid ZeroDivisionError. ++ # Usually this happens when the ngram order is > len(reference). ++ denominator = max(1, sum(counts.values())) ++ ++ return Fraction(numerator, denominator, _normalize=False) ++ ++ ++def closest_ref_length(references, hyp_len): ++ """ ++ This function finds the reference that is the closest length to the ++ hypothesis. The closest reference length is referred to as *r* variable ++ from the brevity penalty formula in Papineni et. al. (2002) ++ ++ :param references: A list of reference translations. ++ :type references: list(list(str)) ++ :param hyp_len: The length of the hypothesis. ++ :type hyp_len: int ++ :return: The length of the reference that's closest to the hypothesis. ++ :rtype: int ++ """ ++ ref_lens = (len(reference) for reference in references) ++ closest_ref_len = min( ++ ref_lens, key=lambda ref_len: (abs(ref_len - hyp_len), ref_len) ++ ) ++ return closest_ref_len ++ ++ ++def brevity_penalty(closest_ref_len, hyp_len): ++ """ ++ Calculate brevity penalty. ++ ++ As the modified n-gram precision still has the problem from the short ++ length sentence, brevity penalty is used to modify the overall BLEU ++ score according to length. ++ ++ An example from the paper. There are three references with length 12, 15 ++ and 17. And a concise hypothesis of the length 12. The brevity penalty is 1. ++ ++ >>> reference1 = list('aaaaaaaaaaaa') # i.e. ['a'] * 12 ++ >>> reference2 = list('aaaaaaaaaaaaaaa') # i.e. ['a'] * 15 ++ >>> reference3 = list('aaaaaaaaaaaaaaaaa') # i.e. ['a'] * 17 ++ >>> hypothesis = list('aaaaaaaaaaaa') # i.e. 
['a'] * 12 ++ >>> references = [reference1, reference2, reference3] ++ >>> hyp_len = len(hypothesis) ++ >>> closest_ref_len = closest_ref_length(references, hyp_len) ++ >>> brevity_penalty(closest_ref_len, hyp_len) ++ 1.0 ++ ++ In case a hypothesis translation is shorter than the references, penalty is ++ applied. ++ ++ >>> references = [['a'] * 28, ['a'] * 28] ++ >>> hypothesis = ['a'] * 12 ++ >>> hyp_len = len(hypothesis) ++ >>> closest_ref_len = closest_ref_length(references, hyp_len) ++ >>> brevity_penalty(closest_ref_len, hyp_len) ++ 0.2635971381157267 ++ ++ The length of the closest reference is used to compute the penalty. If the ++ length of a hypothesis is 12, and the reference lengths are 13 and 2, the ++ penalty is applied because the hypothesis length (12) is less then the ++ closest reference length (13). ++ ++ >>> references = [['a'] * 13, ['a'] * 2] ++ >>> hypothesis = ['a'] * 12 ++ >>> hyp_len = len(hypothesis) ++ >>> closest_ref_len = closest_ref_length(references, hyp_len) ++ >>> brevity_penalty(closest_ref_len, hyp_len) # doctest: +ELLIPSIS ++ 0.9200... ++ ++ The brevity penalty doesn't depend on reference order. More importantly, ++ when two reference sentences are at the same distance, the shortest ++ reference sentence length is used. ++ ++ >>> references = [['a'] * 13, ['a'] * 11] ++ >>> hypothesis = ['a'] * 12 ++ >>> hyp_len = len(hypothesis) ++ >>> closest_ref_len = closest_ref_length(references, hyp_len) ++ >>> bp1 = brevity_penalty(closest_ref_len, hyp_len) ++ >>> hyp_len = len(hypothesis) ++ >>> closest_ref_len = closest_ref_length(reversed(references), hyp_len) ++ >>> bp2 = brevity_penalty(closest_ref_len, hyp_len) ++ >>> bp1 == bp2 == 1 ++ True ++ ++ A test example from mteval-v13a.pl (starting from the line 705): ++ ++ >>> references = [['a'] * 11, ['a'] * 8] ++ >>> hypothesis = ['a'] * 7 ++ >>> hyp_len = len(hypothesis) ++ >>> closest_ref_len = closest_ref_length(references, hyp_len) ++ >>> brevity_penalty(closest_ref_len, hyp_len) # doctest: +ELLIPSIS ++ 0.8668... ++ ++ >>> references = [['a'] * 11, ['a'] * 8, ['a'] * 6, ['a'] * 7] ++ >>> hypothesis = ['a'] * 7 ++ >>> hyp_len = len(hypothesis) ++ >>> closest_ref_len = closest_ref_length(references, hyp_len) ++ >>> brevity_penalty(closest_ref_len, hyp_len) ++ 1.0 ++ ++ :param hyp_len: The length of the hypothesis for a single sentence OR the ++ sum of all the hypotheses' lengths for a corpus ++ :type hyp_len: int ++ :param closest_ref_len: The length of the closest reference for a single ++ hypothesis OR the sum of all the closest references for every hypotheses. ++ :type closest_ref_len: int ++ :return: BLEU's brevity penalty. ++ :rtype: float ++ """ ++ if hyp_len > closest_ref_len: ++ return 1 ++ # If hypothesis is empty, brevity penalty = 0 should result in BLEU = 0.0 ++ elif hyp_len == 0: ++ return 0 ++ else: ++ return math.exp(1 - closest_ref_len / hyp_len) ++ ++ ++class SmoothingFunction: ++ """ ++ This is an implementation of the smoothing techniques ++ for segment-level BLEU scores that was presented in ++ Boxing Chen and Collin Cherry (2014) A Systematic Comparison of ++ Smoothing Techniques for Sentence-Level BLEU. In WMT14. ++ http://acl2014.org/acl2014/W14-33/pdf/W14-3346.pdf ++ """ ++ ++ def __init__(self, epsilon=0.1, alpha=5, k=5): ++ """ ++ This will initialize the parameters required for the various smoothing ++ techniques, the default values are set to the numbers used in the ++ experiments from Chen and Cherry (2014). 
++ ++ >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', 'ensures', ++ ... 'that', 'the', 'military', 'always', 'obeys', 'the', ++ ... 'commands', 'of', 'the', 'party'] ++ >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', 'ensures', ++ ... 'that', 'the', 'military', 'will', 'forever', 'heed', ++ ... 'Party', 'commands'] ++ ++ >>> chencherry = SmoothingFunction() ++ >>> print(sentence_bleu([reference1], hypothesis1)) # doctest: +ELLIPSIS ++ 0.4118... ++ >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method0)) # doctest: +ELLIPSIS ++ 0.4118... ++ >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method1)) # doctest: +ELLIPSIS ++ 0.4118... ++ >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method2)) # doctest: +ELLIPSIS ++ 0.4452... ++ >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method3)) # doctest: +ELLIPSIS ++ 0.4118... ++ >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method4)) # doctest: +ELLIPSIS ++ 0.4118... ++ >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method5)) # doctest: +ELLIPSIS ++ 0.4905... ++ >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method6)) # doctest: +ELLIPSIS ++ 0.4135... ++ >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method7)) # doctest: +ELLIPSIS ++ 0.4905... ++ ++ :param epsilon: the epsilon value use in method 1 ++ :type epsilon: float ++ :param alpha: the alpha value use in method 6 ++ :type alpha: int ++ :param k: the k value use in method 4 ++ :type k: int ++ """ ++ self.epsilon = epsilon ++ self.alpha = alpha ++ self.k = k ++ ++ def method0(self, p_n, *args, **kwargs): ++ """ ++ No smoothing. ++ """ ++ p_n_new = [] ++ for i, p_i in enumerate(p_n): ++ if p_i.numerator != 0: ++ p_n_new.append(p_i) ++ else: ++ _msg = str( ++ "\nThe hypothesis contains 0 counts of {}-gram overlaps.\n" ++ "Therefore the BLEU score evaluates to 0, independently of\n" ++ "how many N-gram overlaps of lower order it contains.\n" ++ "Consider using lower n-gram order or use " ++ "SmoothingFunction()" ++ ).format(i + 1) ++ warnings.warn(_msg) ++ # When numerator==0 where denonminator==0 or !=0, the result ++ # for the precision score should be equal to 0 or undefined. ++ # Due to BLEU geometric mean computation in logarithm space, ++ # we we need to take the return sys.float_info.min such that ++ # math.log(sys.float_info.min) returns a 0 precision score. ++ p_n_new.append(sys.float_info.min) ++ return p_n_new ++ ++ def method1(self, p_n, *args, **kwargs): ++ """ ++ Smoothing method 1: Add *epsilon* counts to precision with 0 counts. ++ """ ++ return [ ++ (p_i.numerator + self.epsilon) / p_i.denominator ++ if p_i.numerator == 0 ++ else p_i ++ for p_i in p_n ++ ] ++ ++ def method2(self, p_n, *args, **kwargs): ++ """ ++ Smoothing method 2: Add 1 to both numerator and denominator from ++ Chin-Yew Lin and Franz Josef Och (2004) ORANGE: a Method for ++ Evaluating Automatic Evaluation Metrics for Machine Translation. ++ In COLING 2004. 
++ """ ++ return [ ++ Fraction(p_n[i].numerator + 1, p_n[i].denominator + 1, _normalize=False) ++ if i != 0 ++ else p_n[0] ++ for i in range(len(p_n)) ++ ] ++ ++ def method3(self, p_n, *args, **kwargs): ++ """ ++ Smoothing method 3: NIST geometric sequence smoothing ++ The smoothing is computed by taking 1 / ( 2^k ), instead of 0, for each ++ precision score whose matching n-gram count is null. ++ k is 1 for the first 'n' value for which the n-gram match count is null/ ++ ++ For example, if the text contains: ++ ++ - one 2-gram match ++ - and (consequently) two 1-gram matches ++ ++ the n-gram count for each individual precision score would be: ++ ++ - n=1 => prec_count = 2 (two unigrams) ++ - n=2 => prec_count = 1 (one bigram) ++ - n=3 => prec_count = 1/2 (no trigram, taking 'smoothed' value of 1 / ( 2^k ), with k=1) ++ - n=4 => prec_count = 1/4 (no fourgram, taking 'smoothed' value of 1 / ( 2^k ), with k=2) ++ """ ++ incvnt = 1 # From the mteval-v13a.pl, it's referred to as k. ++ for i, p_i in enumerate(p_n): ++ if p_i.numerator == 0: ++ p_n[i] = 1 / (2**incvnt * p_i.denominator) ++ incvnt += 1 ++ return p_n ++ ++ def method4(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs): ++ """ ++ Smoothing method 4: ++ Shorter translations may have inflated precision values due to having ++ smaller denominators; therefore, we give them proportionally ++ smaller smoothed counts. Instead of scaling to 1/(2^k), Chen and Cherry ++ suggests dividing by 1/ln(len(T)), where T is the length of the translation. ++ """ ++ incvnt = 1 ++ hyp_len = hyp_len if hyp_len else len(hypothesis) ++ for i, p_i in enumerate(p_n): ++ if p_i.numerator == 0 and hyp_len > 1: ++ # incvnt = i + 1 * self.k / math.log( ++ # hyp_len ++ # ) # Note that this K is different from the K from NIST. ++ # p_n[i] = incvnt / p_i.denominator\ ++ numerator = 1 / (2**incvnt * self.k / math.log(hyp_len)) ++ p_n[i] = numerator / p_i.denominator ++ incvnt += 1 ++ return p_n ++ ++ def method5(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs): ++ """ ++ Smoothing method 5: ++ The matched counts for similar values of n should be similar. To a ++ calculate the n-gram matched count, it averages the n−1, n and n+1 gram ++ matched counts. ++ """ ++ hyp_len = hyp_len if hyp_len else len(hypothesis) ++ m = {} ++ # Requires an precision value for an addition ngram order. ++ p_n_plus1 = p_n + [modified_precision(references, hypothesis, 5)] ++ m[-1] = p_n[0] + 1 ++ for i, p_i in enumerate(p_n): ++ p_n[i] = (m[i - 1] + p_i + p_n_plus1[i + 1]) / 3 ++ m[i] = p_n[i] ++ return p_n ++ ++ def method6(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs): ++ """ ++ Smoothing method 6: ++ Interpolates the maximum likelihood estimate of the precision *p_n* with ++ a prior estimate *pi0*. The prior is estimated by assuming that the ratio ++ between pn and pn−1 will be the same as that between pn−1 and pn−2; from ++ Gao and He (2013) Training MRF-Based Phrase Translation Models using ++ Gradient Ascent. In NAACL. ++ """ ++ hyp_len = hyp_len if hyp_len else len(hypothesis) ++ # This smoothing only works when p_1 and p_2 is non-zero. ++ # Raise an error with an appropriate message when the input is too short ++ # to use this smoothing technique. ++ assert p_n[2], "This smoothing method requires non-zero precision for bigrams." ++ for i, p_i in enumerate(p_n): ++ if i in [0, 1]: # Skips the first 2 orders of ngrams. ++ continue ++ else: ++ pi0 = 0 if p_n[i - 2] == 0 else p_n[i - 1] ** 2 / p_n[i - 2] ++ # No. 
of ngrams in translation that matches the reference. ++ m = p_i.numerator ++ # No. of ngrams in translation. ++ l = sum(1 for _ in ngrams(hypothesis, i + 1)) ++ # Calculates the interpolated precision. ++ p_n[i] = (m + self.alpha * pi0) / (l + self.alpha) ++ return p_n ++ ++ def method7(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs): ++ """ ++ Smoothing method 7: ++ Interpolates methods 4 and 5. ++ """ ++ hyp_len = hyp_len if hyp_len else len(hypothesis) ++ p_n = self.method4(p_n, references, hypothesis, hyp_len) ++ p_n = self.method5(p_n, references, hypothesis, hyp_len) ++ return p_n +Index: nltk-3.8.1/README.md +=================================================================== +--- nltk-3.8.1.orig/README.md ++++ nltk-3.8.1/README.md +@@ -1,50 +1,50 @@ +-# Natural Language Toolkit (NLTK) +-[![PyPI](https://img.shields.io/pypi/v/nltk.svg)](https://pypi.python.org/pypi/nltk) +-![CI](https://github.com/nltk/nltk/actions/workflows/ci.yaml/badge.svg?branch=develop) +- +-NLTK -- the Natural Language Toolkit -- is a suite of open source Python +-modules, data sets, and tutorials supporting research and development in Natural +-Language Processing. NLTK requires Python version 3.7, 3.8, 3.9, 3.10 or 3.11. +- +-For documentation, please visit [nltk.org](https://www.nltk.org/). +- +- +-## Contributing +- +-Do you want to contribute to NLTK development? Great! +-Please read [CONTRIBUTING.md](CONTRIBUTING.md) for more details. +- +-See also [how to contribute to NLTK](https://www.nltk.org/contribute.html). +- +- +-## Donate +- +-Have you found the toolkit helpful? Please support NLTK development by donating +-to the project via PayPal, using the link on the NLTK homepage. +- +- +-## Citing +- +-If you publish work that uses NLTK, please cite the NLTK book, as follows: +- +- Bird, Steven, Edward Loper and Ewan Klein (2009). +- Natural Language Processing with Python. O'Reilly Media Inc. +- +- +-## Copyright +- +-Copyright (C) 2001-2023 NLTK Project +- +-For license information, see [LICENSE.txt](LICENSE.txt). +- +-[AUTHORS.md](AUTHORS.md) contains a list of everyone who has contributed to NLTK. +- +- +-### Redistributing +- +-- NLTK source code is distributed under the Apache 2.0 License. +-- NLTK documentation is distributed under the Creative Commons +- Attribution-Noncommercial-No Derivative Works 3.0 United States license. +-- NLTK corpora are provided under the terms given in the README file for each +- corpus; all are redistributable and available for non-commercial use. +-- NLTK may be freely redistributed, subject to the provisions of these licenses. ++# Natural Language Toolkit (NLTK) ++[![PyPI](https://img.shields.io/pypi/v/nltk.svg)](https://pypi.python.org/pypi/nltk) ++![CI](https://github.com/nltk/nltk/actions/workflows/ci.yaml/badge.svg?branch=develop) ++ ++NLTK -- the Natural Language Toolkit -- is a suite of open source Python ++modules, data sets, and tutorials supporting research and development in Natural ++Language Processing. NLTK requires Python version 3.7, 3.8, 3.9, 3.10, 3.11 or 3.12. ++ ++For documentation, please visit [nltk.org](https://www.nltk.org/). ++ ++ ++## Contributing ++ ++Do you want to contribute to NLTK development? Great! ++Please read [CONTRIBUTING.md](CONTRIBUTING.md) for more details. ++ ++See also [how to contribute to NLTK](https://www.nltk.org/contribute.html). ++ ++ ++## Donate ++ ++Have you found the toolkit helpful? Please support NLTK development by donating ++to the project via PayPal, using the link on the NLTK homepage. 
++ ++ ++## Citing ++ ++If you publish work that uses NLTK, please cite the NLTK book, as follows: ++ ++ Bird, Steven, Edward Loper and Ewan Klein (2009). ++ Natural Language Processing with Python. O'Reilly Media Inc. ++ ++ ++## Copyright ++ ++Copyright (C) 2001-2023 NLTK Project ++ ++For license information, see [LICENSE.txt](LICENSE.txt). ++ ++[AUTHORS.md](AUTHORS.md) contains a list of everyone who has contributed to NLTK. ++ ++ ++### Redistributing ++ ++- NLTK source code is distributed under the Apache 2.0 License. ++- NLTK documentation is distributed under the Creative Commons ++ Attribution-Noncommercial-No Derivative Works 3.0 United States license. ++- NLTK corpora are provided under the terms given in the README file for each ++ corpus; all are redistributable and available for non-commercial use. ++- NLTK may be freely redistributed, subject to the provisions of these licenses. +Index: nltk-3.8.1/setup.py +=================================================================== +--- nltk-3.8.1.orig/setup.py ++++ nltk-3.8.1/setup.py +@@ -1,125 +1,126 @@ +-#!/usr/bin/env python +-# +-# Setup script for the Natural Language Toolkit +-# +-# Copyright (C) 2001-2023 NLTK Project +-# Author: NLTK Team +-# URL: +-# For license information, see LICENSE.TXT +- +-# Work around mbcs bug in distutils. +-# https://bugs.python.org/issue10945 +-import codecs +- +-try: +- codecs.lookup("mbcs") +-except LookupError: +- ascii = codecs.lookup("ascii") +- func = lambda name, enc=ascii: {True: enc}.get(name == "mbcs") +- codecs.register(func) +- +-import os +- +-# Use the VERSION file to get NLTK version +-version_file = os.path.join(os.path.dirname(__file__), "nltk", "VERSION") +-with open(version_file) as fh: +- nltk_version = fh.read().strip() +- +-# setuptools +-from setuptools import find_packages, setup +- +-# Specify groups of optional dependencies +-extras_require = { +- "machine_learning": [ +- "numpy", +- "python-crfsuite", +- "scikit-learn", +- "scipy", +- ], +- "plot": ["matplotlib"], +- "tgrep": ["pyparsing"], +- "twitter": ["twython"], +- "corenlp": ["requests"], +-} +- +-# Add a group made up of all optional dependencies +-extras_require["all"] = { +- package for group in extras_require.values() for package in group +-} +- +-# Adds CLI commands +-console_scripts = """ +-[console_scripts] +-nltk=nltk.cli:cli +-""" +- +-_project_homepage = "https://www.nltk.org/" +- +-setup( +- name="nltk", +- description="Natural Language Toolkit", +- version=nltk_version, +- url=_project_homepage, +- project_urls={ +- "Documentation": _project_homepage, +- "Source Code": "https://github.com/nltk/nltk", +- "Issue Tracker": "https://github.com/nltk/nltk/issues", +- }, +- long_description="""\ +-The Natural Language Toolkit (NLTK) is a Python package for +-natural language processing. 
NLTK requires Python 3.7, 3.8, 3.9, 3.10 or 3.11.""", +- license="Apache License, Version 2.0", +- keywords=[ +- "NLP", +- "CL", +- "natural language processing", +- "computational linguistics", +- "parsing", +- "tagging", +- "tokenizing", +- "syntax", +- "linguistics", +- "language", +- "natural language", +- "text analytics", +- ], +- maintainer="NLTK Team", +- maintainer_email="nltk.team@gmail.com", +- author="NLTK Team", +- author_email="nltk.team@gmail.com", +- classifiers=[ +- "Development Status :: 5 - Production/Stable", +- "Intended Audience :: Developers", +- "Intended Audience :: Education", +- "Intended Audience :: Information Technology", +- "Intended Audience :: Science/Research", +- "License :: OSI Approved :: Apache Software License", +- "Operating System :: OS Independent", +- "Programming Language :: Python :: 3.7", +- "Programming Language :: Python :: 3.8", +- "Programming Language :: Python :: 3.9", +- "Programming Language :: Python :: 3.10", +- "Programming Language :: Python :: 3.11", +- "Topic :: Scientific/Engineering", +- "Topic :: Scientific/Engineering :: Artificial Intelligence", +- "Topic :: Scientific/Engineering :: Human Machine Interfaces", +- "Topic :: Scientific/Engineering :: Information Analysis", +- "Topic :: Text Processing", +- "Topic :: Text Processing :: Filters", +- "Topic :: Text Processing :: General", +- "Topic :: Text Processing :: Indexing", +- "Topic :: Text Processing :: Linguistic", +- ], +- package_data={"nltk": ["test/*.doctest", "VERSION"]}, +- python_requires=">=3.7", +- install_requires=[ +- "click", +- "joblib", +- "regex>=2021.8.3", +- "tqdm", +- ], +- extras_require=extras_require, +- packages=find_packages(), +- zip_safe=False, # since normal files will be present too? +- entry_points=console_scripts, +-) ++#!/usr/bin/env python ++# ++# Setup script for the Natural Language Toolkit ++# ++# Copyright (C) 2001-2023 NLTK Project ++# Author: NLTK Team ++# URL: ++# For license information, see LICENSE.TXT ++ ++# Work around mbcs bug in distutils. ++# https://bugs.python.org/issue10945 ++import codecs ++ ++try: ++ codecs.lookup("mbcs") ++except LookupError: ++ ascii = codecs.lookup("ascii") ++ func = lambda name, enc=ascii: {True: enc}.get(name == "mbcs") ++ codecs.register(func) ++ ++import os ++ ++# Use the VERSION file to get NLTK version ++version_file = os.path.join(os.path.dirname(__file__), "nltk", "VERSION") ++with open(version_file) as fh: ++ nltk_version = fh.read().strip() ++ ++# setuptools ++from setuptools import find_packages, setup ++ ++# Specify groups of optional dependencies ++extras_require = { ++ "machine_learning": [ ++ "numpy", ++ "python-crfsuite", ++ "scikit-learn", ++ "scipy", ++ ], ++ "plot": ["matplotlib"], ++ "tgrep": ["pyparsing"], ++ "twitter": ["twython"], ++ "corenlp": ["requests"], ++} ++ ++# Add a group made up of all optional dependencies ++extras_require["all"] = { ++ package for group in extras_require.values() for package in group ++} ++ ++# Adds CLI commands ++console_scripts = """ ++[console_scripts] ++nltk=nltk.cli:cli ++""" ++ ++_project_homepage = "https://www.nltk.org/" ++ ++setup( ++ name="nltk", ++ description="Natural Language Toolkit", ++ version=nltk_version, ++ url=_project_homepage, ++ project_urls={ ++ "Documentation": _project_homepage, ++ "Source Code": "https://github.com/nltk/nltk", ++ "Issue Tracker": "https://github.com/nltk/nltk/issues", ++ }, ++ long_description="""\ ++The Natural Language Toolkit (NLTK) is a Python package for ++natural language processing. 
NLTK requires Python 3.7, 3.8, 3.9, 3.10, 3.11 or 3.12.""", ++ license="Apache License, Version 2.0", ++ keywords=[ ++ "NLP", ++ "CL", ++ "natural language processing", ++ "computational linguistics", ++ "parsing", ++ "tagging", ++ "tokenizing", ++ "syntax", ++ "linguistics", ++ "language", ++ "natural language", ++ "text analytics", ++ ], ++ maintainer="NLTK Team", ++ maintainer_email="nltk.team@gmail.com", ++ author="NLTK Team", ++ author_email="nltk.team@gmail.com", ++ classifiers=[ ++ "Development Status :: 5 - Production/Stable", ++ "Intended Audience :: Developers", ++ "Intended Audience :: Education", ++ "Intended Audience :: Information Technology", ++ "Intended Audience :: Science/Research", ++ "License :: OSI Approved :: Apache Software License", ++ "Operating System :: OS Independent", ++ "Programming Language :: Python :: 3.7", ++ "Programming Language :: Python :: 3.8", ++ "Programming Language :: Python :: 3.9", ++ "Programming Language :: Python :: 3.10", ++ "Programming Language :: Python :: 3.11", ++ "Programming Language :: Python :: 3.12", ++ "Topic :: Scientific/Engineering", ++ "Topic :: Scientific/Engineering :: Artificial Intelligence", ++ "Topic :: Scientific/Engineering :: Human Machine Interfaces", ++ "Topic :: Scientific/Engineering :: Information Analysis", ++ "Topic :: Text Processing", ++ "Topic :: Text Processing :: Filters", ++ "Topic :: Text Processing :: General", ++ "Topic :: Text Processing :: Indexing", ++ "Topic :: Text Processing :: Linguistic", ++ ], ++ package_data={"nltk": ["test/*.doctest", "VERSION"]}, ++ python_requires=">=3.7", ++ install_requires=[ ++ "click", ++ "joblib", ++ "regex>=2021.8.3", ++ "tqdm", ++ ], ++ extras_require=extras_require, ++ packages=find_packages(), ++ zip_safe=False, # since normal files will be present too? ++ entry_points=console_scripts, ++) diff --git a/nltk_data.tar.xz b/nltk_data.tar.xz index b6fd225..97a033a 100644 --- a/nltk_data.tar.xz +++ b/nltk_data.tar.xz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f41383a4774bf7227f4563f46543460ba07a6921f7bcc6185519e87ea9e4323f -size 453871052 +oid sha256:f79462ac99f414b4850943720bed4a59c1bb15bfc8f1ce16b26165da6db07680 +size 393271816 diff --git a/port-2to3.patch b/port-2to3.patch deleted file mode 100644 index 7a5697d..0000000 --- a/port-2to3.patch +++ /dev/null @@ -1,48 +0,0 @@ ---- - nltk_data/corpora/pl196x/splitter.py | 4 ++-- - nltk_data/taggers/universal_tagset/universal_tags.py | 5 ----- - tools/find_deprecated.py | 2 +- - 3 files changed, 3 insertions(+), 8 deletions(-) - ---- a/nltk_data/corpora/pl196x/splitter.py -+++ b/nltk_data/corpora/pl196x/splitter.py -@@ -1,4 +1,4 @@ --#!/usr/bin/python -+#!/usr/bin/python3 - - import sys, re - -@@ -7,7 +7,7 @@ TEXTID = re.compile(r'') - - if __name__ == '__main__': - if len(sys.argv) != 2: -- print 'One argument required: a pl196x corpus to split.' 
-+ print('One argument required: a pl196x corpus to split.') - sys.exit() - - inputFileName = sys.argv[1] ---- a/nltk_data/taggers/universal_tagset/universal_tags.py -+++ b/nltk_data/taggers/universal_tagset/universal_tags.py -@@ -22,11 +22,6 @@ X - other: foreign words, typos, abbrevi - @author: Nathan Schneider (nschneid) - @since: 2011-05-06 - ''' -- --# Strive towards Python 3 compatibility --from __future__ import print_function, unicode_literals, division --from future_builtins import map, filter -- - import re, glob - from collections import defaultdict - ---- a/tools/find_deprecated.py -+++ b/tools/find_deprecated.py -@@ -29,7 +29,7 @@ import textwrap - import tokenize - from doctest import DocTestParser, register_optionflag - --from cStringIO import StringIO -+from io import StringIO - - import nltk.corpus - from nltk import defaultdict diff --git a/python-nltk.changes b/python-nltk.changes index ecc7a8c..e4d945d 100644 --- a/python-nltk.changes +++ b/python-nltk.changes @@ -1,3 +1,14 @@ +------------------------------------------------------------------- +Thu Mar 21 17:41:52 UTC 2024 - Ben Greiner + +- Update to 3.8.1 + * Resolve RCE & XSS vulnerabilities in localhost WordNet Browser + * Add Python 3.11 support +- Update nltk_data archive +- Drop port-2to3.patch +- Add nltk-pr3207-py312.patch for Python 3.12 support + * gh#nltk/nltk#3207 + ------------------------------------------------------------------- Tue Mar 28 08:36:04 UTC 2023 - pgajdos@suse.com diff --git a/python-nltk.rpmlintrc b/python-nltk.rpmlintrc index 9816082..1c89015 100644 --- a/python-nltk.rpmlintrc +++ b/python-nltk.rpmlintrc @@ -1 +1,2 @@ -addFilter("E: zero-length /usr/lib/python3\.\d/site-packages/nltk/tbl/api\.py") +addFilter("E: zero-length /usr/lib/python3\.\d+/site-packages/nltk/tbl/api\.py") +addFilter("explicit-lib-dependency python3\d*-joblib") diff --git a/python-nltk.spec b/python-nltk.spec index fd7b0d9..c5ecdeb 100644 --- a/python-nltk.spec +++ b/python-nltk.spec @@ -1,7 +1,7 @@ # # spec file for package python-nltk # -# Copyright (c) 2023 SUSE LLC +# Copyright (c) 2024 SUSE LLC # # All modifications and additions to the file contributed by third parties # remain the property of their copyright owners, unless otherwise agreed @@ -16,55 +16,86 @@ # -%{!?python_module:%define python_module() python-%{**} python3-%{**}} -%define pyname nltk -%define skip_python2 1 Name: python-nltk -Version: 3.8 +Version: 3.8.1 Release: 0 Summary: Natural Language Toolkit License: Apache-2.0 URL: http://nltk.org/ -Source0: https://files.pythonhosted.org/packages/source/n/nltk/%{pyname}-%{version}.zip -# Downloaded NLTK data via python3 -m nltk.downloader, -# then unzip downloaded zip archive. +# SourceRepository: https://github.com/nltk/nltk +Source0: https://files.pythonhosted.org/packages/source/n/nltk/nltk-%{version}.zip +# Download/Update NLTK data: +# quilt setup python-nltk.spec +# pushd nltk-?.?.? 
+# python3 -m nltk.downloader -d nltk_data tests \ +# averaged_perceptron_tagger_ru \ +# brown \ +# cess_cat \ +# cess_esp \ +# conll2007 \ +# floresta \ +# gutenberg \ +# inaugural \ +# indian \ +# large_grammars \ +# nombank.1.0 \ +# omw-1.4 \ +# pl196x \ +# ptb \ +# punkt \ +# rte \ +# sinica_treebank \ +# stopwords \ +# treebank \ +# udhr \ +# universal_tagset \ +# wordnet \ +# wordnet_ic \ +# words +# tar -cJf ../nltk_data.tar.xz nltk_data +# popd # see https://www.nltk.org/data.html for more details Source1: nltk_data.tar.xz Source99: python-nltk.rpmlintrc # PATCH-FIX-UPSTREAM skip-networked-test.patch gh#nltk/nltk#2969 mcepl@suse.com # skip tests requiring network connection Patch0: skip-networked-test.patch -# PATCH-FIX-UPSTREAM port-2to3.patch bsc#[0-9]+ mcepl@suse.com -# port scripts in nltk_data to Python 3 -Patch1: port-2to3.patch -BuildRequires: %{python_module regex} +# PATCH-FIX-UPSTREAM nltk-pr3207-py312.patch gh#nltk/nltk#3207 +Patch1: nltk-pr3207-py312.patch +BuildRequires: %{python_module base >= 3.7} +BuildRequires: %{python_module pip} BuildRequires: %{python_module setuptools} +BuildRequires: %{python_module wheel} BuildRequires: %{pythons} BuildRequires: fdupes BuildRequires: python-rpm-macros BuildRequires: unzip -# For testing -BuildRequires: %{python_module tk} +# SECTION runtime +BuildRequires: %{python_module regex >= 2021.8.3} BuildRequires: %{python_module click} -BuildRequires: %{python_module pytest} -# BuildRequires: %%{python_module gensim} BuildRequires: %{python_module joblib} +BuildRequires: %{python_module tqdm} +# /SECTION +# SECTION test +BuildRequires: %{python_module tk} BuildRequires: %{python_module Jinja2} BuildRequires: %{python_module matplotlib} BuildRequires: %{python_module numpy} BuildRequires: %{python_module pyparsing} BuildRequires: %{python_module pytest-cov} BuildRequires: %{python_module pytest-mock} +BuildRequires: %{python_module pytest} BuildRequires: %{python_module python-crfsuite} -BuildRequires: %{python_module regex} BuildRequires: %{python_module requests} BuildRequires: %{python_module scikit-learn} BuildRequires: %{python_module scipy} BuildRequires: %{python_module text-unidecode} -BuildRequires: %{python_module tqdm} BuildRequires: %{python_module twython} -# -Requires: python-regex +# /SECTION +Requires: python-regex >= 2021.8.3 +Requires: python-click +Requires: python-joblib +Requires: python-tqdm Recommends: python-gensim Recommends: python-matplotlib Recommends: python-numpy @@ -75,7 +106,7 @@ Recommends: python-scikit-learn Recommends: python-scipy Recommends: python-twython Requires(post): update-alternatives -Requires(postun):update-alternatives +Requires(postun): update-alternatives BuildArch: noarch %python_subpackages @@ -87,10 +118,7 @@ Python modules, data sets and tutorials supporting research and development in Natural Language Processing. 
%prep -%autosetup -p1 -a1 -n %{pyname}-%{version} - -# Remove obsolete scripts -rm tools/nltk_term_index.py tools/run_doctests.py nltk_data/corpora/semcor/semcor.py +%autosetup -p1 -a1 -n nltk-%{version} # Fix EOL sed -i 's/\r/\n/g; s/\n$//' \ @@ -120,14 +148,13 @@ sed -E -i "s|#![[:space:]]*%{_bindir}/env python|#!%{_bindir}/python3|" \ setup.py \ tools/global_replace.py \ nltk_data/corpora/pl196x/splitter.py \ - tools/find_deprecated.py \ - tools/svnmime.py + tools/find_deprecated.py %build -%python_build +%pyproject_wheel %install -%python_install +%pyproject_install %python_clone -a %{buildroot}%{_bindir}/nltk %{python_expand %fdupes %{buildroot}%{$python_sitelib}/ @@ -148,8 +175,8 @@ export NLTK_DATA=$(readlink -f ./nltk_data/) %files %{python_files} %doc README.md %license LICENSE.txt -%{python_sitelib}/%{pyname}/ -%{python_sitelib}/%{pyname}-%{version}-py%{python_version}.egg-info/ +%{python_sitelib}/nltk/ +%{python_sitelib}/nltk-%{version}.dist-info/ %python_alternative %{_bindir}/nltk %changelog
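---
Reviewer's note (not part of the applied sources): a minimal post-build
smoke test for the patched BLEU module, sketched against the doctest
values that nltk-pr3207-py312.patch itself adds to
nltk/translate/bleu_score.py. The file name smoke_bleu.py is arbitrary;
run it with any python3 flavor the spec builds for, including 3.12,
after installing the built package.

    # smoke_bleu.py -- check that the patched bleu_score module imports and
    # reproduces the doctest scores quoted in nltk-pr3207-py312.patch.
    from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu

    hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', 'ensures',
                   'that', 'the', 'military', 'always', 'obeys', 'the',
                   'commands', 'of', 'the', 'party']
    reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', 'ensures',
                  'that', 'the', 'military', 'will', 'forever', 'heed',
                  'Party', 'commands']

    chencherry = SmoothingFunction()
    # Default BLEU-4 (0.4118...) and method2 smoothing (0.4452...), as in
    # the doctest added by the patch.
    assert abs(sentence_bleu([reference1], hypothesis1) - 0.4118) < 1e-4
    assert abs(sentence_bleu([reference1], hypothesis1,
                             smoothing_function=chencherry.method2) - 0.4452) < 1e-4
    print("BLEU smoke test OK")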