From 25d35fc4283dedd2053ec6d821f4b707fff8d72c Mon Sep 17 00:00:00 2001 From: Konstantin Chernyshev Date: Thu, 16 Nov 2023 19:00:15 +0100 Subject: [PATCH 1/8] ci: enable 3.12 in ci tests --- .github/workflows/ci.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) Index: nltk-3.8.1/nltk/test/unit/translate/test_bleu.py =================================================================== --- nltk-3.8.1.orig/nltk/test/unit/translate/test_bleu.py +++ nltk-3.8.1/nltk/test/unit/translate/test_bleu.py @@ -2,7 +2,6 @@ Tests for BLEU translation evaluation metric """ -import io import unittest from nltk.data import find Index: nltk-3.8.1/nltk/translate/bleu_score.py =================================================================== --- nltk-3.8.1.orig/nltk/translate/bleu_score.py +++ nltk-3.8.1/nltk/translate/bleu_score.py @@ -1,685 +1,710 @@ -# Natural Language Toolkit: BLEU Score -# -# Copyright (C) 2001-2023 NLTK Project -# Authors: Chin Yee Lee, Hengfeng Li, Ruxin Hou, Calvin Tanujaya Lim -# Contributors: Björn Mattsson, Dmitrijs Milajevs, Liling Tan -# URL: -# For license information, see LICENSE.TXT - -"""BLEU score implementation.""" - -import math -import sys -import warnings -from collections import Counter -from fractions import Fraction - -from nltk.util import ngrams - - -def sentence_bleu( - references, - hypothesis, - weights=(0.25, 0.25, 0.25, 0.25), - smoothing_function=None, - auto_reweigh=False, -): - """ - Calculate BLEU score (Bilingual Evaluation Understudy) from - Papineni, Kishore, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002. - "BLEU: a method for automatic evaluation of machine translation." - In Proceedings of ACL. https://www.aclweb.org/anthology/P02-1040.pdf - - >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', - ... 'ensures', 'that', 'the', 'military', 'always', - ... 'obeys', 'the', 'commands', 'of', 'the', 'party'] - - >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops', - ... 'forever', 'hearing', 'the', 'activity', 'guidebook', - ... 'that', 'party', 'direct'] - - >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', - ... 'ensures', 'that', 'the', 'military', 'will', 'forever', - ... 'heed', 'Party', 'commands'] - - >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which', - ... 'guarantees', 'the', 'military', 'forces', 'always', - ... 'being', 'under', 'the', 'command', 'of', 'the', - ... 'Party'] - - >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', - ... 'army', 'always', 'to', 'heed', 'the', 'directions', - ... 'of', 'the', 'party'] - - >>> sentence_bleu([reference1, reference2, reference3], hypothesis1) # doctest: +ELLIPSIS - 0.5045... - - If there is no ngrams overlap for any order of n-grams, BLEU returns the - value 0. This is because the precision for the order of n-grams without - overlap is 0, and the geometric mean in the final BLEU score computation - multiplies the 0 with the precision of other n-grams. This results in 0 - (independently of the precision of the other n-gram orders). The following - example has zero 3-gram and 4-gram overlaps: - - >>> round(sentence_bleu([reference1, reference2, reference3], hypothesis2),4) # doctest: +ELLIPSIS - 0.0 - - To avoid this harsh behaviour when no ngram overlaps are found a smoothing - function can be used. - - >>> chencherry = SmoothingFunction() - >>> sentence_bleu([reference1, reference2, reference3], hypothesis2, - ... smoothing_function=chencherry.method1) # doctest: +ELLIPSIS - 0.0370... 
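The zero-score behaviour described above is a direct consequence of the weighted geometric mean that both sentence_bleu and corpus_bleu use. A minimal standalone sketch of that combination step (the formula BLEU = BP * exp(sum(w_n * log(p_n))); not NLTK's internal code, and the precision values are illustrative):

import math

def combine(bp, precisions, weights=(0.25,) * 4):
    # A single zero precision makes log() undefined, hence unsmoothed BLEU = 0.
    if any(p == 0 for p in precisions):
        return 0.0
    return bp * math.exp(math.fsum(w * math.log(p) for w, p in zip(weights, precisions)))

print(combine(1.0, [0.75, 0.5, 0.25, 0.125]))  # ~0.3290, the weighted geometric mean
print(combine(1.0, [0.75, 0.5, 0.0, 0.0]))     # 0.0 once any order has no overlap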
- - The default BLEU calculates a score for up to 4-grams using uniform - weights (this is called BLEU-4). To evaluate your translations with - higher/lower order ngrams, use customized weights. E.g. when accounting - for up to 5-grams with uniform weights (this is called BLEU-5) use: - - >>> weights = (1./5., 1./5., 1./5., 1./5., 1./5.) - >>> sentence_bleu([reference1, reference2, reference3], hypothesis1, weights) # doctest: +ELLIPSIS - 0.3920... - - Multiple BLEU scores can be computed at once, by supplying a list of weights. - E.g. for computing BLEU-2, BLEU-3 *and* BLEU-4 in one computation, use: - >>> weights = [ - ... (1./2., 1./2.), - ... (1./3., 1./3., 1./3.), - ... (1./4., 1./4., 1./4., 1./4.) - ... ] - >>> sentence_bleu([reference1, reference2, reference3], hypothesis1, weights) # doctest: +ELLIPSIS - [0.7453..., 0.6240..., 0.5045...] - - :param references: reference sentences - :type references: list(list(str)) - :param hypothesis: a hypothesis sentence - :type hypothesis: list(str) - :param weights: weights for unigrams, bigrams, trigrams and so on (one or a list of weights) - :type weights: tuple(float) / list(tuple(float)) - :param smoothing_function: - :type smoothing_function: SmoothingFunction - :param auto_reweigh: Option to re-normalize the weights uniformly. - :type auto_reweigh: bool - :return: The sentence-level BLEU score. Returns a list if multiple weights were supplied. - :rtype: float / list(float) - """ - return corpus_bleu( - [references], [hypothesis], weights, smoothing_function, auto_reweigh - ) - - -def corpus_bleu( - list_of_references, - hypotheses, - weights=(0.25, 0.25, 0.25, 0.25), - smoothing_function=None, - auto_reweigh=False, -): - """ - Calculate a single corpus-level BLEU score (aka. system-level BLEU) for all - the hypotheses and their respective references. - - Instead of averaging the sentence level BLEU scores (i.e. macro-average - precision), the original BLEU metric (Papineni et al. 2002) accounts for - the micro-average precision (i.e. summing the numerators and denominators - for each hypothesis-reference(s) pairs before the division). - - >>> hyp1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', - ... 'ensures', 'that', 'the', 'military', 'always', - ... 'obeys', 'the', 'commands', 'of', 'the', 'party'] - >>> ref1a = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', - ... 'ensures', 'that', 'the', 'military', 'will', 'forever', - ... 'heed', 'Party', 'commands'] - >>> ref1b = ['It', 'is', 'the', 'guiding', 'principle', 'which', - ... 'guarantees', 'the', 'military', 'forces', 'always', - ... 'being', 'under', 'the', 'command', 'of', 'the', 'Party'] - >>> ref1c = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', - ... 'army', 'always', 'to', 'heed', 'the', 'directions', - ... 'of', 'the', 'party'] - - >>> hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was', - ... 'interested', 'in', 'world', 'history'] - >>> ref2a = ['he', 'was', 'interested', 'in', 'world', 'history', - ... 'because', 'he', 'read', 'the', 'book'] - - >>> list_of_references = [[ref1a, ref1b, ref1c], [ref2a]] - >>> hypotheses = [hyp1, hyp2] - >>> corpus_bleu(list_of_references, hypotheses) # doctest: +ELLIPSIS - 0.5920... - - The example below show that corpus_bleu() is different from averaging - sentence_bleu() for hypotheses - - >>> score1 = sentence_bleu([ref1a, ref1b, ref1c], hyp1) - >>> score2 = sentence_bleu([ref2a], hyp2) - >>> (score1 + score2) / 2 # doctest: +ELLIPSIS - 0.6223... 
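Pooling numerators and denominators (micro-average) and averaging per-sentence ratios (macro-average) only coincide when every sentence contributes equal counts; a two-sentence arithmetic sketch with illustrative counts, not the doctest's numbers:

# corpus_bleu style: sum matches and totals first, then divide once.
micro = (5 + 9) / (10 + 90)        # 14/100 = 0.14
# averaged sentence_bleu style: divide per sentence, then average.
macro = (5 / 10 + 9 / 90) / 2      # (0.5 + 0.1) / 2 = 0.3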
- - Custom weights may be supplied to fine-tune the BLEU score further. - A tuple of float weights for unigrams, bigrams, trigrams and so on can be given. - >>> weights = (0.1, 0.3, 0.5, 0.1) - >>> corpus_bleu(list_of_references, hypotheses, weights=weights) # doctest: +ELLIPSIS - 0.5818... - - This particular weight gave extra value to trigrams. - Furthermore, multiple weights can be given, resulting in multiple BLEU scores. - >>> weights = [ - ... (0.5, 0.5), - ... (0.333, 0.333, 0.334), - ... (0.25, 0.25, 0.25, 0.25), - ... (0.2, 0.2, 0.2, 0.2, 0.2) - ... ] - >>> corpus_bleu(list_of_references, hypotheses, weights=weights) # doctest: +ELLIPSIS - [0.8242..., 0.7067..., 0.5920..., 0.4719...] - - :param list_of_references: a corpus of lists of reference sentences, w.r.t. hypotheses - :type list_of_references: list(list(list(str))) - :param hypotheses: a list of hypothesis sentences - :type hypotheses: list(list(str)) - :param weights: weights for unigrams, bigrams, trigrams and so on (one or a list of weights) - :type weights: tuple(float) / list(tuple(float)) - :param smoothing_function: - :type smoothing_function: SmoothingFunction - :param auto_reweigh: Option to re-normalize the weights uniformly. - :type auto_reweigh: bool - :return: The corpus-level BLEU score. - :rtype: float - """ - # Before proceeding to compute BLEU, perform sanity checks. - - p_numerators = Counter() # Key = ngram order, and value = no. of ngram matches. - p_denominators = Counter() # Key = ngram order, and value = no. of ngram in ref. - hyp_lengths, ref_lengths = 0, 0 - - assert len(list_of_references) == len(hypotheses), ( - "The number of hypotheses and their reference(s) should be the " "same " - ) - - try: - weights[0][0] - except TypeError: - weights = [weights] - max_weight_length = max(len(weight) for weight in weights) - - # Iterate through each hypothesis and their corresponding references. - for references, hypothesis in zip(list_of_references, hypotheses): - # For each order of ngram, calculate the numerator and - # denominator for the corpus-level modified precision. - for i in range(1, max_weight_length + 1): - p_i = modified_precision(references, hypothesis, i) - p_numerators[i] += p_i.numerator - p_denominators[i] += p_i.denominator - - # Calculate the hypothesis length and the closest reference length. - # Adds them to the corpus-level hypothesis and reference counts. - hyp_len = len(hypothesis) - hyp_lengths += hyp_len - ref_lengths += closest_ref_length(references, hyp_len) - - # Calculate corpus-level brevity penalty. - bp = brevity_penalty(ref_lengths, hyp_lengths) - - # Collects the various precision values for the different ngram orders. - p_n = [ - Fraction(p_numerators[i], p_denominators[i], _normalize=False) - for i in range(1, max_weight_length + 1) - ] - - # Returns 0 if there's no matching n-grams - # We only need to check for p_numerators[1] == 0, since if there's - # no unigrams, there won't be any higher order ngrams. - if p_numerators[1] == 0: - return 0 if len(weights) == 1 else [0] * len(weights) - - # If there's no smoothing, set use method0 from SmoothinFunction class. - if not smoothing_function: - smoothing_function = SmoothingFunction().method0 - # Smoothen the modified precision. - # Note: smoothing_function() may convert values into floats; - # it tries to retain the Fraction object as much as the - # smoothing method allows. 
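The weights[0][0] probe in the function body above is how corpus_bleu decides whether it was handed a single weight tuple or a list of them; the same trick in isolation:

weights = (0.25, 0.25, 0.25, 0.25)
try:
    weights[0][0]            # indexing a float raises TypeError...
except TypeError:
    weights = [weights]      # ...so a lone tuple is wrapped into a list
max_weight_length = max(len(w) for w in weights)  # highest n-gram order needed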
- p_n = smoothing_function( - p_n, references=references, hypothesis=hypothesis, hyp_len=hyp_lengths - ) - - bleu_scores = [] - for weight in weights: - # Uniformly re-weighting based on maximum hypothesis lengths if largest - # order of n-grams < 4 and weights is set at default. - if auto_reweigh: - if hyp_lengths < 4 and weight == (0.25, 0.25, 0.25, 0.25): - weight = (1 / hyp_lengths,) * hyp_lengths - - s = (w_i * math.log(p_i) for w_i, p_i in zip(weight, p_n) if p_i > 0) - s = bp * math.exp(math.fsum(s)) - bleu_scores.append(s) - return bleu_scores[0] if len(weights) == 1 else bleu_scores - - -def modified_precision(references, hypothesis, n): - """ - Calculate modified ngram precision. - - The normal precision method may lead to some wrong translations with - high-precision, e.g., the translation, in which a word of reference - repeats several times, has very high precision. - - This function only returns the Fraction object that contains the numerator - and denominator necessary to calculate the corpus-level precision. - To calculate the modified precision for a single pair of hypothesis and - references, cast the Fraction object into a float. - - The famous "the the the ... " example shows that you can get BLEU precision - by duplicating high frequency words. - - >>> reference1 = 'the cat is on the mat'.split() - >>> reference2 = 'there is a cat on the mat'.split() - >>> hypothesis1 = 'the the the the the the the'.split() - >>> references = [reference1, reference2] - >>> float(modified_precision(references, hypothesis1, n=1)) # doctest: +ELLIPSIS - 0.2857... - - In the modified n-gram precision, a reference word will be considered - exhausted after a matching hypothesis word is identified, e.g. - - >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', - ... 'ensures', 'that', 'the', 'military', 'will', - ... 'forever', 'heed', 'Party', 'commands'] - >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which', - ... 'guarantees', 'the', 'military', 'forces', 'always', - ... 'being', 'under', 'the', 'command', 'of', 'the', - ... 'Party'] - >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', - ... 'army', 'always', 'to', 'heed', 'the', 'directions', - ... 'of', 'the', 'party'] - >>> hypothesis = 'of the'.split() - >>> references = [reference1, reference2, reference3] - >>> float(modified_precision(references, hypothesis, n=1)) - 1.0 - >>> float(modified_precision(references, hypothesis, n=2)) - 1.0 - - An example of a normal machine translation hypothesis: - - >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', - ... 'ensures', 'that', 'the', 'military', 'always', - ... 'obeys', 'the', 'commands', 'of', 'the', 'party'] - - >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops', - ... 'forever', 'hearing', 'the', 'activity', 'guidebook', - ... 'that', 'party', 'direct'] - - >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', - ... 'ensures', 'that', 'the', 'military', 'will', - ... 'forever', 'heed', 'Party', 'commands'] - - >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which', - ... 'guarantees', 'the', 'military', 'forces', 'always', - ... 'being', 'under', 'the', 'command', 'of', 'the', - ... 'Party'] - - >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', - ... 'army', 'always', 'to', 'heed', 'the', 'directions', - ... 
'of', 'the', 'party'] - >>> references = [reference1, reference2, reference3] - >>> float(modified_precision(references, hypothesis1, n=1)) # doctest: +ELLIPSIS - 0.9444... - >>> float(modified_precision(references, hypothesis2, n=1)) # doctest: +ELLIPSIS - 0.5714... - >>> float(modified_precision(references, hypothesis1, n=2)) # doctest: +ELLIPSIS - 0.5882352941176471 - >>> float(modified_precision(references, hypothesis2, n=2)) # doctest: +ELLIPSIS - 0.07692... - - - :param references: A list of reference translations. - :type references: list(list(str)) - :param hypothesis: A hypothesis translation. - :type hypothesis: list(str) - :param n: The ngram order. - :type n: int - :return: BLEU's modified precision for the nth order ngram. - :rtype: Fraction - """ - # Extracts all ngrams in hypothesis - # Set an empty Counter if hypothesis is empty. - counts = Counter(ngrams(hypothesis, n)) if len(hypothesis) >= n else Counter() - # Extract a union of references' counts. - # max_counts = reduce(or_, [Counter(ngrams(ref, n)) for ref in references]) - max_counts = {} - for reference in references: - reference_counts = ( - Counter(ngrams(reference, n)) if len(reference) >= n else Counter() - ) - for ngram in counts: - max_counts[ngram] = max(max_counts.get(ngram, 0), reference_counts[ngram]) - - # Assigns the intersection between hypothesis and references' counts. - clipped_counts = { - ngram: min(count, max_counts[ngram]) for ngram, count in counts.items() - } - - numerator = sum(clipped_counts.values()) - # Ensures that denominator is minimum 1 to avoid ZeroDivisionError. - # Usually this happens when the ngram order is > len(reference). - denominator = max(1, sum(counts.values())) - - return Fraction(numerator, denominator, _normalize=False) - - -def closest_ref_length(references, hyp_len): - """ - This function finds the reference that is the closest length to the - hypothesis. The closest reference length is referred to as *r* variable - from the brevity penalty formula in Papineni et. al. (2002) - - :param references: A list of reference translations. - :type references: list(list(str)) - :param hyp_len: The length of the hypothesis. - :type hyp_len: int - :return: The length of the reference that's closest to the hypothesis. - :rtype: int - """ - ref_lens = (len(reference) for reference in references) - closest_ref_len = min( - ref_lens, key=lambda ref_len: (abs(ref_len - hyp_len), ref_len) - ) - return closest_ref_len - - -def brevity_penalty(closest_ref_len, hyp_len): - """ - Calculate brevity penalty. - - As the modified n-gram precision still has the problem from the short - length sentence, brevity penalty is used to modify the overall BLEU - score according to length. - - An example from the paper. There are three references with length 12, 15 - and 17. And a concise hypothesis of the length 12. The brevity penalty is 1. - - >>> reference1 = list('aaaaaaaaaaaa') # i.e. ['a'] * 12 - >>> reference2 = list('aaaaaaaaaaaaaaa') # i.e. ['a'] * 15 - >>> reference3 = list('aaaaaaaaaaaaaaaaa') # i.e. ['a'] * 17 - >>> hypothesis = list('aaaaaaaaaaaa') # i.e. ['a'] * 12 - >>> references = [reference1, reference2, reference3] - >>> hyp_len = len(hypothesis) - >>> closest_ref_len = closest_ref_length(references, hyp_len) - >>> brevity_penalty(closest_ref_len, hyp_len) - 1.0 - - In case a hypothesis translation is shorter than the references, penalty is - applied. 
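brevity_penalty implements BP = 1 if c > r else exp(1 - r/c), with c the hypothesis length and r the closest reference length chosen by closest_ref_length above; a quick standalone check of the same formula against the 28-vs-12 example below:

import math

def bp(r, c):
    if c > r:
        return 1.0
    return 0.0 if c == 0 else math.exp(1 - r / c)

print(bp(28, 12))  # 0.2635971381157267, as in the doctest below
print(bp(12, 12))  # 1.0 -- equal lengths incur no penalty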
- - >>> references = [['a'] * 28, ['a'] * 28] - >>> hypothesis = ['a'] * 12 - >>> hyp_len = len(hypothesis) - >>> closest_ref_len = closest_ref_length(references, hyp_len) - >>> brevity_penalty(closest_ref_len, hyp_len) - 0.2635971381157267 - - The length of the closest reference is used to compute the penalty. If the - length of a hypothesis is 12, and the reference lengths are 13 and 2, the - penalty is applied because the hypothesis length (12) is less then the - closest reference length (13). - - >>> references = [['a'] * 13, ['a'] * 2] - >>> hypothesis = ['a'] * 12 - >>> hyp_len = len(hypothesis) - >>> closest_ref_len = closest_ref_length(references, hyp_len) - >>> brevity_penalty(closest_ref_len, hyp_len) # doctest: +ELLIPSIS - 0.9200... - - The brevity penalty doesn't depend on reference order. More importantly, - when two reference sentences are at the same distance, the shortest - reference sentence length is used. - - >>> references = [['a'] * 13, ['a'] * 11] - >>> hypothesis = ['a'] * 12 - >>> hyp_len = len(hypothesis) - >>> closest_ref_len = closest_ref_length(references, hyp_len) - >>> bp1 = brevity_penalty(closest_ref_len, hyp_len) - >>> hyp_len = len(hypothesis) - >>> closest_ref_len = closest_ref_length(reversed(references), hyp_len) - >>> bp2 = brevity_penalty(closest_ref_len, hyp_len) - >>> bp1 == bp2 == 1 - True - - A test example from mteval-v13a.pl (starting from the line 705): - - >>> references = [['a'] * 11, ['a'] * 8] - >>> hypothesis = ['a'] * 7 - >>> hyp_len = len(hypothesis) - >>> closest_ref_len = closest_ref_length(references, hyp_len) - >>> brevity_penalty(closest_ref_len, hyp_len) # doctest: +ELLIPSIS - 0.8668... - - >>> references = [['a'] * 11, ['a'] * 8, ['a'] * 6, ['a'] * 7] - >>> hypothesis = ['a'] * 7 - >>> hyp_len = len(hypothesis) - >>> closest_ref_len = closest_ref_length(references, hyp_len) - >>> brevity_penalty(closest_ref_len, hyp_len) - 1.0 - - :param hyp_len: The length of the hypothesis for a single sentence OR the - sum of all the hypotheses' lengths for a corpus - :type hyp_len: int - :param closest_ref_len: The length of the closest reference for a single - hypothesis OR the sum of all the closest references for every hypotheses. - :type closest_ref_len: int - :return: BLEU's brevity penalty. - :rtype: float - """ - if hyp_len > closest_ref_len: - return 1 - # If hypothesis is empty, brevity penalty = 0 should result in BLEU = 0.0 - elif hyp_len == 0: - return 0 - else: - return math.exp(1 - closest_ref_len / hyp_len) - - -class SmoothingFunction: - """ - This is an implementation of the smoothing techniques - for segment-level BLEU scores that was presented in - Boxing Chen and Collin Cherry (2014) A Systematic Comparison of - Smoothing Techniques for Sentence-Level BLEU. In WMT14. - http://acl2014.org/acl2014/W14-33/pdf/W14-3346.pdf - """ - - def __init__(self, epsilon=0.1, alpha=5, k=5): - """ - This will initialize the parameters required for the various smoothing - techniques, the default values are set to the numbers used in the - experiments from Chen and Cherry (2014). - - >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', 'ensures', - ... 'that', 'the', 'military', 'always', 'obeys', 'the', - ... 'commands', 'of', 'the', 'party'] - >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', 'ensures', - ... 'that', 'the', 'military', 'will', 'forever', 'heed', - ... 
'Party', 'commands'] - - >>> chencherry = SmoothingFunction() - >>> print(sentence_bleu([reference1], hypothesis1)) # doctest: +ELLIPSIS - 0.4118... - >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method0)) # doctest: +ELLIPSIS - 0.4118... - >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method1)) # doctest: +ELLIPSIS - 0.4118... - >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method2)) # doctest: +ELLIPSIS - 0.4452... - >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method3)) # doctest: +ELLIPSIS - 0.4118... - >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method4)) # doctest: +ELLIPSIS - 0.4118... - >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method5)) # doctest: +ELLIPSIS - 0.4905... - >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method6)) # doctest: +ELLIPSIS - 0.4135... - >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method7)) # doctest: +ELLIPSIS - 0.4905... - - :param epsilon: the epsilon value use in method 1 - :type epsilon: float - :param alpha: the alpha value use in method 6 - :type alpha: int - :param k: the k value use in method 4 - :type k: int - """ - self.epsilon = epsilon - self.alpha = alpha - self.k = k - - def method0(self, p_n, *args, **kwargs): - """ - No smoothing. - """ - p_n_new = [] - for i, p_i in enumerate(p_n): - if p_i.numerator != 0: - p_n_new.append(p_i) - else: - _msg = str( - "\nThe hypothesis contains 0 counts of {}-gram overlaps.\n" - "Therefore the BLEU score evaluates to 0, independently of\n" - "how many N-gram overlaps of lower order it contains.\n" - "Consider using lower n-gram order or use " - "SmoothingFunction()" - ).format(i + 1) - warnings.warn(_msg) - # When numerator==0 where denonminator==0 or !=0, the result - # for the precision score should be equal to 0 or undefined. - # Due to BLEU geometric mean computation in logarithm space, - # we we need to take the return sys.float_info.min such that - # math.log(sys.float_info.min) returns a 0 precision score. - p_n_new.append(sys.float_info.min) - return p_n_new - - def method1(self, p_n, *args, **kwargs): - """ - Smoothing method 1: Add *epsilon* counts to precision with 0 counts. - """ - return [ - (p_i.numerator + self.epsilon) / p_i.denominator - if p_i.numerator == 0 - else p_i - for p_i in p_n - ] - - def method2(self, p_n, *args, **kwargs): - """ - Smoothing method 2: Add 1 to both numerator and denominator from - Chin-Yew Lin and Franz Josef Och (2004) ORANGE: a Method for - Evaluating Automatic Evaluation Metrics for Machine Translation. - In COLING 2004. - """ - return [ - Fraction(p_n[i].numerator + 1, p_n[i].denominator + 1, _normalize=False) - if i != 0 - else p_n[0] - for i in range(len(p_n)) - ] - - def method3(self, p_n, *args, **kwargs): - """ - Smoothing method 3: NIST geometric sequence smoothing - The smoothing is computed by taking 1 / ( 2^k ), instead of 0, for each - precision score whose matching n-gram count is null. 
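Methods 1 and 2 above amount to one-line adjustments of the raw counts; with illustrative counts (not values from the doctests), they behave as follows:

epsilon = 0.1
num3, den3 = 0, 8                    # no trigram matches out of 8
method1 = (num3 + epsilon) / den3    # 0.0125 instead of a hard 0
num2, den2 = 3, 9
method2 = (num2 + 1) / (den2 + 1)    # 0.4; add-one, applied to every order but the first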
- k is 1 for the first 'n' value for which the n-gram match count is null/ - - For example, if the text contains: - - - one 2-gram match - - and (consequently) two 1-gram matches - - the n-gram count for each individual precision score would be: - - - n=1 => prec_count = 2 (two unigrams) - - n=2 => prec_count = 1 (one bigram) - - n=3 => prec_count = 1/2 (no trigram, taking 'smoothed' value of 1 / ( 2^k ), with k=1) - - n=4 => prec_count = 1/4 (no fourgram, taking 'smoothed' value of 1 / ( 2^k ), with k=2) - """ - incvnt = 1 # From the mteval-v13a.pl, it's referred to as k. - for i, p_i in enumerate(p_n): - if p_i.numerator == 0: - p_n[i] = 1 / (2**incvnt * p_i.denominator) - incvnt += 1 - return p_n - - def method4(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs): - """ - Smoothing method 4: - Shorter translations may have inflated precision values due to having - smaller denominators; therefore, we give them proportionally - smaller smoothed counts. Instead of scaling to 1/(2^k), Chen and Cherry - suggests dividing by 1/ln(len(T)), where T is the length of the translation. - """ - incvnt = 1 - hyp_len = hyp_len if hyp_len else len(hypothesis) - for i, p_i in enumerate(p_n): - if p_i.numerator == 0 and hyp_len > 1: - # incvnt = i + 1 * self.k / math.log( - # hyp_len - # ) # Note that this K is different from the K from NIST. - # p_n[i] = incvnt / p_i.denominator\ - numerator = 1 / (2**incvnt * self.k / math.log(hyp_len)) - p_n[i] = numerator / p_i.denominator - incvnt += 1 - return p_n - - def method5(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs): - """ - Smoothing method 5: - The matched counts for similar values of n should be similar. To a - calculate the n-gram matched count, it averages the n−1, n and n+1 gram - matched counts. - """ - hyp_len = hyp_len if hyp_len else len(hypothesis) - m = {} - # Requires an precision value for an addition ngram order. - p_n_plus1 = p_n + [modified_precision(references, hypothesis, 5)] - m[-1] = p_n[0] + 1 - for i, p_i in enumerate(p_n): - p_n[i] = (m[i - 1] + p_i + p_n_plus1[i + 1]) / 3 - m[i] = p_n[i] - return p_n - - def method6(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs): - """ - Smoothing method 6: - Interpolates the maximum likelihood estimate of the precision *p_n* with - a prior estimate *pi0*. The prior is estimated by assuming that the ratio - between pn and pn−1 will be the same as that between pn−1 and pn−2; from - Gao and He (2013) Training MRF-Based Phrase Translation Models using - Gradient Ascent. In NAACL. - """ - hyp_len = hyp_len if hyp_len else len(hypothesis) - # This smoothing only works when p_1 and p_2 is non-zero. - # Raise an error with an appropriate message when the input is too short - # to use this smoothing technique. - assert p_n[2], "This smoothing method requires non-zero precision for bigrams." - for i, p_i in enumerate(p_n): - if i in [0, 1]: # Skips the first 2 orders of ngrams. - continue - else: - pi0 = 0 if p_n[i - 2] == 0 else p_n[i - 1] ** 2 / p_n[i - 2] - # No. of ngrams in translation that matches the reference. - m = p_i.numerator - # No. of ngrams in translation. - l = sum(1 for _ in ngrams(hypothesis, i + 1)) - # Calculates the interpolated precision. - p_n[i] = (m + self.alpha * pi0) / (l + self.alpha) - return p_n - - def method7(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs): - """ - Smoothing method 7: - Interpolates methods 4 and 5. 
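The rewrite that follows exists because Python 3.12 removed the private _normalize keyword from fractions.Fraction, which this module relied on to keep numerators and denominators un-reduced; the new file wraps Fraction with a small shim instead. A sketch of the behaviour the shim is meant to preserve (assuming the patched module is importable as nltk.translate.bleu_score):

import sys
from fractions import Fraction as StdFraction

try:
    StdFraction(2, 4, _normalize=False)  # accepted on Python <= 3.11 (private API)
except TypeError:
    assert sys.version_info >= (3, 12)   # 3.12 rejects the keyword entirely

from nltk.translate.bleu_score import Fraction  # the subclass defined below
p = Fraction(2, 4, _normalize=False)
assert (p.numerator, p.denominator) == (2, 4)   # raw counts preserved, not reduced to 1/2
assert float(p) == 0.5                          # numeric value is unaffected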
- """ - hyp_len = hyp_len if hyp_len else len(hypothesis) - p_n = self.method4(p_n, references, hypothesis, hyp_len) - p_n = self.method5(p_n, references, hypothesis, hyp_len) - return p_n +# Natural Language Toolkit: BLEU Score +# +# Copyright (C) 2001-2023 NLTK Project +# Authors: Chin Yee Lee, Hengfeng Li, Ruxin Hou, Calvin Tanujaya Lim +# Contributors: Björn Mattsson, Dmitrijs Milajevs, Liling Tan +# URL: +# For license information, see LICENSE.TXT + +"""BLEU score implementation.""" +import math +import sys +import warnings +from collections import Counter +from fractions import Fraction as _Fraction + +from nltk.util import ngrams + + +class Fraction(_Fraction): + """Fraction with _normalize=False support for 3.12""" + + def __new__(cls, numerator=0, denominator=None, _normalize=False): + if sys.version_info >= (3, 12): + self = super().__new__(cls, numerator, denominator) + else: + self = super().__new__(cls, numerator, denominator, _normalize=_normalize) + self._normalize = _normalize + self._original_numerator = numerator + self._original_denominator = denominator + return self + + @property + def numerator(self): + if not self._normalize: + return self._original_numerator + return super().numerator + + @property + def denominator(self): + if not self._normalize: + return self._original_denominator + return super().denominator + + +def sentence_bleu( + references, + hypothesis, + weights=(0.25, 0.25, 0.25, 0.25), + smoothing_function=None, + auto_reweigh=False, +): + """ + Calculate BLEU score (Bilingual Evaluation Understudy) from + Papineni, Kishore, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002. + "BLEU: a method for automatic evaluation of machine translation." + In Proceedings of ACL. https://www.aclweb.org/anthology/P02-1040.pdf + + >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', + ... 'ensures', 'that', 'the', 'military', 'always', + ... 'obeys', 'the', 'commands', 'of', 'the', 'party'] + + >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops', + ... 'forever', 'hearing', 'the', 'activity', 'guidebook', + ... 'that', 'party', 'direct'] + + >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', + ... 'ensures', 'that', 'the', 'military', 'will', 'forever', + ... 'heed', 'Party', 'commands'] + + >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which', + ... 'guarantees', 'the', 'military', 'forces', 'always', + ... 'being', 'under', 'the', 'command', 'of', 'the', + ... 'Party'] + + >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', + ... 'army', 'always', 'to', 'heed', 'the', 'directions', + ... 'of', 'the', 'party'] + + >>> sentence_bleu([reference1, reference2, reference3], hypothesis1) # doctest: +ELLIPSIS + 0.5045... + + If there is no ngrams overlap for any order of n-grams, BLEU returns the + value 0. This is because the precision for the order of n-grams without + overlap is 0, and the geometric mean in the final BLEU score computation + multiplies the 0 with the precision of other n-grams. This results in 0 + (independently of the precision of the other n-gram orders). The following + example has zero 3-gram and 4-gram overlaps: + + >>> round(sentence_bleu([reference1, reference2, reference3], hypothesis2),4) # doctest: +ELLIPSIS + 0.0 + + To avoid this harsh behaviour when no ngram overlaps are found a smoothing + function can be used. + + >>> chencherry = SmoothingFunction() + >>> sentence_bleu([reference1, reference2, reference3], hypothesis2, + ... 
smoothing_function=chencherry.method1) # doctest: +ELLIPSIS + 0.0370... + + The default BLEU calculates a score for up to 4-grams using uniform + weights (this is called BLEU-4). To evaluate your translations with + higher/lower order ngrams, use customized weights. E.g. when accounting + for up to 5-grams with uniform weights (this is called BLEU-5) use: + + >>> weights = (1./5., 1./5., 1./5., 1./5., 1./5.) + >>> sentence_bleu([reference1, reference2, reference3], hypothesis1, weights) # doctest: +ELLIPSIS + 0.3920... + + Multiple BLEU scores can be computed at once, by supplying a list of weights. + E.g. for computing BLEU-2, BLEU-3 *and* BLEU-4 in one computation, use: + >>> weights = [ + ... (1./2., 1./2.), + ... (1./3., 1./3., 1./3.), + ... (1./4., 1./4., 1./4., 1./4.) + ... ] + >>> sentence_bleu([reference1, reference2, reference3], hypothesis1, weights) # doctest: +ELLIPSIS + [0.7453..., 0.6240..., 0.5045...] + + :param references: reference sentences + :type references: list(list(str)) + :param hypothesis: a hypothesis sentence + :type hypothesis: list(str) + :param weights: weights for unigrams, bigrams, trigrams and so on (one or a list of weights) + :type weights: tuple(float) / list(tuple(float)) + :param smoothing_function: + :type smoothing_function: SmoothingFunction + :param auto_reweigh: Option to re-normalize the weights uniformly. + :type auto_reweigh: bool + :return: The sentence-level BLEU score. Returns a list if multiple weights were supplied. + :rtype: float / list(float) + """ + return corpus_bleu( + [references], [hypothesis], weights, smoothing_function, auto_reweigh + ) + + +def corpus_bleu( + list_of_references, + hypotheses, + weights=(0.25, 0.25, 0.25, 0.25), + smoothing_function=None, + auto_reweigh=False, +): + """ + Calculate a single corpus-level BLEU score (aka. system-level BLEU) for all + the hypotheses and their respective references. + + Instead of averaging the sentence level BLEU scores (i.e. macro-average + precision), the original BLEU metric (Papineni et al. 2002) accounts for + the micro-average precision (i.e. summing the numerators and denominators + for each hypothesis-reference(s) pairs before the division). + + >>> hyp1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', + ... 'ensures', 'that', 'the', 'military', 'always', + ... 'obeys', 'the', 'commands', 'of', 'the', 'party'] + >>> ref1a = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', + ... 'ensures', 'that', 'the', 'military', 'will', 'forever', + ... 'heed', 'Party', 'commands'] + >>> ref1b = ['It', 'is', 'the', 'guiding', 'principle', 'which', + ... 'guarantees', 'the', 'military', 'forces', 'always', + ... 'being', 'under', 'the', 'command', 'of', 'the', 'Party'] + >>> ref1c = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', + ... 'army', 'always', 'to', 'heed', 'the', 'directions', + ... 'of', 'the', 'party'] + + >>> hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was', + ... 'interested', 'in', 'world', 'history'] + >>> ref2a = ['he', 'was', 'interested', 'in', 'world', 'history', + ... 'because', 'he', 'read', 'the', 'book'] + + >>> list_of_references = [[ref1a, ref1b, ref1c], [ref2a]] + >>> hypotheses = [hyp1, hyp2] + >>> corpus_bleu(list_of_references, hypotheses) # doctest: +ELLIPSIS + 0.5920... 
+
+    The example below shows that corpus_bleu() is different from averaging
+    sentence_bleu() for hypotheses.
+
+    >>> score1 = sentence_bleu([ref1a, ref1b, ref1c], hyp1)
+    >>> score2 = sentence_bleu([ref2a], hyp2)
+    >>> (score1 + score2) / 2 # doctest: +ELLIPSIS
+    0.6223...
+
+    Custom weights may be supplied to fine-tune the BLEU score further.
+    A tuple of float weights for unigrams, bigrams, trigrams and so on can be given.
+    >>> weights = (0.1, 0.3, 0.5, 0.1)
+    >>> corpus_bleu(list_of_references, hypotheses, weights=weights) # doctest: +ELLIPSIS
+    0.5818...
+
+    This particular weight gave extra value to trigrams.
+    Furthermore, multiple weights can be given, resulting in multiple BLEU scores.
+    >>> weights = [
+    ...     (0.5, 0.5),
+    ...     (0.333, 0.333, 0.334),
+    ...     (0.25, 0.25, 0.25, 0.25),
+    ...     (0.2, 0.2, 0.2, 0.2, 0.2)
+    ... ]
+    >>> corpus_bleu(list_of_references, hypotheses, weights=weights) # doctest: +ELLIPSIS
+    [0.8242..., 0.7067..., 0.5920..., 0.4719...]
+
+    :param list_of_references: a corpus of lists of reference sentences, w.r.t. hypotheses
+    :type list_of_references: list(list(list(str)))
+    :param hypotheses: a list of hypothesis sentences
+    :type hypotheses: list(list(str))
+    :param weights: weights for unigrams, bigrams, trigrams and so on (one or a list of weights)
+    :type weights: tuple(float) / list(tuple(float))
+    :param smoothing_function:
+    :type smoothing_function: SmoothingFunction
+    :param auto_reweigh: Option to re-normalize the weights uniformly.
+    :type auto_reweigh: bool
+    :return: The corpus-level BLEU score.
+    :rtype: float
+    """
+    # Before proceeding to compute BLEU, perform sanity checks.
+
+    p_numerators = Counter()  # Key = ngram order, and value = no. of ngram matches.
+    p_denominators = Counter()  # Key = ngram order, and value = no. of ngram in ref.
+    hyp_lengths, ref_lengths = 0, 0
+
+    assert len(list_of_references) == len(hypotheses), (
+        "The number of hypotheses and their reference(s) should be the same"
+    )
+
+    try:
+        weights[0][0]
+    except TypeError:
+        weights = [weights]
+    max_weight_length = max(len(weight) for weight in weights)
+
+    # Iterate through each hypothesis and their corresponding references.
+    for references, hypothesis in zip(list_of_references, hypotheses):
+        # For each order of ngram, calculate the numerator and
+        # denominator for the corpus-level modified precision.
+        for i in range(1, max_weight_length + 1):
+            p_i = modified_precision(references, hypothesis, i)
+            p_numerators[i] += p_i.numerator
+            p_denominators[i] += p_i.denominator
+
+        # Calculate the hypothesis length and the closest reference length.
+        # Adds them to the corpus-level hypothesis and reference counts.
+        hyp_len = len(hypothesis)
+        hyp_lengths += hyp_len
+        ref_lengths += closest_ref_length(references, hyp_len)
+
+    # Calculate corpus-level brevity penalty.
+    bp = brevity_penalty(ref_lengths, hyp_lengths)
+
+    # Collects the various precision values for the different ngram orders.
+    p_n = [
+        Fraction(p_numerators[i], p_denominators[i], _normalize=False)
+        for i in range(1, max_weight_length + 1)
+    ]
+
+    # Returns 0 if there's no matching n-grams
+    # We only need to check for p_numerators[1] == 0, since if there's
+    # no unigrams, there won't be any higher order ngrams.
+    if p_numerators[1] == 0:
+        return 0 if len(weights) == 1 else [0] * len(weights)
+
+    # If there's no smoothing, use method0 from the SmoothingFunction class.
+    if not smoothing_function:
+        smoothing_function = SmoothingFunction().method0
+    # Smoothen the modified precision.
+    # Note: smoothing_function() may convert values into floats;
+    # it tries to retain the Fraction object as much as the
+    # smoothing method allows.
+    p_n = smoothing_function(
+        p_n, references=references, hypothesis=hypothesis, hyp_len=hyp_lengths
+    )
+
+    bleu_scores = []
+    for weight in weights:
+        # Uniformly re-weighting based on maximum hypothesis lengths if largest
+        # order of n-grams < 4 and weights is set at default.
+        if auto_reweigh:
+            if hyp_lengths < 4 and weight == (0.25, 0.25, 0.25, 0.25):
+                weight = (1 / hyp_lengths,) * hyp_lengths
+
+        s = (w_i * math.log(p_i) for w_i, p_i in zip(weight, p_n) if p_i > 0)
+        s = bp * math.exp(math.fsum(s))
+        bleu_scores.append(s)
+    return bleu_scores[0] if len(weights) == 1 else bleu_scores
+
+
+def modified_precision(references, hypothesis, n):
+    """
+    Calculate modified ngram precision.
+
+    The normal precision method may lead to some wrong translations with
+    high-precision, e.g., the translation, in which a word of reference
+    repeats several times, has very high precision.
+
+    This function only returns the Fraction object that contains the numerator
+    and denominator necessary to calculate the corpus-level precision.
+    To calculate the modified precision for a single pair of hypothesis and
+    references, cast the Fraction object into a float.
+
+    The famous "the the the ... " example shows that you can get BLEU precision
+    by duplicating high frequency words.
+
+    >>> reference1 = 'the cat is on the mat'.split()
+    >>> reference2 = 'there is a cat on the mat'.split()
+    >>> hypothesis1 = 'the the the the the the the'.split()
+    >>> references = [reference1, reference2]
+    >>> float(modified_precision(references, hypothesis1, n=1)) # doctest: +ELLIPSIS
+    0.2857...
+
+    In the modified n-gram precision, a reference word will be considered
+    exhausted after a matching hypothesis word is identified, e.g.
+
+    >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
+    ...               'ensures', 'that', 'the', 'military', 'will',
+    ...               'forever', 'heed', 'Party', 'commands']
+    >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
+    ...               'guarantees', 'the', 'military', 'forces', 'always',
+    ...               'being', 'under', 'the', 'command', 'of', 'the',
+    ...               'Party']
+    >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
+    ...               'army', 'always', 'to', 'heed', 'the', 'directions',
+    ...               'of', 'the', 'party']
+    >>> hypothesis = 'of the'.split()
+    >>> references = [reference1, reference2, reference3]
+    >>> float(modified_precision(references, hypothesis, n=1))
+    1.0
+    >>> float(modified_precision(references, hypothesis, n=2))
+    1.0
+
+    An example of a normal machine translation hypothesis:
+
+    >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
+    ...                'ensures', 'that', 'the', 'military', 'always',
+    ...                'obeys', 'the', 'commands', 'of', 'the', 'party']
+
+    >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops',
+    ...                'forever', 'hearing', 'the', 'activity', 'guidebook',
+    ...                'that', 'party', 'direct']
+
+    >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
+    ...               'ensures', 'that', 'the', 'military', 'will',
+    ...               'forever', 'heed', 'Party', 'commands']
+
+    >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
+    ...               'guarantees', 'the', 'military', 'forces', 'always',
+    ...               'being', 'under', 'the', 'command', 'of', 'the',
+    ...               'Party']
+
+    >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
+    ...               'army', 'always', 'to', 'heed', 'the', 'directions',
+    ...               'of', 'the', 'party']
+    >>> references = [reference1, reference2, reference3]
+    >>> float(modified_precision(references, hypothesis1, n=1)) # doctest: +ELLIPSIS
+    0.9444...
+    >>> float(modified_precision(references, hypothesis2, n=1)) # doctest: +ELLIPSIS
+    0.5714...
+    >>> float(modified_precision(references, hypothesis1, n=2)) # doctest: +ELLIPSIS
+    0.5882352941176471
+    >>> float(modified_precision(references, hypothesis2, n=2)) # doctest: +ELLIPSIS
+    0.07692...
+
+
+    :param references: A list of reference translations.
+    :type references: list(list(str))
+    :param hypothesis: A hypothesis translation.
+    :type hypothesis: list(str)
+    :param n: The ngram order.
+    :type n: int
+    :return: BLEU's modified precision for the nth order ngram.
+    :rtype: Fraction
+    """
+    # Extracts all ngrams in hypothesis
+    # Set an empty Counter if hypothesis is empty.
+    counts = Counter(ngrams(hypothesis, n)) if len(hypothesis) >= n else Counter()
+    # Extract a union of references' counts.
+    # max_counts = reduce(or_, [Counter(ngrams(ref, n)) for ref in references])
+    max_counts = {}
+    for reference in references:
+        reference_counts = (
+            Counter(ngrams(reference, n)) if len(reference) >= n else Counter()
+        )
+        for ngram in counts:
+            max_counts[ngram] = max(max_counts.get(ngram, 0), reference_counts[ngram])
+
+    # Assigns the intersection between hypothesis and references' counts.
+    clipped_counts = {
+        ngram: min(count, max_counts[ngram]) for ngram, count in counts.items()
+    }
+
+    numerator = sum(clipped_counts.values())
+    # Ensures that denominator is minimum 1 to avoid ZeroDivisionError.
+    # Usually this happens when the ngram order is > len(reference).
+    denominator = max(1, sum(counts.values()))
+
+    return Fraction(numerator, denominator, _normalize=False)
+
+
+def closest_ref_length(references, hyp_len):
+    """
+    This function finds the reference that is the closest length to the
+    hypothesis. The closest reference length is referred to as the *r* variable
+    from the brevity penalty formula in Papineni et al. (2002).
+
+    :param references: A list of reference translations.
+    :type references: list(list(str))
+    :param hyp_len: The length of the hypothesis.
+    :type hyp_len: int
+    :return: The length of the reference that's closest to the hypothesis.
+    :rtype: int
+    """
+    ref_lens = (len(reference) for reference in references)
+    closest_ref_len = min(
+        ref_lens, key=lambda ref_len: (abs(ref_len - hyp_len), ref_len)
+    )
+    return closest_ref_len
+
+
+def brevity_penalty(closest_ref_len, hyp_len):
+    """
+    Calculate brevity penalty.
+
+    As the modified n-gram precision still has the problem from the short
+    length sentence, brevity penalty is used to modify the overall BLEU
+    score according to length.
+
+    An example from the paper: there are three references with lengths 12, 15
+    and 17, and a concise hypothesis of length 12. The brevity penalty is 1.
+
+    >>> reference1 = list('aaaaaaaaaaaa')      # i.e. ['a'] * 12
+    >>> reference2 = list('aaaaaaaaaaaaaaa')   # i.e. ['a'] * 15
+    >>> reference3 = list('aaaaaaaaaaaaaaaaa') # i.e. ['a'] * 17
+    >>> hypothesis = list('aaaaaaaaaaaa')      # i.e. ['a'] * 12
+    >>> references = [reference1, reference2, reference3]
+    >>> hyp_len = len(hypothesis)
+    >>> closest_ref_len = closest_ref_length(references, hyp_len)
+    >>> brevity_penalty(closest_ref_len, hyp_len)
+    1.0
+
+    In case a hypothesis translation is shorter than the references, penalty is
+    applied.
+
+    >>> references = [['a'] * 28, ['a'] * 28]
+    >>> hypothesis = ['a'] * 12
+    >>> hyp_len = len(hypothesis)
+    >>> closest_ref_len = closest_ref_length(references, hyp_len)
+    >>> brevity_penalty(closest_ref_len, hyp_len)
+    0.2635971381157267
+
+    The length of the closest reference is used to compute the penalty. If the
+    length of a hypothesis is 12, and the reference lengths are 13 and 2, the
+    penalty is applied because the hypothesis length (12) is less than the
+    closest reference length (13).
+
+    >>> references = [['a'] * 13, ['a'] * 2]
+    >>> hypothesis = ['a'] * 12
+    >>> hyp_len = len(hypothesis)
+    >>> closest_ref_len = closest_ref_length(references, hyp_len)
+    >>> brevity_penalty(closest_ref_len, hyp_len) # doctest: +ELLIPSIS
+    0.9200...
+
+    The brevity penalty doesn't depend on reference order. More importantly,
+    when two reference sentences are at the same distance, the shortest
+    reference sentence length is used.
+
+    >>> references = [['a'] * 13, ['a'] * 11]
+    >>> hypothesis = ['a'] * 12
+    >>> hyp_len = len(hypothesis)
+    >>> closest_ref_len = closest_ref_length(references, hyp_len)
+    >>> bp1 = brevity_penalty(closest_ref_len, hyp_len)
+    >>> hyp_len = len(hypothesis)
+    >>> closest_ref_len = closest_ref_length(reversed(references), hyp_len)
+    >>> bp2 = brevity_penalty(closest_ref_len, hyp_len)
+    >>> bp1 == bp2 == 1
+    True
+
+    A test example from mteval-v13a.pl (starting from the line 705):
+
+    >>> references = [['a'] * 11, ['a'] * 8]
+    >>> hypothesis = ['a'] * 7
+    >>> hyp_len = len(hypothesis)
+    >>> closest_ref_len = closest_ref_length(references, hyp_len)
+    >>> brevity_penalty(closest_ref_len, hyp_len) # doctest: +ELLIPSIS
+    0.8668...
+
+    >>> references = [['a'] * 11, ['a'] * 8, ['a'] * 6, ['a'] * 7]
+    >>> hypothesis = ['a'] * 7
+    >>> hyp_len = len(hypothesis)
+    >>> closest_ref_len = closest_ref_length(references, hyp_len)
+    >>> brevity_penalty(closest_ref_len, hyp_len)
+    1.0
+
+    :param hyp_len: The length of the hypothesis for a single sentence OR the
+        sum of all the hypotheses' lengths for a corpus
+    :type hyp_len: int
+    :param closest_ref_len: The length of the closest reference for a single
+        hypothesis OR the sum of all the closest references for all hypotheses.
+    :type closest_ref_len: int
+    :return: BLEU's brevity penalty.
+    :rtype: float
+    """
+    if hyp_len > closest_ref_len:
+        return 1
+    # If hypothesis is empty, brevity penalty = 0 should result in BLEU = 0.0
+    elif hyp_len == 0:
+        return 0
+    else:
+        return math.exp(1 - closest_ref_len / hyp_len)
+
+
+class SmoothingFunction:
+    """
+    This is an implementation of the smoothing techniques
+    for segment-level BLEU scores that was presented in
+    Boxing Chen and Colin Cherry (2014) A Systematic Comparison of
+    Smoothing Techniques for Sentence-Level BLEU. In WMT14.
+    http://acl2014.org/acl2014/W14-33/pdf/W14-3346.pdf
+    """
+
+    def __init__(self, epsilon=0.1, alpha=5, k=5):
+        """
+        This will initialize the parameters required for the various smoothing
+        techniques, the default values are set to the numbers used in the
+        experiments from Chen and Cherry (2014).
+
+        >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', 'ensures',
+        ...                'that', 'the', 'military', 'always', 'obeys', 'the',
+        ...                'commands', 'of', 'the', 'party']
+        >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', 'ensures',
+        ...               'that', 'the', 'military', 'will', 'forever', 'heed',
+        ...               'Party', 'commands']
+
+        >>> chencherry = SmoothingFunction()
+        >>> print(sentence_bleu([reference1], hypothesis1)) # doctest: +ELLIPSIS
+        0.4118...
+        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method0)) # doctest: +ELLIPSIS
+        0.4118...
+        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method1)) # doctest: +ELLIPSIS
+        0.4118...
+        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method2)) # doctest: +ELLIPSIS
+        0.4452...
+        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method3)) # doctest: +ELLIPSIS
+        0.4118...
+        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method4)) # doctest: +ELLIPSIS
+        0.4118...
+        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method5)) # doctest: +ELLIPSIS
+        0.4905...
+        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method6)) # doctest: +ELLIPSIS
+        0.4135...
+        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method7)) # doctest: +ELLIPSIS
+        0.4905...
+
+        :param epsilon: the epsilon value used in method 1
+        :type epsilon: float
+        :param alpha: the alpha value used in method 6
+        :type alpha: int
+        :param k: the k value used in method 4
+        :type k: int
+        """
+        self.epsilon = epsilon
+        self.alpha = alpha
+        self.k = k
+
+    def method0(self, p_n, *args, **kwargs):
+        """
+        No smoothing.
+        """
+        p_n_new = []
+        for i, p_i in enumerate(p_n):
+            if p_i.numerator != 0:
+                p_n_new.append(p_i)
+            else:
+                _msg = str(
+                    "\nThe hypothesis contains 0 counts of {}-gram overlaps.\n"
+                    "Therefore the BLEU score evaluates to 0, independently of\n"
+                    "how many N-gram overlaps of lower order it contains.\n"
+                    "Consider using lower n-gram order or use "
+                    "SmoothingFunction()"
+                ).format(i + 1)
+                warnings.warn(_msg)
+                # When numerator == 0, whether denominator == 0 or not, the
+                # precision score should be 0 or undefined. Due to the BLEU
+                # geometric mean computation in logarithm space, we return
+                # sys.float_info.min instead, so that math.log() stays defined
+                # and the resulting score is driven towards 0.
+                p_n_new.append(sys.float_info.min)
+        return p_n_new
+
+    def method1(self, p_n, *args, **kwargs):
+        """
+        Smoothing method 1: Add *epsilon* counts to precision with 0 counts.
+        """
+        return [
+            (p_i.numerator + self.epsilon) / p_i.denominator
+            if p_i.numerator == 0
+            else p_i
+            for p_i in p_n
+        ]
+
+    def method2(self, p_n, *args, **kwargs):
+        """
+        Smoothing method 2: Add 1 to both numerator and denominator from
+        Chin-Yew Lin and Franz Josef Och (2004) ORANGE: a Method for
+        Evaluating Automatic Evaluation Metrics for Machine Translation.
+        In COLING 2004.
+        """
+        return [
+            Fraction(p_n[i].numerator + 1, p_n[i].denominator + 1, _normalize=False)
+            if i != 0
+            else p_n[0]
+            for i in range(len(p_n))
+        ]
+
+    def method3(self, p_n, *args, **kwargs):
+        """
+        Smoothing method 3: NIST geometric sequence smoothing
+        The smoothing is computed by taking 1 / ( 2^k ), instead of 0, for each
+        precision score whose matching n-gram count is null.
+        k is 1 for the first 'n' value for which the n-gram match count is null.
+
+        For example, if the text contains:
+
+        - one 2-gram match
+        - and (consequently) two 1-gram matches
+
+        the n-gram count for each individual precision score would be:
+
+        - n=1  =>  prec_count = 2     (two unigrams)
+        - n=2  =>  prec_count = 1     (one bigram)
+        - n=3  =>  prec_count = 1/2   (no trigram, taking 'smoothed' value of 1 / ( 2^k ), with k=1)
+        - n=4  =>  prec_count = 1/4   (no fourgram, taking 'smoothed' value of 1 / ( 2^k ), with k=2)
+        """
+        incvnt = 1  # From the mteval-v13a.pl, it's referred to as k.
+        for i, p_i in enumerate(p_n):
+            if p_i.numerator == 0:
+                p_n[i] = 1 / (2**incvnt * p_i.denominator)
+                incvnt += 1
+        return p_n
+
+    def method4(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
+        """
+        Smoothing method 4:
+        Shorter translations may have inflated precision values due to having
+        smaller denominators; therefore, we give them proportionally
+        smaller smoothed counts. Instead of scaling to 1/(2^k), Chen and Cherry
+        suggest dividing by 1/ln(len(T)), where T is the length of the translation.
+        """
+        incvnt = 1
+        hyp_len = hyp_len if hyp_len else len(hypothesis)
+        for i, p_i in enumerate(p_n):
+            if p_i.numerator == 0 and hyp_len > 1:
+                # incvnt = i + 1 * self.k / math.log(
+                #     hyp_len
+                # )  # Note that this K is different from the K from NIST.
+                # p_n[i] = incvnt / p_i.denominator
+                numerator = 1 / (2**incvnt * self.k / math.log(hyp_len))
+                p_n[i] = numerator / p_i.denominator
+                incvnt += 1
+        return p_n
+
+    def method5(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
+        """
+        Smoothing method 5:
+        The matched counts for similar values of n should be similar. To
+        calculate the n-gram matched count, it averages the n−1, n and n+1 gram
+        matched counts.
+        """
+        hyp_len = hyp_len if hyp_len else len(hypothesis)
+        m = {}
+        # Requires a precision value for an additional ngram order.
+        p_n_plus1 = p_n + [modified_precision(references, hypothesis, 5)]
+        m[-1] = p_n[0] + 1
+        for i, p_i in enumerate(p_n):
+            p_n[i] = (m[i - 1] + p_i + p_n_plus1[i + 1]) / 3
+            m[i] = p_n[i]
+        return p_n
+
+    def method6(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
+        """
+        Smoothing method 6:
+        Interpolates the maximum likelihood estimate of the precision *p_n* with
+        a prior estimate *pi0*. The prior is estimated by assuming that the ratio
+        between pn and pn−1 will be the same as that between pn−1 and pn−2; from
+        Gao and He (2013) Training MRF-Based Phrase Translation Models using
+        Gradient Ascent. In NAACL.
+        """
+        hyp_len = hyp_len if hyp_len else len(hypothesis)
+        # This smoothing only works when p_1 and p_2 are non-zero.
+        # Raise an error with an appropriate message when the input is too short
+        # to use this smoothing technique.
+        assert p_n[2], "This smoothing method requires non-zero precision for bigrams."
+        for i, p_i in enumerate(p_n):
+            if i in [0, 1]:  # Skips the first 2 orders of ngrams.
+                continue
+            else:
+                pi0 = 0 if p_n[i - 2] == 0 else p_n[i - 1] ** 2 / p_n[i - 2]
+                # No. of ngrams in translation that matches the reference.
+                m = p_i.numerator
+                # No. of ngrams in translation.
+                l = sum(1 for _ in ngrams(hypothesis, i + 1))
+                # Calculates the interpolated precision.
+                p_n[i] = (m + self.alpha * pi0) / (l + self.alpha)
+        return p_n
+
+    def method7(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
+        """
+        Smoothing method 7:
+        Interpolates methods 4 and 5.
+ """ + hyp_len = hyp_len if hyp_len else len(hypothesis) + p_n = self.method4(p_n, references, hypothesis, hyp_len) + p_n = self.method5(p_n, references, hypothesis, hyp_len) + return p_n Index: nltk-3.8.1/README.md =================================================================== --- nltk-3.8.1.orig/README.md +++ nltk-3.8.1/README.md @@ -1,50 +1,50 @@ -# Natural Language Toolkit (NLTK) -[![PyPI](https://img.shields.io/pypi/v/nltk.svg)](https://pypi.python.org/pypi/nltk) -![CI](https://github.com/nltk/nltk/actions/workflows/ci.yaml/badge.svg?branch=develop) - -NLTK -- the Natural Language Toolkit -- is a suite of open source Python -modules, data sets, and tutorials supporting research and development in Natural -Language Processing. NLTK requires Python version 3.7, 3.8, 3.9, 3.10 or 3.11. - -For documentation, please visit [nltk.org](https://www.nltk.org/). - - -## Contributing - -Do you want to contribute to NLTK development? Great! -Please read [CONTRIBUTING.md](CONTRIBUTING.md) for more details. - -See also [how to contribute to NLTK](https://www.nltk.org/contribute.html). - - -## Donate - -Have you found the toolkit helpful? Please support NLTK development by donating -to the project via PayPal, using the link on the NLTK homepage. - - -## Citing - -If you publish work that uses NLTK, please cite the NLTK book, as follows: - - Bird, Steven, Edward Loper and Ewan Klein (2009). - Natural Language Processing with Python. O'Reilly Media Inc. - - -## Copyright - -Copyright (C) 2001-2023 NLTK Project - -For license information, see [LICENSE.txt](LICENSE.txt). - -[AUTHORS.md](AUTHORS.md) contains a list of everyone who has contributed to NLTK. - - -### Redistributing - -- NLTK source code is distributed under the Apache 2.0 License. -- NLTK documentation is distributed under the Creative Commons - Attribution-Noncommercial-No Derivative Works 3.0 United States license. -- NLTK corpora are provided under the terms given in the README file for each - corpus; all are redistributable and available for non-commercial use. -- NLTK may be freely redistributed, subject to the provisions of these licenses. +# Natural Language Toolkit (NLTK) +[![PyPI](https://img.shields.io/pypi/v/nltk.svg)](https://pypi.python.org/pypi/nltk) +![CI](https://github.com/nltk/nltk/actions/workflows/ci.yaml/badge.svg?branch=develop) + +NLTK -- the Natural Language Toolkit -- is a suite of open source Python +modules, data sets, and tutorials supporting research and development in Natural +Language Processing. NLTK requires Python version 3.7, 3.8, 3.9, 3.10, 3.11 or 3.12. + +For documentation, please visit [nltk.org](https://www.nltk.org/). + + +## Contributing + +Do you want to contribute to NLTK development? Great! +Please read [CONTRIBUTING.md](CONTRIBUTING.md) for more details. + +See also [how to contribute to NLTK](https://www.nltk.org/contribute.html). + + +## Donate + +Have you found the toolkit helpful? Please support NLTK development by donating +to the project via PayPal, using the link on the NLTK homepage. + + +## Citing + +If you publish work that uses NLTK, please cite the NLTK book, as follows: + + Bird, Steven, Edward Loper and Ewan Klein (2009). + Natural Language Processing with Python. O'Reilly Media Inc. + + +## Copyright + +Copyright (C) 2001-2023 NLTK Project + +For license information, see [LICENSE.txt](LICENSE.txt). + +[AUTHORS.md](AUTHORS.md) contains a list of everyone who has contributed to NLTK. + + +### Redistributing + +- NLTK source code is distributed under the Apache 2.0 License. 
+- NLTK documentation is distributed under the Creative Commons + Attribution-Noncommercial-No Derivative Works 3.0 United States license. +- NLTK corpora are provided under the terms given in the README file for each + corpus; all are redistributable and available for non-commercial use. +- NLTK may be freely redistributed, subject to the provisions of these licenses. Index: nltk-3.8.1/setup.py =================================================================== --- nltk-3.8.1.orig/setup.py +++ nltk-3.8.1/setup.py @@ -1,125 +1,126 @@ -#!/usr/bin/env python -# -# Setup script for the Natural Language Toolkit -# -# Copyright (C) 2001-2023 NLTK Project -# Author: NLTK Team -# URL: -# For license information, see LICENSE.TXT - -# Work around mbcs bug in distutils. -# https://bugs.python.org/issue10945 -import codecs - -try: - codecs.lookup("mbcs") -except LookupError: - ascii = codecs.lookup("ascii") - func = lambda name, enc=ascii: {True: enc}.get(name == "mbcs") - codecs.register(func) - -import os - -# Use the VERSION file to get NLTK version -version_file = os.path.join(os.path.dirname(__file__), "nltk", "VERSION") -with open(version_file) as fh: - nltk_version = fh.read().strip() - -# setuptools -from setuptools import find_packages, setup - -# Specify groups of optional dependencies -extras_require = { - "machine_learning": [ - "numpy", - "python-crfsuite", - "scikit-learn", - "scipy", - ], - "plot": ["matplotlib"], - "tgrep": ["pyparsing"], - "twitter": ["twython"], - "corenlp": ["requests"], -} - -# Add a group made up of all optional dependencies -extras_require["all"] = { - package for group in extras_require.values() for package in group -} - -# Adds CLI commands -console_scripts = """ -[console_scripts] -nltk=nltk.cli:cli -""" - -_project_homepage = "https://www.nltk.org/" - -setup( - name="nltk", - description="Natural Language Toolkit", - version=nltk_version, - url=_project_homepage, - project_urls={ - "Documentation": _project_homepage, - "Source Code": "https://github.com/nltk/nltk", - "Issue Tracker": "https://github.com/nltk/nltk/issues", - }, - long_description="""\ -The Natural Language Toolkit (NLTK) is a Python package for -natural language processing. 
NLTK requires Python 3.7, 3.8, 3.9, 3.10 or 3.11.""", - license="Apache License, Version 2.0", - keywords=[ - "NLP", - "CL", - "natural language processing", - "computational linguistics", - "parsing", - "tagging", - "tokenizing", - "syntax", - "linguistics", - "language", - "natural language", - "text analytics", - ], - maintainer="NLTK Team", - maintainer_email="nltk.team@gmail.com", - author="NLTK Team", - author_email="nltk.team@gmail.com", - classifiers=[ - "Development Status :: 5 - Production/Stable", - "Intended Audience :: Developers", - "Intended Audience :: Education", - "Intended Audience :: Information Technology", - "Intended Audience :: Science/Research", - "License :: OSI Approved :: Apache Software License", - "Operating System :: OS Independent", - "Programming Language :: Python :: 3.7", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Topic :: Scientific/Engineering", - "Topic :: Scientific/Engineering :: Artificial Intelligence", - "Topic :: Scientific/Engineering :: Human Machine Interfaces", - "Topic :: Scientific/Engineering :: Information Analysis", - "Topic :: Text Processing", - "Topic :: Text Processing :: Filters", - "Topic :: Text Processing :: General", - "Topic :: Text Processing :: Indexing", - "Topic :: Text Processing :: Linguistic", - ], - package_data={"nltk": ["test/*.doctest", "VERSION"]}, - python_requires=">=3.7", - install_requires=[ - "click", - "joblib", - "regex>=2021.8.3", - "tqdm", - ], - extras_require=extras_require, - packages=find_packages(), - zip_safe=False, # since normal files will be present too? - entry_points=console_scripts, -) +#!/usr/bin/env python +# +# Setup script for the Natural Language Toolkit +# +# Copyright (C) 2001-2023 NLTK Project +# Author: NLTK Team +# URL: +# For license information, see LICENSE.TXT + +# Work around mbcs bug in distutils. +# https://bugs.python.org/issue10945 +import codecs + +try: + codecs.lookup("mbcs") +except LookupError: + ascii = codecs.lookup("ascii") + func = lambda name, enc=ascii: {True: enc}.get(name == "mbcs") + codecs.register(func) + +import os + +# Use the VERSION file to get NLTK version +version_file = os.path.join(os.path.dirname(__file__), "nltk", "VERSION") +with open(version_file) as fh: + nltk_version = fh.read().strip() + +# setuptools +from setuptools import find_packages, setup + +# Specify groups of optional dependencies +extras_require = { + "machine_learning": [ + "numpy", + "python-crfsuite", + "scikit-learn", + "scipy", + ], + "plot": ["matplotlib"], + "tgrep": ["pyparsing"], + "twitter": ["twython"], + "corenlp": ["requests"], +} + +# Add a group made up of all optional dependencies +extras_require["all"] = { + package for group in extras_require.values() for package in group +} + +# Adds CLI commands +console_scripts = """ +[console_scripts] +nltk=nltk.cli:cli +""" + +_project_homepage = "https://www.nltk.org/" + +setup( + name="nltk", + description="Natural Language Toolkit", + version=nltk_version, + url=_project_homepage, + project_urls={ + "Documentation": _project_homepage, + "Source Code": "https://github.com/nltk/nltk", + "Issue Tracker": "https://github.com/nltk/nltk/issues", + }, + long_description="""\ +The Natural Language Toolkit (NLTK) is a Python package for +natural language processing. 
NLTK requires Python 3.7, 3.8, 3.9, 3.10, 3.11 or 3.12.""", + license="Apache License, Version 2.0", + keywords=[ + "NLP", + "CL", + "natural language processing", + "computational linguistics", + "parsing", + "tagging", + "tokenizing", + "syntax", + "linguistics", + "language", + "natural language", + "text analytics", + ], + maintainer="NLTK Team", + maintainer_email="nltk.team@gmail.com", + author="NLTK Team", + author_email="nltk.team@gmail.com", + classifiers=[ + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "Intended Audience :: Education", + "Intended Audience :: Information Technology", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: Apache Software License", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Topic :: Scientific/Engineering", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Topic :: Scientific/Engineering :: Human Machine Interfaces", + "Topic :: Scientific/Engineering :: Information Analysis", + "Topic :: Text Processing", + "Topic :: Text Processing :: Filters", + "Topic :: Text Processing :: General", + "Topic :: Text Processing :: Indexing", + "Topic :: Text Processing :: Linguistic", + ], + package_data={"nltk": ["test/*.doctest", "VERSION"]}, + python_requires=">=3.7", + install_requires=[ + "click", + "joblib", + "regex>=2021.8.3", + "tqdm", + ], + extras_require=extras_require, + packages=find_packages(), + zip_safe=False, # since normal files will be present too? + entry_points=console_scripts, +)
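
To sanity-check the patched module under Python 3.12, a minimal smoke test
such as the one below can be used (a sketch, assuming the patched nltk-3.8.1
tree is on the import path; the token lists are illustrative):

    from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

    reference = ["the", "cat", "is", "on", "the", "mat"]
    hypothesis = ["the", "cat", "sat", "on", "the", "mat"]

    # method1 adds a small epsilon to zero n-gram match counts, so the
    # score stays non-zero even though no 4-gram of the hypothesis
    # matches the reference.
    score = sentence_bleu(
        [reference], hypothesis,
        smoothing_function=SmoothingFunction().method1,
    )
    print(score)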