forked from pool/python-nltk
Daniel Garcia
f64d1a206e
- Update to 3.8.1
  * Resolve RCE & XSS vulnerabilities in localhost WordNet Browser
  * Add Python 3.11 support
- Update nltk_data archive
- Drop port-2to3.patch
- Add nltk-pr3207-py312.patch for Python 3.12 support
  * gh#nltk/nltk#3207

OBS-URL: https://build.opensuse.org/request/show/1160467
OBS-URL: https://build.opensuse.org/package/show/devel:languages:python/python-nltk?expand=0&rev=45
1783 lines
73 KiB
Diff
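The substance of nltk-pr3207-py312.patch (shown in the diff below) is a small compatibility wrapper: Python 3.12 removed the private `_normalize` keyword from `fractions.Fraction`, which `nltk/translate/bleu_score.py` relied on to keep precision numerators and denominators unreduced. A minimal standalone sketch of that wrapper, mirroring the added hunk (the usage note at the end is illustrative and not part of the patch):

import sys
from fractions import Fraction as _Fraction


class Fraction(_Fraction):
    """Fraction that keeps the original (unreduced) numerator/denominator."""

    def __new__(cls, numerator=0, denominator=None, _normalize=False):
        if sys.version_info >= (3, 12):
            # Python 3.12 no longer accepts the private _normalize keyword.
            self = super().__new__(cls, numerator, denominator)
        else:
            self = super().__new__(cls, numerator, denominator, _normalize=_normalize)
        self._normalize = _normalize
        self._original_numerator = numerator
        self._original_denominator = denominator
        return self

    @property
    def numerator(self):
        return super().numerator if self._normalize else self._original_numerator

    @property
    def denominator(self):
        return super().denominator if self._normalize else self._original_denominator

With this in place, Fraction(2, 4, _normalize=False).numerator stays 2 on every supported interpreter, so the per-order n-gram counts that corpus_bleu() accumulates are not silently reduced.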
From 25d35fc4283dedd2053ec6d821f4b707fff8d72c Mon Sep 17 00:00:00 2001
From: Konstantin Chernyshev <k4black@ya.ru>
Date: Thu, 16 Nov 2023 19:00:15 +0100
Subject: [PATCH 1/8] ci: enable 3.12 in ci tests

---
 .github/workflows/ci.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

Index: nltk-3.8.1/nltk/test/unit/translate/test_bleu.py
===================================================================
--- nltk-3.8.1.orig/nltk/test/unit/translate/test_bleu.py
+++ nltk-3.8.1/nltk/test/unit/translate/test_bleu.py
@@ -2,7 +2,6 @@
 Tests for BLEU translation evaluation metric
 """
 
-import io
 import unittest
 
 from nltk.data import find
Index: nltk-3.8.1/nltk/translate/bleu_score.py
===================================================================
--- nltk-3.8.1.orig/nltk/translate/bleu_score.py
+++ nltk-3.8.1/nltk/translate/bleu_score.py
@@ -1,685 +1,710 @@
-# Natural Language Toolkit: BLEU Score
-#
-# Copyright (C) 2001-2023 NLTK Project
-# Authors: Chin Yee Lee, Hengfeng Li, Ruxin Hou, Calvin Tanujaya Lim
-# Contributors: Björn Mattsson, Dmitrijs Milajevs, Liling Tan
-# URL: <https://www.nltk.org/>
-# For license information, see LICENSE.TXT
-
-"""BLEU score implementation."""
-
-import math
-import sys
-import warnings
-from collections import Counter
-from fractions import Fraction
-
-from nltk.util import ngrams
-
-
-def sentence_bleu(
-    references,
-    hypothesis,
-    weights=(0.25, 0.25, 0.25, 0.25),
-    smoothing_function=None,
-    auto_reweigh=False,
-):
- """
|
||
- Calculate BLEU score (Bilingual Evaluation Understudy) from
|
||
- Papineni, Kishore, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002.
|
||
- "BLEU: a method for automatic evaluation of machine translation."
|
||
- In Proceedings of ACL. https://www.aclweb.org/anthology/P02-1040.pdf
|
||
-
|
||
- >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
|
||
- ... 'ensures', 'that', 'the', 'military', 'always',
|
||
- ... 'obeys', 'the', 'commands', 'of', 'the', 'party']
|
||
-
|
||
- >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops',
|
||
- ... 'forever', 'hearing', 'the', 'activity', 'guidebook',
|
||
- ... 'that', 'party', 'direct']
|
||
-
|
||
- >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
|
||
- ... 'ensures', 'that', 'the', 'military', 'will', 'forever',
|
||
- ... 'heed', 'Party', 'commands']
|
||
-
|
||
- >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
|
||
- ... 'guarantees', 'the', 'military', 'forces', 'always',
|
||
- ... 'being', 'under', 'the', 'command', 'of', 'the',
|
||
- ... 'Party']
|
||
-
|
||
- >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
|
||
- ... 'army', 'always', 'to', 'heed', 'the', 'directions',
|
||
- ... 'of', 'the', 'party']
|
||
-
|
||
- >>> sentence_bleu([reference1, reference2, reference3], hypothesis1) # doctest: +ELLIPSIS
|
||
- 0.5045...
|
||
-
|
||
- If there is no ngrams overlap for any order of n-grams, BLEU returns the
|
||
- value 0. This is because the precision for the order of n-grams without
|
||
- overlap is 0, and the geometric mean in the final BLEU score computation
|
||
- multiplies the 0 with the precision of other n-grams. This results in 0
|
||
- (independently of the precision of the other n-gram orders). The following
|
||
- example has zero 3-gram and 4-gram overlaps:
|
||
-
|
||
- >>> round(sentence_bleu([reference1, reference2, reference3], hypothesis2),4) # doctest: +ELLIPSIS
|
||
- 0.0
|
||
-
|
||
- To avoid this harsh behaviour when no ngram overlaps are found a smoothing
|
||
- function can be used.
|
||
-
|
||
- >>> chencherry = SmoothingFunction()
|
||
- >>> sentence_bleu([reference1, reference2, reference3], hypothesis2,
|
||
- ... smoothing_function=chencherry.method1) # doctest: +ELLIPSIS
|
||
- 0.0370...
|
||
-
|
||
- The default BLEU calculates a score for up to 4-grams using uniform
|
||
- weights (this is called BLEU-4). To evaluate your translations with
|
||
- higher/lower order ngrams, use customized weights. E.g. when accounting
|
||
- for up to 5-grams with uniform weights (this is called BLEU-5) use:
|
||
-
|
||
- >>> weights = (1./5., 1./5., 1./5., 1./5., 1./5.)
|
||
- >>> sentence_bleu([reference1, reference2, reference3], hypothesis1, weights) # doctest: +ELLIPSIS
|
||
- 0.3920...
|
||
-
|
||
- Multiple BLEU scores can be computed at once, by supplying a list of weights.
|
||
- E.g. for computing BLEU-2, BLEU-3 *and* BLEU-4 in one computation, use:
|
||
- >>> weights = [
|
||
- ... (1./2., 1./2.),
|
||
- ... (1./3., 1./3., 1./3.),
|
||
- ... (1./4., 1./4., 1./4., 1./4.)
|
||
- ... ]
|
||
- >>> sentence_bleu([reference1, reference2, reference3], hypothesis1, weights) # doctest: +ELLIPSIS
|
||
- [0.7453..., 0.6240..., 0.5045...]
|
||
-
|
||
- :param references: reference sentences
|
||
- :type references: list(list(str))
|
||
- :param hypothesis: a hypothesis sentence
|
||
- :type hypothesis: list(str)
|
||
- :param weights: weights for unigrams, bigrams, trigrams and so on (one or a list of weights)
|
||
- :type weights: tuple(float) / list(tuple(float))
|
||
- :param smoothing_function:
|
||
- :type smoothing_function: SmoothingFunction
|
||
- :param auto_reweigh: Option to re-normalize the weights uniformly.
|
||
- :type auto_reweigh: bool
|
||
- :return: The sentence-level BLEU score. Returns a list if multiple weights were supplied.
|
||
- :rtype: float / list(float)
|
||
- """
|
||
- return corpus_bleu(
|
||
- [references], [hypothesis], weights, smoothing_function, auto_reweigh
|
||
- )
|
||
-
|
||
-
|
||
-def corpus_bleu(
|
||
- list_of_references,
|
||
- hypotheses,
|
||
- weights=(0.25, 0.25, 0.25, 0.25),
|
||
- smoothing_function=None,
|
||
- auto_reweigh=False,
|
||
-):
|
||
- """
|
||
- Calculate a single corpus-level BLEU score (aka. system-level BLEU) for all
|
||
- the hypotheses and their respective references.
|
||
-
|
||
- Instead of averaging the sentence level BLEU scores (i.e. macro-average
|
||
- precision), the original BLEU metric (Papineni et al. 2002) accounts for
|
||
- the micro-average precision (i.e. summing the numerators and denominators
|
||
- for each hypothesis-reference(s) pairs before the division).
|
||
-
|
||
- >>> hyp1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
|
||
- ... 'ensures', 'that', 'the', 'military', 'always',
|
||
- ... 'obeys', 'the', 'commands', 'of', 'the', 'party']
|
||
- >>> ref1a = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
|
||
- ... 'ensures', 'that', 'the', 'military', 'will', 'forever',
|
||
- ... 'heed', 'Party', 'commands']
|
||
- >>> ref1b = ['It', 'is', 'the', 'guiding', 'principle', 'which',
|
||
- ... 'guarantees', 'the', 'military', 'forces', 'always',
|
||
- ... 'being', 'under', 'the', 'command', 'of', 'the', 'Party']
|
||
- >>> ref1c = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
|
||
- ... 'army', 'always', 'to', 'heed', 'the', 'directions',
|
||
- ... 'of', 'the', 'party']
|
||
-
|
||
- >>> hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was',
|
||
- ... 'interested', 'in', 'world', 'history']
|
||
- >>> ref2a = ['he', 'was', 'interested', 'in', 'world', 'history',
|
||
- ... 'because', 'he', 'read', 'the', 'book']
|
||
-
|
||
- >>> list_of_references = [[ref1a, ref1b, ref1c], [ref2a]]
|
||
- >>> hypotheses = [hyp1, hyp2]
|
||
- >>> corpus_bleu(list_of_references, hypotheses) # doctest: +ELLIPSIS
|
||
- 0.5920...
|
||
-
|
||
- The example below show that corpus_bleu() is different from averaging
|
||
- sentence_bleu() for hypotheses
|
||
-
|
||
- >>> score1 = sentence_bleu([ref1a, ref1b, ref1c], hyp1)
|
||
- >>> score2 = sentence_bleu([ref2a], hyp2)
|
||
- >>> (score1 + score2) / 2 # doctest: +ELLIPSIS
|
||
- 0.6223...
|
||
-
|
||
- Custom weights may be supplied to fine-tune the BLEU score further.
|
||
- A tuple of float weights for unigrams, bigrams, trigrams and so on can be given.
|
||
- >>> weights = (0.1, 0.3, 0.5, 0.1)
|
||
- >>> corpus_bleu(list_of_references, hypotheses, weights=weights) # doctest: +ELLIPSIS
|
||
- 0.5818...
|
||
-
|
||
- This particular weight gave extra value to trigrams.
|
||
- Furthermore, multiple weights can be given, resulting in multiple BLEU scores.
|
||
- >>> weights = [
|
||
- ... (0.5, 0.5),
|
||
- ... (0.333, 0.333, 0.334),
|
||
- ... (0.25, 0.25, 0.25, 0.25),
|
||
- ... (0.2, 0.2, 0.2, 0.2, 0.2)
|
||
- ... ]
|
||
- >>> corpus_bleu(list_of_references, hypotheses, weights=weights) # doctest: +ELLIPSIS
|
||
- [0.8242..., 0.7067..., 0.5920..., 0.4719...]
|
||
-
|
||
- :param list_of_references: a corpus of lists of reference sentences, w.r.t. hypotheses
|
||
- :type list_of_references: list(list(list(str)))
|
||
- :param hypotheses: a list of hypothesis sentences
|
||
- :type hypotheses: list(list(str))
|
||
- :param weights: weights for unigrams, bigrams, trigrams and so on (one or a list of weights)
|
||
- :type weights: tuple(float) / list(tuple(float))
|
||
- :param smoothing_function:
|
||
- :type smoothing_function: SmoothingFunction
|
||
- :param auto_reweigh: Option to re-normalize the weights uniformly.
|
||
- :type auto_reweigh: bool
|
||
- :return: The corpus-level BLEU score.
|
||
- :rtype: float
|
||
- """
|
||
- # Before proceeding to compute BLEU, perform sanity checks.
|
||
-
|
||
- p_numerators = Counter() # Key = ngram order, and value = no. of ngram matches.
|
||
- p_denominators = Counter() # Key = ngram order, and value = no. of ngram in ref.
|
||
- hyp_lengths, ref_lengths = 0, 0
|
||
-
|
||
- assert len(list_of_references) == len(hypotheses), (
|
||
- "The number of hypotheses and their reference(s) should be the " "same "
|
||
- )
|
||
-
|
||
- try:
|
||
- weights[0][0]
|
||
- except TypeError:
|
||
- weights = [weights]
|
||
- max_weight_length = max(len(weight) for weight in weights)
|
||
-
|
||
- # Iterate through each hypothesis and their corresponding references.
|
||
- for references, hypothesis in zip(list_of_references, hypotheses):
|
||
- # For each order of ngram, calculate the numerator and
|
||
- # denominator for the corpus-level modified precision.
|
||
- for i in range(1, max_weight_length + 1):
|
||
- p_i = modified_precision(references, hypothesis, i)
|
||
- p_numerators[i] += p_i.numerator
|
||
- p_denominators[i] += p_i.denominator
|
||
-
|
||
- # Calculate the hypothesis length and the closest reference length.
|
||
- # Adds them to the corpus-level hypothesis and reference counts.
|
||
- hyp_len = len(hypothesis)
|
||
- hyp_lengths += hyp_len
|
||
- ref_lengths += closest_ref_length(references, hyp_len)
|
||
-
|
||
- # Calculate corpus-level brevity penalty.
|
||
- bp = brevity_penalty(ref_lengths, hyp_lengths)
|
||
-
|
||
- # Collects the various precision values for the different ngram orders.
|
||
- p_n = [
|
||
- Fraction(p_numerators[i], p_denominators[i], _normalize=False)
|
||
- for i in range(1, max_weight_length + 1)
|
||
- ]
|
||
-
|
||
- # Returns 0 if there's no matching n-grams
|
||
- # We only need to check for p_numerators[1] == 0, since if there's
|
||
- # no unigrams, there won't be any higher order ngrams.
|
||
- if p_numerators[1] == 0:
|
||
- return 0 if len(weights) == 1 else [0] * len(weights)
|
||
-
|
||
- # If there's no smoothing, set use method0 from SmoothinFunction class.
|
||
- if not smoothing_function:
|
||
- smoothing_function = SmoothingFunction().method0
|
||
- # Smoothen the modified precision.
|
||
- # Note: smoothing_function() may convert values into floats;
|
||
- # it tries to retain the Fraction object as much as the
|
||
- # smoothing method allows.
|
||
- p_n = smoothing_function(
|
||
- p_n, references=references, hypothesis=hypothesis, hyp_len=hyp_lengths
|
||
- )
|
||
-
|
||
- bleu_scores = []
|
||
- for weight in weights:
|
||
- # Uniformly re-weighting based on maximum hypothesis lengths if largest
|
||
- # order of n-grams < 4 and weights is set at default.
|
||
- if auto_reweigh:
|
||
- if hyp_lengths < 4 and weight == (0.25, 0.25, 0.25, 0.25):
|
||
- weight = (1 / hyp_lengths,) * hyp_lengths
|
||
-
|
||
- s = (w_i * math.log(p_i) for w_i, p_i in zip(weight, p_n) if p_i > 0)
|
||
- s = bp * math.exp(math.fsum(s))
|
||
- bleu_scores.append(s)
|
||
- return bleu_scores[0] if len(weights) == 1 else bleu_scores
|
||
-
|
||
-
|
||
-def modified_precision(references, hypothesis, n):
|
||
- """
|
||
- Calculate modified ngram precision.
|
||
-
|
||
- The normal precision method may lead to some wrong translations with
|
||
- high-precision, e.g., the translation, in which a word of reference
|
||
- repeats several times, has very high precision.
|
||
-
|
||
- This function only returns the Fraction object that contains the numerator
|
||
- and denominator necessary to calculate the corpus-level precision.
|
||
- To calculate the modified precision for a single pair of hypothesis and
|
||
- references, cast the Fraction object into a float.
|
||
-
|
||
- The famous "the the the ... " example shows that you can get BLEU precision
|
||
- by duplicating high frequency words.
|
||
-
|
||
- >>> reference1 = 'the cat is on the mat'.split()
|
||
- >>> reference2 = 'there is a cat on the mat'.split()
|
||
- >>> hypothesis1 = 'the the the the the the the'.split()
|
||
- >>> references = [reference1, reference2]
|
||
- >>> float(modified_precision(references, hypothesis1, n=1)) # doctest: +ELLIPSIS
|
||
- 0.2857...
|
||
-
|
||
- In the modified n-gram precision, a reference word will be considered
|
||
- exhausted after a matching hypothesis word is identified, e.g.
|
||
-
|
||
- >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
|
||
- ... 'ensures', 'that', 'the', 'military', 'will',
|
||
- ... 'forever', 'heed', 'Party', 'commands']
|
||
- >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
|
||
- ... 'guarantees', 'the', 'military', 'forces', 'always',
|
||
- ... 'being', 'under', 'the', 'command', 'of', 'the',
|
||
- ... 'Party']
|
||
- >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
|
||
- ... 'army', 'always', 'to', 'heed', 'the', 'directions',
|
||
- ... 'of', 'the', 'party']
|
||
- >>> hypothesis = 'of the'.split()
|
||
- >>> references = [reference1, reference2, reference3]
|
||
- >>> float(modified_precision(references, hypothesis, n=1))
|
||
- 1.0
|
||
- >>> float(modified_precision(references, hypothesis, n=2))
|
||
- 1.0
|
||
-
|
||
- An example of a normal machine translation hypothesis:
|
||
-
|
||
- >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
|
||
- ... 'ensures', 'that', 'the', 'military', 'always',
|
||
- ... 'obeys', 'the', 'commands', 'of', 'the', 'party']
|
||
-
|
||
- >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops',
|
||
- ... 'forever', 'hearing', 'the', 'activity', 'guidebook',
|
||
- ... 'that', 'party', 'direct']
|
||
-
|
||
- >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
|
||
- ... 'ensures', 'that', 'the', 'military', 'will',
|
||
- ... 'forever', 'heed', 'Party', 'commands']
|
||
-
|
||
- >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
|
||
- ... 'guarantees', 'the', 'military', 'forces', 'always',
|
||
- ... 'being', 'under', 'the', 'command', 'of', 'the',
|
||
- ... 'Party']
|
||
-
|
||
- >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
|
||
- ... 'army', 'always', 'to', 'heed', 'the', 'directions',
|
||
- ... 'of', 'the', 'party']
|
||
- >>> references = [reference1, reference2, reference3]
|
||
- >>> float(modified_precision(references, hypothesis1, n=1)) # doctest: +ELLIPSIS
|
||
- 0.9444...
|
||
- >>> float(modified_precision(references, hypothesis2, n=1)) # doctest: +ELLIPSIS
|
||
- 0.5714...
|
||
- >>> float(modified_precision(references, hypothesis1, n=2)) # doctest: +ELLIPSIS
|
||
- 0.5882352941176471
|
||
- >>> float(modified_precision(references, hypothesis2, n=2)) # doctest: +ELLIPSIS
|
||
- 0.07692...
|
||
-
|
||
-
|
||
- :param references: A list of reference translations.
|
||
- :type references: list(list(str))
|
||
- :param hypothesis: A hypothesis translation.
|
||
- :type hypothesis: list(str)
|
||
- :param n: The ngram order.
|
||
- :type n: int
|
||
- :return: BLEU's modified precision for the nth order ngram.
|
||
- :rtype: Fraction
|
||
- """
|
||
- # Extracts all ngrams in hypothesis
|
||
- # Set an empty Counter if hypothesis is empty.
|
||
- counts = Counter(ngrams(hypothesis, n)) if len(hypothesis) >= n else Counter()
|
||
- # Extract a union of references' counts.
|
||
- # max_counts = reduce(or_, [Counter(ngrams(ref, n)) for ref in references])
|
||
- max_counts = {}
|
||
- for reference in references:
|
||
- reference_counts = (
|
||
- Counter(ngrams(reference, n)) if len(reference) >= n else Counter()
|
||
- )
|
||
- for ngram in counts:
|
||
- max_counts[ngram] = max(max_counts.get(ngram, 0), reference_counts[ngram])
|
||
-
|
||
- # Assigns the intersection between hypothesis and references' counts.
|
||
- clipped_counts = {
|
||
- ngram: min(count, max_counts[ngram]) for ngram, count in counts.items()
|
||
- }
|
||
-
|
||
- numerator = sum(clipped_counts.values())
|
||
- # Ensures that denominator is minimum 1 to avoid ZeroDivisionError.
|
||
- # Usually this happens when the ngram order is > len(reference).
|
||
- denominator = max(1, sum(counts.values()))
|
||
-
|
||
- return Fraction(numerator, denominator, _normalize=False)
|
||
-
|
||
-
|
||
-def closest_ref_length(references, hyp_len):
|
||
- """
|
||
- This function finds the reference that is the closest length to the
|
||
- hypothesis. The closest reference length is referred to as *r* variable
|
||
- from the brevity penalty formula in Papineni et. al. (2002)
|
||
-
|
||
- :param references: A list of reference translations.
|
||
- :type references: list(list(str))
|
||
- :param hyp_len: The length of the hypothesis.
|
||
- :type hyp_len: int
|
||
- :return: The length of the reference that's closest to the hypothesis.
|
||
- :rtype: int
|
||
- """
|
||
- ref_lens = (len(reference) for reference in references)
|
||
- closest_ref_len = min(
|
||
- ref_lens, key=lambda ref_len: (abs(ref_len - hyp_len), ref_len)
|
||
- )
|
||
- return closest_ref_len
|
||
-
|
||
-
|
||
-def brevity_penalty(closest_ref_len, hyp_len):
|
||
- """
|
||
- Calculate brevity penalty.
|
||
-
|
||
- As the modified n-gram precision still has the problem from the short
|
||
- length sentence, brevity penalty is used to modify the overall BLEU
|
||
- score according to length.
|
||
-
|
||
- An example from the paper. There are three references with length 12, 15
|
||
- and 17. And a concise hypothesis of the length 12. The brevity penalty is 1.
|
||
-
|
||
- >>> reference1 = list('aaaaaaaaaaaa') # i.e. ['a'] * 12
|
||
- >>> reference2 = list('aaaaaaaaaaaaaaa') # i.e. ['a'] * 15
|
||
- >>> reference3 = list('aaaaaaaaaaaaaaaaa') # i.e. ['a'] * 17
|
||
- >>> hypothesis = list('aaaaaaaaaaaa') # i.e. ['a'] * 12
|
||
- >>> references = [reference1, reference2, reference3]
|
||
- >>> hyp_len = len(hypothesis)
|
||
- >>> closest_ref_len = closest_ref_length(references, hyp_len)
|
||
- >>> brevity_penalty(closest_ref_len, hyp_len)
|
||
- 1.0
|
||
-
|
||
- In case a hypothesis translation is shorter than the references, penalty is
|
||
- applied.
|
||
-
|
||
- >>> references = [['a'] * 28, ['a'] * 28]
|
||
- >>> hypothesis = ['a'] * 12
|
||
- >>> hyp_len = len(hypothesis)
|
||
- >>> closest_ref_len = closest_ref_length(references, hyp_len)
|
||
- >>> brevity_penalty(closest_ref_len, hyp_len)
|
||
- 0.2635971381157267
|
||
-
|
||
- The length of the closest reference is used to compute the penalty. If the
|
||
- length of a hypothesis is 12, and the reference lengths are 13 and 2, the
|
||
- penalty is applied because the hypothesis length (12) is less then the
|
||
- closest reference length (13).
|
||
-
|
||
- >>> references = [['a'] * 13, ['a'] * 2]
|
||
- >>> hypothesis = ['a'] * 12
|
||
- >>> hyp_len = len(hypothesis)
|
||
- >>> closest_ref_len = closest_ref_length(references, hyp_len)
|
||
- >>> brevity_penalty(closest_ref_len, hyp_len) # doctest: +ELLIPSIS
|
||
- 0.9200...
|
||
-
|
||
- The brevity penalty doesn't depend on reference order. More importantly,
|
||
- when two reference sentences are at the same distance, the shortest
|
||
- reference sentence length is used.
|
||
-
|
||
- >>> references = [['a'] * 13, ['a'] * 11]
|
||
- >>> hypothesis = ['a'] * 12
|
||
- >>> hyp_len = len(hypothesis)
|
||
- >>> closest_ref_len = closest_ref_length(references, hyp_len)
|
||
- >>> bp1 = brevity_penalty(closest_ref_len, hyp_len)
|
||
- >>> hyp_len = len(hypothesis)
|
||
- >>> closest_ref_len = closest_ref_length(reversed(references), hyp_len)
|
||
- >>> bp2 = brevity_penalty(closest_ref_len, hyp_len)
|
||
- >>> bp1 == bp2 == 1
|
||
- True
|
||
-
|
||
- A test example from mteval-v13a.pl (starting from the line 705):
|
||
-
|
||
- >>> references = [['a'] * 11, ['a'] * 8]
|
||
- >>> hypothesis = ['a'] * 7
|
||
- >>> hyp_len = len(hypothesis)
|
||
- >>> closest_ref_len = closest_ref_length(references, hyp_len)
|
||
- >>> brevity_penalty(closest_ref_len, hyp_len) # doctest: +ELLIPSIS
|
||
- 0.8668...
|
||
-
|
||
- >>> references = [['a'] * 11, ['a'] * 8, ['a'] * 6, ['a'] * 7]
|
||
- >>> hypothesis = ['a'] * 7
|
||
- >>> hyp_len = len(hypothesis)
|
||
- >>> closest_ref_len = closest_ref_length(references, hyp_len)
|
||
- >>> brevity_penalty(closest_ref_len, hyp_len)
|
||
- 1.0
|
||
-
|
||
- :param hyp_len: The length of the hypothesis for a single sentence OR the
|
||
- sum of all the hypotheses' lengths for a corpus
|
||
- :type hyp_len: int
|
||
- :param closest_ref_len: The length of the closest reference for a single
|
||
- hypothesis OR the sum of all the closest references for every hypotheses.
|
||
- :type closest_ref_len: int
|
||
- :return: BLEU's brevity penalty.
|
||
- :rtype: float
|
||
- """
|
||
- if hyp_len > closest_ref_len:
|
||
- return 1
|
||
- # If hypothesis is empty, brevity penalty = 0 should result in BLEU = 0.0
|
||
- elif hyp_len == 0:
|
||
- return 0
|
||
- else:
|
||
- return math.exp(1 - closest_ref_len / hyp_len)
|
||
-
|
||
-
|
||
-class SmoothingFunction:
|
||
- """
|
||
- This is an implementation of the smoothing techniques
|
||
- for segment-level BLEU scores that was presented in
|
||
- Boxing Chen and Collin Cherry (2014) A Systematic Comparison of
|
||
- Smoothing Techniques for Sentence-Level BLEU. In WMT14.
|
||
- http://acl2014.org/acl2014/W14-33/pdf/W14-3346.pdf
|
||
- """
|
||
-
|
||
- def __init__(self, epsilon=0.1, alpha=5, k=5):
|
||
- """
|
||
- This will initialize the parameters required for the various smoothing
|
||
- techniques, the default values are set to the numbers used in the
|
||
- experiments from Chen and Cherry (2014).
|
||
-
|
||
- >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', 'ensures',
|
||
- ... 'that', 'the', 'military', 'always', 'obeys', 'the',
|
||
- ... 'commands', 'of', 'the', 'party']
|
||
- >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', 'ensures',
|
||
- ... 'that', 'the', 'military', 'will', 'forever', 'heed',
|
||
- ... 'Party', 'commands']
|
||
-
|
||
- >>> chencherry = SmoothingFunction()
|
||
- >>> print(sentence_bleu([reference1], hypothesis1)) # doctest: +ELLIPSIS
|
||
- 0.4118...
|
||
- >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method0)) # doctest: +ELLIPSIS
|
||
- 0.4118...
|
||
- >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method1)) # doctest: +ELLIPSIS
|
||
- 0.4118...
|
||
- >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method2)) # doctest: +ELLIPSIS
|
||
- 0.4452...
|
||
- >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method3)) # doctest: +ELLIPSIS
|
||
- 0.4118...
|
||
- >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method4)) # doctest: +ELLIPSIS
|
||
- 0.4118...
|
||
- >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method5)) # doctest: +ELLIPSIS
|
||
- 0.4905...
|
||
- >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method6)) # doctest: +ELLIPSIS
|
||
- 0.4135...
|
||
- >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method7)) # doctest: +ELLIPSIS
|
||
- 0.4905...
|
||
-
|
||
- :param epsilon: the epsilon value use in method 1
|
||
- :type epsilon: float
|
||
- :param alpha: the alpha value use in method 6
|
||
- :type alpha: int
|
||
- :param k: the k value use in method 4
|
||
- :type k: int
|
||
- """
|
||
- self.epsilon = epsilon
|
||
- self.alpha = alpha
|
||
- self.k = k
|
||
-
|
||
- def method0(self, p_n, *args, **kwargs):
|
||
- """
|
||
- No smoothing.
|
||
- """
|
||
- p_n_new = []
|
||
- for i, p_i in enumerate(p_n):
|
||
- if p_i.numerator != 0:
|
||
- p_n_new.append(p_i)
|
||
- else:
|
||
- _msg = str(
|
||
- "\nThe hypothesis contains 0 counts of {}-gram overlaps.\n"
|
||
- "Therefore the BLEU score evaluates to 0, independently of\n"
|
||
- "how many N-gram overlaps of lower order it contains.\n"
|
||
- "Consider using lower n-gram order or use "
|
||
- "SmoothingFunction()"
|
||
- ).format(i + 1)
|
||
- warnings.warn(_msg)
|
||
- # When numerator==0 where denonminator==0 or !=0, the result
|
||
- # for the precision score should be equal to 0 or undefined.
|
||
- # Due to BLEU geometric mean computation in logarithm space,
|
||
- # we we need to take the return sys.float_info.min such that
|
||
- # math.log(sys.float_info.min) returns a 0 precision score.
|
||
- p_n_new.append(sys.float_info.min)
|
||
- return p_n_new
|
||
-
|
||
- def method1(self, p_n, *args, **kwargs):
|
||
- """
|
||
- Smoothing method 1: Add *epsilon* counts to precision with 0 counts.
|
||
- """
|
||
- return [
|
||
- (p_i.numerator + self.epsilon) / p_i.denominator
|
||
- if p_i.numerator == 0
|
||
- else p_i
|
||
- for p_i in p_n
|
||
- ]
|
||
-
|
||
- def method2(self, p_n, *args, **kwargs):
|
||
- """
|
||
- Smoothing method 2: Add 1 to both numerator and denominator from
|
||
- Chin-Yew Lin and Franz Josef Och (2004) ORANGE: a Method for
|
||
- Evaluating Automatic Evaluation Metrics for Machine Translation.
|
||
- In COLING 2004.
|
||
- """
|
||
- return [
|
||
- Fraction(p_n[i].numerator + 1, p_n[i].denominator + 1, _normalize=False)
|
||
- if i != 0
|
||
- else p_n[0]
|
||
- for i in range(len(p_n))
|
||
- ]
|
||
-
|
||
- def method3(self, p_n, *args, **kwargs):
|
||
- """
|
||
- Smoothing method 3: NIST geometric sequence smoothing
|
||
- The smoothing is computed by taking 1 / ( 2^k ), instead of 0, for each
|
||
- precision score whose matching n-gram count is null.
|
||
- k is 1 for the first 'n' value for which the n-gram match count is null/
|
||
-
|
||
- For example, if the text contains:
|
||
-
|
||
- - one 2-gram match
|
||
- - and (consequently) two 1-gram matches
|
||
-
|
||
- the n-gram count for each individual precision score would be:
|
||
-
|
||
- - n=1 => prec_count = 2 (two unigrams)
|
||
- - n=2 => prec_count = 1 (one bigram)
|
||
- - n=3 => prec_count = 1/2 (no trigram, taking 'smoothed' value of 1 / ( 2^k ), with k=1)
|
||
- - n=4 => prec_count = 1/4 (no fourgram, taking 'smoothed' value of 1 / ( 2^k ), with k=2)
|
||
- """
|
||
- incvnt = 1 # From the mteval-v13a.pl, it's referred to as k.
|
||
- for i, p_i in enumerate(p_n):
|
||
- if p_i.numerator == 0:
|
||
- p_n[i] = 1 / (2**incvnt * p_i.denominator)
|
||
- incvnt += 1
|
||
- return p_n
|
||
-
|
||
- def method4(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
|
||
- """
|
||
- Smoothing method 4:
|
||
- Shorter translations may have inflated precision values due to having
|
||
- smaller denominators; therefore, we give them proportionally
|
||
- smaller smoothed counts. Instead of scaling to 1/(2^k), Chen and Cherry
|
||
- suggests dividing by 1/ln(len(T)), where T is the length of the translation.
|
||
- """
|
||
- incvnt = 1
|
||
- hyp_len = hyp_len if hyp_len else len(hypothesis)
|
||
- for i, p_i in enumerate(p_n):
|
||
- if p_i.numerator == 0 and hyp_len > 1:
|
||
- # incvnt = i + 1 * self.k / math.log(
|
||
- # hyp_len
|
||
- # ) # Note that this K is different from the K from NIST.
|
||
- # p_n[i] = incvnt / p_i.denominator\
|
||
- numerator = 1 / (2**incvnt * self.k / math.log(hyp_len))
|
||
- p_n[i] = numerator / p_i.denominator
|
||
- incvnt += 1
|
||
- return p_n
|
||
-
|
||
- def method5(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
|
||
- """
|
||
- Smoothing method 5:
|
||
- The matched counts for similar values of n should be similar. To a
|
||
- calculate the n-gram matched count, it averages the n−1, n and n+1 gram
|
||
- matched counts.
|
||
- """
|
||
- hyp_len = hyp_len if hyp_len else len(hypothesis)
|
||
- m = {}
|
||
- # Requires an precision value for an addition ngram order.
|
||
- p_n_plus1 = p_n + [modified_precision(references, hypothesis, 5)]
|
||
- m[-1] = p_n[0] + 1
|
||
- for i, p_i in enumerate(p_n):
|
||
- p_n[i] = (m[i - 1] + p_i + p_n_plus1[i + 1]) / 3
|
||
- m[i] = p_n[i]
|
||
- return p_n
|
||
-
|
||
- def method6(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
|
||
- """
|
||
- Smoothing method 6:
|
||
- Interpolates the maximum likelihood estimate of the precision *p_n* with
|
||
- a prior estimate *pi0*. The prior is estimated by assuming that the ratio
|
||
- between pn and pn−1 will be the same as that between pn−1 and pn−2; from
|
||
- Gao and He (2013) Training MRF-Based Phrase Translation Models using
|
||
- Gradient Ascent. In NAACL.
|
||
- """
|
||
- hyp_len = hyp_len if hyp_len else len(hypothesis)
|
||
- # This smoothing only works when p_1 and p_2 is non-zero.
|
||
- # Raise an error with an appropriate message when the input is too short
|
||
- # to use this smoothing technique.
|
||
- assert p_n[2], "This smoothing method requires non-zero precision for bigrams."
|
||
- for i, p_i in enumerate(p_n):
|
||
- if i in [0, 1]: # Skips the first 2 orders of ngrams.
|
||
- continue
|
||
- else:
|
||
- pi0 = 0 if p_n[i - 2] == 0 else p_n[i - 1] ** 2 / p_n[i - 2]
|
||
- # No. of ngrams in translation that matches the reference.
|
||
- m = p_i.numerator
|
||
- # No. of ngrams in translation.
|
||
- l = sum(1 for _ in ngrams(hypothesis, i + 1))
|
||
- # Calculates the interpolated precision.
|
||
- p_n[i] = (m + self.alpha * pi0) / (l + self.alpha)
|
||
- return p_n
|
||
-
|
||
- def method7(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
|
||
- """
|
||
- Smoothing method 7:
|
||
- Interpolates methods 4 and 5.
|
||
- """
|
||
- hyp_len = hyp_len if hyp_len else len(hypothesis)
|
||
- p_n = self.method4(p_n, references, hypothesis, hyp_len)
|
||
- p_n = self.method5(p_n, references, hypothesis, hyp_len)
|
||
- return p_n
|
||
+# Natural Language Toolkit: BLEU Score
+#
+# Copyright (C) 2001-2023 NLTK Project
+# Authors: Chin Yee Lee, Hengfeng Li, Ruxin Hou, Calvin Tanujaya Lim
+# Contributors: Björn Mattsson, Dmitrijs Milajevs, Liling Tan
+# URL: <https://www.nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""BLEU score implementation."""
+import math
+import sys
+import warnings
+from collections import Counter
+from fractions import Fraction as _Fraction
+
+from nltk.util import ngrams
+
+
+class Fraction(_Fraction):
+    """Fraction with _normalize=False support for 3.12"""
+
+    def __new__(cls, numerator=0, denominator=None, _normalize=False):
+        if sys.version_info >= (3, 12):
+            self = super().__new__(cls, numerator, denominator)
+        else:
+            self = super().__new__(cls, numerator, denominator, _normalize=_normalize)
+        self._normalize = _normalize
+        self._original_numerator = numerator
+        self._original_denominator = denominator
+        return self
+
+    @property
+    def numerator(self):
+        if not self._normalize:
+            return self._original_numerator
+        return super().numerator
+
+    @property
+    def denominator(self):
+        if not self._normalize:
+            return self._original_denominator
+        return super().denominator
+
+
+def sentence_bleu(
+    references,
+    hypothesis,
+    weights=(0.25, 0.25, 0.25, 0.25),
+    smoothing_function=None,
+    auto_reweigh=False,
+):
+ """
|
||
+ Calculate BLEU score (Bilingual Evaluation Understudy) from
|
||
+ Papineni, Kishore, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002.
|
||
+ "BLEU: a method for automatic evaluation of machine translation."
|
||
+ In Proceedings of ACL. https://www.aclweb.org/anthology/P02-1040.pdf
|
||
+
|
||
+ >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
|
||
+ ... 'ensures', 'that', 'the', 'military', 'always',
|
||
+ ... 'obeys', 'the', 'commands', 'of', 'the', 'party']
|
||
+
|
||
+ >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops',
|
||
+ ... 'forever', 'hearing', 'the', 'activity', 'guidebook',
|
||
+ ... 'that', 'party', 'direct']
|
||
+
|
||
+ >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
|
||
+ ... 'ensures', 'that', 'the', 'military', 'will', 'forever',
|
||
+ ... 'heed', 'Party', 'commands']
|
||
+
|
||
+ >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
|
||
+ ... 'guarantees', 'the', 'military', 'forces', 'always',
|
||
+ ... 'being', 'under', 'the', 'command', 'of', 'the',
|
||
+ ... 'Party']
|
||
+
|
||
+ >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
|
||
+ ... 'army', 'always', 'to', 'heed', 'the', 'directions',
|
||
+ ... 'of', 'the', 'party']
|
||
+
|
||
+ >>> sentence_bleu([reference1, reference2, reference3], hypothesis1) # doctest: +ELLIPSIS
|
||
+ 0.5045...
|
||
+
|
||
+ If there is no ngrams overlap for any order of n-grams, BLEU returns the
|
||
+ value 0. This is because the precision for the order of n-grams without
|
||
+ overlap is 0, and the geometric mean in the final BLEU score computation
|
||
+ multiplies the 0 with the precision of other n-grams. This results in 0
|
||
+ (independently of the precision of the other n-gram orders). The following
|
||
+ example has zero 3-gram and 4-gram overlaps:
|
||
+
|
||
+ >>> round(sentence_bleu([reference1, reference2, reference3], hypothesis2),4) # doctest: +ELLIPSIS
|
||
+ 0.0
|
||
+
|
||
+ To avoid this harsh behaviour when no ngram overlaps are found a smoothing
|
||
+ function can be used.
|
||
+
|
||
+ >>> chencherry = SmoothingFunction()
|
||
+ >>> sentence_bleu([reference1, reference2, reference3], hypothesis2,
|
||
+ ... smoothing_function=chencherry.method1) # doctest: +ELLIPSIS
|
||
+ 0.0370...
|
||
+
|
||
+ The default BLEU calculates a score for up to 4-grams using uniform
|
||
+ weights (this is called BLEU-4). To evaluate your translations with
|
||
+ higher/lower order ngrams, use customized weights. E.g. when accounting
|
||
+ for up to 5-grams with uniform weights (this is called BLEU-5) use:
|
||
+
|
||
+ >>> weights = (1./5., 1./5., 1./5., 1./5., 1./5.)
|
||
+ >>> sentence_bleu([reference1, reference2, reference3], hypothesis1, weights) # doctest: +ELLIPSIS
|
||
+ 0.3920...
|
||
+
|
||
+ Multiple BLEU scores can be computed at once, by supplying a list of weights.
|
||
+ E.g. for computing BLEU-2, BLEU-3 *and* BLEU-4 in one computation, use:
|
||
+ >>> weights = [
|
||
+ ... (1./2., 1./2.),
|
||
+ ... (1./3., 1./3., 1./3.),
|
||
+ ... (1./4., 1./4., 1./4., 1./4.)
|
||
+ ... ]
|
||
+ >>> sentence_bleu([reference1, reference2, reference3], hypothesis1, weights) # doctest: +ELLIPSIS
|
||
+ [0.7453..., 0.6240..., 0.5045...]
|
||
+
|
||
+ :param references: reference sentences
|
||
+ :type references: list(list(str))
|
||
+ :param hypothesis: a hypothesis sentence
|
||
+ :type hypothesis: list(str)
|
||
+ :param weights: weights for unigrams, bigrams, trigrams and so on (one or a list of weights)
|
||
+ :type weights: tuple(float) / list(tuple(float))
|
||
+ :param smoothing_function:
|
||
+ :type smoothing_function: SmoothingFunction
|
||
+ :param auto_reweigh: Option to re-normalize the weights uniformly.
|
||
+ :type auto_reweigh: bool
|
||
+ :return: The sentence-level BLEU score. Returns a list if multiple weights were supplied.
|
||
+ :rtype: float / list(float)
|
||
+ """
|
||
+ return corpus_bleu(
|
||
+ [references], [hypothesis], weights, smoothing_function, auto_reweigh
|
||
+ )
|
||
+
|
||
+
|
||
+def corpus_bleu(
|
||
+ list_of_references,
|
||
+ hypotheses,
|
||
+ weights=(0.25, 0.25, 0.25, 0.25),
|
||
+ smoothing_function=None,
|
||
+ auto_reweigh=False,
|
||
+):
|
||
+ """
|
||
+ Calculate a single corpus-level BLEU score (aka. system-level BLEU) for all
|
||
+ the hypotheses and their respective references.
|
||
+
|
||
+ Instead of averaging the sentence level BLEU scores (i.e. macro-average
|
||
+ precision), the original BLEU metric (Papineni et al. 2002) accounts for
|
||
+ the micro-average precision (i.e. summing the numerators and denominators
|
||
+ for each hypothesis-reference(s) pairs before the division).
|
||
+
|
||
+ >>> hyp1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
|
||
+ ... 'ensures', 'that', 'the', 'military', 'always',
|
||
+ ... 'obeys', 'the', 'commands', 'of', 'the', 'party']
|
||
+ >>> ref1a = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
|
||
+ ... 'ensures', 'that', 'the', 'military', 'will', 'forever',
|
||
+ ... 'heed', 'Party', 'commands']
|
||
+ >>> ref1b = ['It', 'is', 'the', 'guiding', 'principle', 'which',
|
||
+ ... 'guarantees', 'the', 'military', 'forces', 'always',
|
||
+ ... 'being', 'under', 'the', 'command', 'of', 'the', 'Party']
|
||
+ >>> ref1c = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
|
||
+ ... 'army', 'always', 'to', 'heed', 'the', 'directions',
|
||
+ ... 'of', 'the', 'party']
|
||
+
|
||
+ >>> hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was',
|
||
+ ... 'interested', 'in', 'world', 'history']
|
||
+ >>> ref2a = ['he', 'was', 'interested', 'in', 'world', 'history',
|
||
+ ... 'because', 'he', 'read', 'the', 'book']
|
||
+
|
||
+ >>> list_of_references = [[ref1a, ref1b, ref1c], [ref2a]]
|
||
+ >>> hypotheses = [hyp1, hyp2]
|
||
+ >>> corpus_bleu(list_of_references, hypotheses) # doctest: +ELLIPSIS
|
||
+ 0.5920...
|
||
+
|
||
+ The example below show that corpus_bleu() is different from averaging
|
||
+ sentence_bleu() for hypotheses
|
||
+
|
||
+ >>> score1 = sentence_bleu([ref1a, ref1b, ref1c], hyp1)
|
||
+ >>> score2 = sentence_bleu([ref2a], hyp2)
|
||
+ >>> (score1 + score2) / 2 # doctest: +ELLIPSIS
|
||
+ 0.6223...
|
||
+
|
||
+ Custom weights may be supplied to fine-tune the BLEU score further.
|
||
+ A tuple of float weights for unigrams, bigrams, trigrams and so on can be given.
|
||
+ >>> weights = (0.1, 0.3, 0.5, 0.1)
|
||
+ >>> corpus_bleu(list_of_references, hypotheses, weights=weights) # doctest: +ELLIPSIS
|
||
+ 0.5818...
|
||
+
|
||
+ This particular weight gave extra value to trigrams.
|
||
+ Furthermore, multiple weights can be given, resulting in multiple BLEU scores.
|
||
+ >>> weights = [
|
||
+ ... (0.5, 0.5),
|
||
+ ... (0.333, 0.333, 0.334),
|
||
+ ... (0.25, 0.25, 0.25, 0.25),
|
||
+ ... (0.2, 0.2, 0.2, 0.2, 0.2)
|
||
+ ... ]
|
||
+ >>> corpus_bleu(list_of_references, hypotheses, weights=weights) # doctest: +ELLIPSIS
|
||
+ [0.8242..., 0.7067..., 0.5920..., 0.4719...]
|
||
+
|
||
+ :param list_of_references: a corpus of lists of reference sentences, w.r.t. hypotheses
|
||
+ :type list_of_references: list(list(list(str)))
|
||
+ :param hypotheses: a list of hypothesis sentences
|
||
+ :type hypotheses: list(list(str))
|
||
+ :param weights: weights for unigrams, bigrams, trigrams and so on (one or a list of weights)
|
||
+ :type weights: tuple(float) / list(tuple(float))
|
||
+ :param smoothing_function:
|
||
+ :type smoothing_function: SmoothingFunction
|
||
+ :param auto_reweigh: Option to re-normalize the weights uniformly.
|
||
+ :type auto_reweigh: bool
|
||
+ :return: The corpus-level BLEU score.
|
||
+ :rtype: float
|
||
+ """
|
||
+ # Before proceeding to compute BLEU, perform sanity checks.
|
||
+
|
||
+ p_numerators = Counter() # Key = ngram order, and value = no. of ngram matches.
|
||
+ p_denominators = Counter() # Key = ngram order, and value = no. of ngram in ref.
|
||
+ hyp_lengths, ref_lengths = 0, 0
|
||
+
|
||
+ assert len(list_of_references) == len(hypotheses), (
|
||
+ "The number of hypotheses and their reference(s) should be the " "same "
|
||
+ )
|
||
+
|
||
+ try:
|
||
+ weights[0][0]
|
||
+ except TypeError:
|
||
+ weights = [weights]
|
||
+ max_weight_length = max(len(weight) for weight in weights)
|
||
+
|
||
+ # Iterate through each hypothesis and their corresponding references.
|
||
+ for references, hypothesis in zip(list_of_references, hypotheses):
|
||
+ # For each order of ngram, calculate the numerator and
|
||
+ # denominator for the corpus-level modified precision.
|
||
+ for i in range(1, max_weight_length + 1):
|
||
+ p_i = modified_precision(references, hypothesis, i)
|
||
+ p_numerators[i] += p_i.numerator
|
||
+ p_denominators[i] += p_i.denominator
|
||
+
|
||
+ # Calculate the hypothesis length and the closest reference length.
|
||
+ # Adds them to the corpus-level hypothesis and reference counts.
|
||
+ hyp_len = len(hypothesis)
|
||
+ hyp_lengths += hyp_len
|
||
+ ref_lengths += closest_ref_length(references, hyp_len)
|
||
+
|
||
+ # Calculate corpus-level brevity penalty.
|
||
+ bp = brevity_penalty(ref_lengths, hyp_lengths)
|
||
+
|
||
+ # Collects the various precision values for the different ngram orders.
|
||
+ p_n = [
|
||
+ Fraction(p_numerators[i], p_denominators[i], _normalize=False)
|
||
+ for i in range(1, max_weight_length + 1)
|
||
+ ]
|
||
+
|
||
+ # Returns 0 if there's no matching n-grams
|
||
+ # We only need to check for p_numerators[1] == 0, since if there's
|
||
+ # no unigrams, there won't be any higher order ngrams.
|
||
+ if p_numerators[1] == 0:
|
||
+ return 0 if len(weights) == 1 else [0] * len(weights)
|
||
+
|
||
+ # If there's no smoothing, set use method0 from SmoothinFunction class.
|
||
+ if not smoothing_function:
|
||
+ smoothing_function = SmoothingFunction().method0
|
||
+ # Smoothen the modified precision.
|
||
+ # Note: smoothing_function() may convert values into floats;
|
||
+ # it tries to retain the Fraction object as much as the
|
||
+ # smoothing method allows.
|
||
+ p_n = smoothing_function(
|
||
+ p_n, references=references, hypothesis=hypothesis, hyp_len=hyp_lengths
|
||
+ )
|
||
+
|
||
+ bleu_scores = []
|
||
+ for weight in weights:
|
||
+ # Uniformly re-weighting based on maximum hypothesis lengths if largest
|
||
+ # order of n-grams < 4 and weights is set at default.
|
||
+ if auto_reweigh:
|
||
+ if hyp_lengths < 4 and weight == (0.25, 0.25, 0.25, 0.25):
|
||
+ weight = (1 / hyp_lengths,) * hyp_lengths
|
||
+
|
||
+ s = (w_i * math.log(p_i) for w_i, p_i in zip(weight, p_n) if p_i > 0)
|
||
+ s = bp * math.exp(math.fsum(s))
|
||
+ bleu_scores.append(s)
|
||
+ return bleu_scores[0] if len(weights) == 1 else bleu_scores
|
||
+
|
||
+
|
||
+def modified_precision(references, hypothesis, n):
|
||
+ """
|
||
+ Calculate modified ngram precision.
|
||
+
|
||
+ The normal precision method may lead to some wrong translations with
|
||
+ high-precision, e.g., the translation, in which a word of reference
|
||
+ repeats several times, has very high precision.
|
||
+
|
||
+ This function only returns the Fraction object that contains the numerator
|
||
+ and denominator necessary to calculate the corpus-level precision.
|
||
+ To calculate the modified precision for a single pair of hypothesis and
|
||
+ references, cast the Fraction object into a float.
|
||
+
|
||
+ The famous "the the the ... " example shows that you can get BLEU precision
|
||
+ by duplicating high frequency words.
|
||
+
|
||
+ >>> reference1 = 'the cat is on the mat'.split()
|
||
+ >>> reference2 = 'there is a cat on the mat'.split()
|
||
+ >>> hypothesis1 = 'the the the the the the the'.split()
|
||
+ >>> references = [reference1, reference2]
|
||
+ >>> float(modified_precision(references, hypothesis1, n=1)) # doctest: +ELLIPSIS
|
||
+ 0.2857...
|
||
+
|
||
+ In the modified n-gram precision, a reference word will be considered
|
||
+ exhausted after a matching hypothesis word is identified, e.g.
|
||
+
|
||
+ >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
|
||
+ ... 'ensures', 'that', 'the', 'military', 'will',
|
||
+ ... 'forever', 'heed', 'Party', 'commands']
|
||
+ >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
|
||
+ ... 'guarantees', 'the', 'military', 'forces', 'always',
|
||
+ ... 'being', 'under', 'the', 'command', 'of', 'the',
|
||
+ ... 'Party']
|
||
+ >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
|
||
+ ... 'army', 'always', 'to', 'heed', 'the', 'directions',
|
||
+ ... 'of', 'the', 'party']
|
||
+ >>> hypothesis = 'of the'.split()
|
||
+ >>> references = [reference1, reference2, reference3]
|
||
+ >>> float(modified_precision(references, hypothesis, n=1))
|
||
+ 1.0
|
||
+ >>> float(modified_precision(references, hypothesis, n=2))
|
||
+ 1.0
|
||
+
|
||
+ An example of a normal machine translation hypothesis:
|
||
+
|
||
+ >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
|
||
+ ... 'ensures', 'that', 'the', 'military', 'always',
|
||
+ ... 'obeys', 'the', 'commands', 'of', 'the', 'party']
|
||
+
|
||
+ >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops',
|
||
+ ... 'forever', 'hearing', 'the', 'activity', 'guidebook',
|
||
+ ... 'that', 'party', 'direct']
|
||
+
|
||
+ >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
|
||
+ ... 'ensures', 'that', 'the', 'military', 'will',
|
||
+ ... 'forever', 'heed', 'Party', 'commands']
|
||
+
|
||
+ >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
|
||
+ ... 'guarantees', 'the', 'military', 'forces', 'always',
|
||
+ ... 'being', 'under', 'the', 'command', 'of', 'the',
|
||
+ ... 'Party']
|
||
+
|
||
+ >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
|
||
+ ... 'army', 'always', 'to', 'heed', 'the', 'directions',
|
||
+ ... 'of', 'the', 'party']
|
||
+ >>> references = [reference1, reference2, reference3]
|
||
+ >>> float(modified_precision(references, hypothesis1, n=1)) # doctest: +ELLIPSIS
|
||
+ 0.9444...
|
||
+ >>> float(modified_precision(references, hypothesis2, n=1)) # doctest: +ELLIPSIS
|
||
+ 0.5714...
|
||
+ >>> float(modified_precision(references, hypothesis1, n=2)) # doctest: +ELLIPSIS
|
||
+ 0.5882352941176471
|
||
+ >>> float(modified_precision(references, hypothesis2, n=2)) # doctest: +ELLIPSIS
|
||
+ 0.07692...
|
||
+
|
||
+
|
||
+ :param references: A list of reference translations.
|
||
+ :type references: list(list(str))
|
||
+ :param hypothesis: A hypothesis translation.
|
||
+ :type hypothesis: list(str)
|
||
+ :param n: The ngram order.
|
||
+ :type n: int
|
||
+ :return: BLEU's modified precision for the nth order ngram.
|
||
+ :rtype: Fraction
|
||
+ """
|
||
+ # Extracts all ngrams in hypothesis
|
||
+ # Set an empty Counter if hypothesis is empty.
|
||
+ counts = Counter(ngrams(hypothesis, n)) if len(hypothesis) >= n else Counter()
|
||
+ # Extract a union of references' counts.
|
||
+ # max_counts = reduce(or_, [Counter(ngrams(ref, n)) for ref in references])
|
||
+ max_counts = {}
|
||
+ for reference in references:
|
||
+ reference_counts = (
|
||
+ Counter(ngrams(reference, n)) if len(reference) >= n else Counter()
|
||
+ )
|
||
+ for ngram in counts:
|
||
+ max_counts[ngram] = max(max_counts.get(ngram, 0), reference_counts[ngram])
|
||
+
|
||
+ # Assigns the intersection between hypothesis and references' counts.
|
||
+ clipped_counts = {
|
||
+ ngram: min(count, max_counts[ngram]) for ngram, count in counts.items()
|
||
+ }
|
||
+
|
||
+ numerator = sum(clipped_counts.values())
|
||
+ # Ensures that denominator is minimum 1 to avoid ZeroDivisionError.
|
||
+ # Usually this happens when the ngram order is > len(reference).
|
||
+ denominator = max(1, sum(counts.values()))
|
||
+
|
||
+ return Fraction(numerator, denominator, _normalize=False)
|
||
+
|
||
+
|
||
+def closest_ref_length(references, hyp_len):
|
||
+ """
|
||
+ This function finds the reference that is the closest length to the
|
||
+ hypothesis. The closest reference length is referred to as *r* variable
|
||
+ from the brevity penalty formula in Papineni et. al. (2002)
|
||
+
|
||
+ :param references: A list of reference translations.
|
||
+ :type references: list(list(str))
|
||
+ :param hyp_len: The length of the hypothesis.
|
||
+ :type hyp_len: int
|
||
+ :return: The length of the reference that's closest to the hypothesis.
|
||
+ :rtype: int
|
||
+ """
|
||
+ ref_lens = (len(reference) for reference in references)
|
||
+ closest_ref_len = min(
|
||
+ ref_lens, key=lambda ref_len: (abs(ref_len - hyp_len), ref_len)
|
||
+ )
|
||
+ return closest_ref_len
|
||
+
|
||
+
|
||
+def brevity_penalty(closest_ref_len, hyp_len):
|
||
+ """
|
||
+ Calculate brevity penalty.
|
||
+
|
||
+ As the modified n-gram precision still has the problem from the short
|
||
+ length sentence, brevity penalty is used to modify the overall BLEU
|
||
+ score according to length.
|
||
+
|
||
+ An example from the paper. There are three references with length 12, 15
|
||
+ and 17. And a concise hypothesis of the length 12. The brevity penalty is 1.
|
||
+
|
||
+ >>> reference1 = list('aaaaaaaaaaaa') # i.e. ['a'] * 12
|
||
+ >>> reference2 = list('aaaaaaaaaaaaaaa') # i.e. ['a'] * 15
|
||
+ >>> reference3 = list('aaaaaaaaaaaaaaaaa') # i.e. ['a'] * 17
|
||
+ >>> hypothesis = list('aaaaaaaaaaaa') # i.e. ['a'] * 12
|
||
+ >>> references = [reference1, reference2, reference3]
|
||
+ >>> hyp_len = len(hypothesis)
|
||
+ >>> closest_ref_len = closest_ref_length(references, hyp_len)
|
||
+ >>> brevity_penalty(closest_ref_len, hyp_len)
|
||
+ 1.0
|
||
+
|
||
+ In case a hypothesis translation is shorter than the references, penalty is
|
||
+ applied.
|
||
+
|
||
+ >>> references = [['a'] * 28, ['a'] * 28]
|
||
+ >>> hypothesis = ['a'] * 12
|
||
+ >>> hyp_len = len(hypothesis)
|
||
+ >>> closest_ref_len = closest_ref_length(references, hyp_len)
|
||
+ >>> brevity_penalty(closest_ref_len, hyp_len)
|
||
+ 0.2635971381157267
|
||
+
|
||
+ The length of the closest reference is used to compute the penalty. If the
|
||
+ length of a hypothesis is 12, and the reference lengths are 13 and 2, the
|
||
+ penalty is applied because the hypothesis length (12) is less then the
|
||
+ closest reference length (13).
|
||
+
|
||
+ >>> references = [['a'] * 13, ['a'] * 2]
|
||
+ >>> hypothesis = ['a'] * 12
|
||
+ >>> hyp_len = len(hypothesis)
|
||
+ >>> closest_ref_len = closest_ref_length(references, hyp_len)
|
||
+ >>> brevity_penalty(closest_ref_len, hyp_len) # doctest: +ELLIPSIS
|
||
+ 0.9200...
|
||
+
|
||
+ The brevity penalty doesn't depend on reference order. More importantly,
|
||
+ when two reference sentences are at the same distance, the shortest
|
||
+ reference sentence length is used.
|
||
+
|
||
+ >>> references = [['a'] * 13, ['a'] * 11]
|
||
+ >>> hypothesis = ['a'] * 12
|
||
+ >>> hyp_len = len(hypothesis)
|
||
+ >>> closest_ref_len = closest_ref_length(references, hyp_len)
|
||
+ >>> bp1 = brevity_penalty(closest_ref_len, hyp_len)
|
||
+ >>> hyp_len = len(hypothesis)
|
||
+ >>> closest_ref_len = closest_ref_length(reversed(references), hyp_len)
|
||
+ >>> bp2 = brevity_penalty(closest_ref_len, hyp_len)
|
||
+ >>> bp1 == bp2 == 1
|
||
+ True
|
||
+
|
||
+ A test example from mteval-v13a.pl (starting from the line 705):
|
||
+
|
||
+ >>> references = [['a'] * 11, ['a'] * 8]
|
||
+ >>> hypothesis = ['a'] * 7
|
||
+ >>> hyp_len = len(hypothesis)
|
||
+ >>> closest_ref_len = closest_ref_length(references, hyp_len)
|
||
+ >>> brevity_penalty(closest_ref_len, hyp_len) # doctest: +ELLIPSIS
|
||
+ 0.8668...
|
||
+
|
||
+ >>> references = [['a'] * 11, ['a'] * 8, ['a'] * 6, ['a'] * 7]
|
||
+ >>> hypothesis = ['a'] * 7
|
||
+ >>> hyp_len = len(hypothesis)
|
||
+ >>> closest_ref_len = closest_ref_length(references, hyp_len)
|
||
+ >>> brevity_penalty(closest_ref_len, hyp_len)
|
||
+ 1.0
|
||
+
|
||
+ :param hyp_len: The length of the hypothesis for a single sentence OR the
|
||
+ sum of all the hypotheses' lengths for a corpus
|
||
+ :type hyp_len: int
|
||
+ :param closest_ref_len: The length of the closest reference for a single
|
||
+ hypothesis OR the sum of all the closest references for every hypotheses.
|
||
+ :type closest_ref_len: int
|
||
+ :return: BLEU's brevity penalty.
|
||
+ :rtype: float
|
||
+ """
|
||
+ if hyp_len > closest_ref_len:
|
||
+ return 1
|
||
+ # If hypothesis is empty, brevity penalty = 0 should result in BLEU = 0.0
|
||
+ elif hyp_len == 0:
|
||
+ return 0
|
||
+ else:
|
||
+ return math.exp(1 - closest_ref_len / hyp_len)
|
||
+
|
||
+
|
||
+class SmoothingFunction:
|
||
+ """
|
||
+ This is an implementation of the smoothing techniques
|
||
+ for segment-level BLEU scores that was presented in
|
||
+ Boxing Chen and Collin Cherry (2014) A Systematic Comparison of
|
||
+ Smoothing Techniques for Sentence-Level BLEU. In WMT14.
|
||
+ http://acl2014.org/acl2014/W14-33/pdf/W14-3346.pdf
|
||
+ """
|
||
+
|
||
+ def __init__(self, epsilon=0.1, alpha=5, k=5):
|
||
+ """
|
||
+ This will initialize the parameters required for the various smoothing
|
||
+ techniques, the default values are set to the numbers used in the
|
||
+ experiments from Chen and Cherry (2014).
|
||
+
|
||
+ >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', 'ensures',
|
||
+ ... 'that', 'the', 'military', 'always', 'obeys', 'the',
|
||
+ ... 'commands', 'of', 'the', 'party']
|
||
+ >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', 'ensures',
|
||
+ ... 'that', 'the', 'military', 'will', 'forever', 'heed',
|
||
+ ... 'Party', 'commands']
|
||
+
|
||
+ >>> chencherry = SmoothingFunction()
|
||
+ >>> print(sentence_bleu([reference1], hypothesis1)) # doctest: +ELLIPSIS
|
||
+ 0.4118...
|
||
+ >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method0)) # doctest: +ELLIPSIS
|
||
+ 0.4118...
|
||
+ >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method1)) # doctest: +ELLIPSIS
|
||
+ 0.4118...
|
||
+ >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method2)) # doctest: +ELLIPSIS
|
||
+ 0.4452...
|
||
+ >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method3)) # doctest: +ELLIPSIS
|
||
+ 0.4118...
|
||
+ >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method4)) # doctest: +ELLIPSIS
|
||
+ 0.4118...
|
||
+ >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method5)) # doctest: +ELLIPSIS
|
||
+ 0.4905...
|
||
+ >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method6)) # doctest: +ELLIPSIS
|
||
+ 0.4135...
|
||
+ >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method7)) # doctest: +ELLIPSIS
|
||
+ 0.4905...
|
||
+
|
||
+ :param epsilon: the epsilon value use in method 1
|
||
+ :type epsilon: float
|
||
+ :param alpha: the alpha value use in method 6
|
||
+ :type alpha: int
|
||
+ :param k: the k value use in method 4
|
||
+ :type k: int
|
||
+ """
|
||
+ self.epsilon = epsilon
|
||
+ self.alpha = alpha
|
||
+ self.k = k
|
||
+
|
||
+    def method0(self, p_n, *args, **kwargs):
+        """
+        No smoothing.
+        """
+        p_n_new = []
+        for i, p_i in enumerate(p_n):
+            if p_i.numerator != 0:
+                p_n_new.append(p_i)
+            else:
+                _msg = str(
+                    "\nThe hypothesis contains 0 counts of {}-gram overlaps.\n"
+                    "Therefore the BLEU score evaluates to 0, independently of\n"
+                    "how many N-gram overlaps of lower order it contains.\n"
+                    "Consider using lower n-gram order or use "
+                    "SmoothingFunction()"
+                ).format(i + 1)
+                warnings.warn(_msg)
+                # When numerator==0 (whether denominator==0 or !=0), the result
+                # for the precision score should be equal to 0 or undefined.
+                # Due to BLEU geometric mean computation in logarithm space,
+                # we need to return sys.float_info.min so that
+                # math.log(sys.float_info.min) returns a 0 precision score.
+                p_n_new.append(sys.float_info.min)
+        return p_n_new
+
+    def method1(self, p_n, *args, **kwargs):
+        """
+        Smoothing method 1: Add *epsilon* counts to precision with 0 counts.
+        """
+        return [
+            (p_i.numerator + self.epsilon) / p_i.denominator
+            if p_i.numerator == 0
+            else p_i
+            for p_i in p_n
+        ]
+
+    def method2(self, p_n, *args, **kwargs):
+        """
+        Smoothing method 2: Add 1 to both numerator and denominator from
+        Chin-Yew Lin and Franz Josef Och (2004) ORANGE: a Method for
+        Evaluating Automatic Evaluation Metrics for Machine Translation.
+        In COLING 2004.
+        """
+        return [
+            Fraction(p_n[i].numerator + 1, p_n[i].denominator + 1, _normalize=False)
+            if i != 0
+            else p_n[0]
+            for i in range(len(p_n))
+        ]
+
+    def method3(self, p_n, *args, **kwargs):
+        """
+        Smoothing method 3: NIST geometric sequence smoothing
+        The smoothing is computed by taking 1 / ( 2^k ), instead of 0, for each
+        precision score whose matching n-gram count is null.
+        k is 1 for the first 'n' value for which the n-gram match count is null.
+
+        For example, if the text contains:
+
+        - one 2-gram match
+        - and (consequently) two 1-gram matches
+
+        the n-gram count for each individual precision score would be:
+
+        - n=1  =>  prec_count = 2     (two unigrams)
+        - n=2  =>  prec_count = 1     (one bigram)
+        - n=3  =>  prec_count = 1/2   (no trigram,  taking 'smoothed' value of 1 / ( 2^k ), with k=1)
+        - n=4  =>  prec_count = 1/4   (no fourgram, taking 'smoothed' value of 1 / ( 2^k ), with k=2)
+        """
+        incvnt = 1  # From the mteval-v13a.pl, it's referred to as k.
+        for i, p_i in enumerate(p_n):
+            if p_i.numerator == 0:
+                p_n[i] = 1 / (2**incvnt * p_i.denominator)
+                incvnt += 1
+        return p_n
+
+    def method4(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
+        """
+        Smoothing method 4:
+        Shorter translations may have inflated precision values due to having
+        smaller denominators; therefore, we give them proportionally
+        smaller smoothed counts. Instead of scaling to 1/(2^k), Chen and Cherry
+        suggest dividing by 1/ln(len(T)), where T is the length of the translation.
+        """
+        incvnt = 1
+        hyp_len = hyp_len if hyp_len else len(hypothesis)
+        for i, p_i in enumerate(p_n):
+            if p_i.numerator == 0 and hyp_len > 1:
+                # incvnt = i + 1 * self.k / math.log(
+                #     hyp_len
+                # )  # Note that this K is different from the K from NIST.
+                # p_n[i] = incvnt / p_i.denominator\
+                numerator = 1 / (2**incvnt * self.k / math.log(hyp_len))
+                p_n[i] = numerator / p_i.denominator
+                incvnt += 1
+        return p_n
+
+    def method5(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
+        """
+        Smoothing method 5:
+        The matched counts for similar values of n should be similar. To
+        calculate the n-gram matched count, it averages the n−1, n and n+1 gram
+        matched counts.
+        """
+        hyp_len = hyp_len if hyp_len else len(hypothesis)
+        m = {}
+        # Requires a precision value for an additional ngram order.
+        p_n_plus1 = p_n + [modified_precision(references, hypothesis, 5)]
+        m[-1] = p_n[0] + 1
+        for i, p_i in enumerate(p_n):
+            p_n[i] = (m[i - 1] + p_i + p_n_plus1[i + 1]) / 3
+            m[i] = p_n[i]
+        return p_n
+
+    def method6(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
+        """
+        Smoothing method 6:
+        Interpolates the maximum likelihood estimate of the precision *p_n* with
+        a prior estimate *pi0*. The prior is estimated by assuming that the ratio
+        between pn and pn−1 will be the same as that between pn−1 and pn−2; from
+        Gao and He (2013) Training MRF-Based Phrase Translation Models using
+        Gradient Ascent. In NAACL.
+        """
+        hyp_len = hyp_len if hyp_len else len(hypothesis)
+        # This smoothing only works when p_1 and p_2 are non-zero.
+        # Raise an error with an appropriate message when the input is too short
+        # to use this smoothing technique.
+        assert p_n[2], "This smoothing method requires non-zero precision for bigrams."
+        for i, p_i in enumerate(p_n):
+            if i in [0, 1]:  # Skips the first 2 orders of ngrams.
+                continue
+            else:
+                pi0 = 0 if p_n[i - 2] == 0 else p_n[i - 1] ** 2 / p_n[i - 2]
+                # No. of ngrams in translation that match the reference.
+                m = p_i.numerator
+                # No. of ngrams in translation.
+                l = sum(1 for _ in ngrams(hypothesis, i + 1))
+                # Calculates the interpolated precision.
+                p_n[i] = (m + self.alpha * pi0) / (l + self.alpha)
+        return p_n
+
+    def method7(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
+        """
+        Smoothing method 7:
+        Interpolates methods 4 and 5.
+        """
+        hyp_len = hyp_len if hyp_len else len(hypothesis)
+        p_n = self.method4(p_n, references, hypothesis, hyp_len)
+        p_n = self.method5(p_n, references, hypothesis, hyp_len)
+        return p_n
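
For reference, a minimal usage sketch of the smoothing API documented in the hunk above (not part of the patch itself; it assumes an nltk build with this change applied, and the token lists and approximate scores are taken from the SmoothingFunction doctest):

    # Sentence-level BLEU with and without Chen & Cherry smoothing.
    from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu

    reference = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', 'ensures',
                 'that', 'the', 'military', 'will', 'forever', 'heed',
                 'Party', 'commands']
    hypothesis = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', 'ensures',
                  'that', 'the', 'military', 'always', 'obeys', 'the',
                  'commands', 'of', 'the', 'party']

    chencherry = SmoothingFunction()
    print(sentence_bleu([reference], hypothesis))  # ~0.4118 (no smoothing)
    print(sentence_bleu([reference], hypothesis,
                        smoothing_function=chencherry.method2))  # ~0.4452
    print(sentence_bleu([reference], hypothesis,
                        smoothing_function=chencherry.method5))  # ~0.4905
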
Index: nltk-3.8.1/README.md
===================================================================
--- nltk-3.8.1.orig/README.md
+++ nltk-3.8.1/README.md
@@ -1,50 +1,50 @@
-# Natural Language Toolkit (NLTK)
-[![PyPI](https://img.shields.io/pypi/v/nltk.svg)](https://pypi.python.org/pypi/nltk)
-![CI](https://github.com/nltk/nltk/actions/workflows/ci.yaml/badge.svg?branch=develop)
-
-NLTK -- the Natural Language Toolkit -- is a suite of open source Python
-modules, data sets, and tutorials supporting research and development in Natural
-Language Processing. NLTK requires Python version 3.7, 3.8, 3.9, 3.10 or 3.11.
-
-For documentation, please visit [nltk.org](https://www.nltk.org/).
-
-
-## Contributing
-
-Do you want to contribute to NLTK development? Great!
-Please read [CONTRIBUTING.md](CONTRIBUTING.md) for more details.
-
-See also [how to contribute to NLTK](https://www.nltk.org/contribute.html).
-
-
-## Donate
-
-Have you found the toolkit helpful? Please support NLTK development by donating
-to the project via PayPal, using the link on the NLTK homepage.
-
-
-## Citing
-
-If you publish work that uses NLTK, please cite the NLTK book, as follows:
-
-    Bird, Steven, Edward Loper and Ewan Klein (2009).
-    Natural Language Processing with Python. O'Reilly Media Inc.
-
-
-## Copyright
-
-Copyright (C) 2001-2023 NLTK Project
-
-For license information, see [LICENSE.txt](LICENSE.txt).
-
-[AUTHORS.md](AUTHORS.md) contains a list of everyone who has contributed to NLTK.
-
-
-### Redistributing
-
-- NLTK source code is distributed under the Apache 2.0 License.
-- NLTK documentation is distributed under the Creative Commons
-  Attribution-Noncommercial-No Derivative Works 3.0 United States license.
-- NLTK corpora are provided under the terms given in the README file for each
-  corpus; all are redistributable and available for non-commercial use.
-- NLTK may be freely redistributed, subject to the provisions of these licenses.
+# Natural Language Toolkit (NLTK)
+[![PyPI](https://img.shields.io/pypi/v/nltk.svg)](https://pypi.python.org/pypi/nltk)
+![CI](https://github.com/nltk/nltk/actions/workflows/ci.yaml/badge.svg?branch=develop)
+
+NLTK -- the Natural Language Toolkit -- is a suite of open source Python
+modules, data sets, and tutorials supporting research and development in Natural
+Language Processing. NLTK requires Python version 3.7, 3.8, 3.9, 3.10, 3.11 or 3.12.
+
+For documentation, please visit [nltk.org](https://www.nltk.org/).
+
+
+## Contributing
+
+Do you want to contribute to NLTK development? Great!
+Please read [CONTRIBUTING.md](CONTRIBUTING.md) for more details.
+
+See also [how to contribute to NLTK](https://www.nltk.org/contribute.html).
+
+
+## Donate
+
+Have you found the toolkit helpful? Please support NLTK development by donating
+to the project via PayPal, using the link on the NLTK homepage.
+
+
+## Citing
+
+If you publish work that uses NLTK, please cite the NLTK book, as follows:
+
+    Bird, Steven, Edward Loper and Ewan Klein (2009).
+    Natural Language Processing with Python. O'Reilly Media Inc.
+
+
+## Copyright
+
+Copyright (C) 2001-2023 NLTK Project
+
+For license information, see [LICENSE.txt](LICENSE.txt).
+
+[AUTHORS.md](AUTHORS.md) contains a list of everyone who has contributed to NLTK.
+
+
+### Redistributing
+
+- NLTK source code is distributed under the Apache 2.0 License.
+- NLTK documentation is distributed under the Creative Commons
+  Attribution-Noncommercial-No Derivative Works 3.0 United States license.
+- NLTK corpora are provided under the terms given in the README file for each
+  corpus; all are redistributable and available for non-commercial use.
+- NLTK may be freely redistributed, subject to the provisions of these licenses.
Index: nltk-3.8.1/setup.py
===================================================================
--- nltk-3.8.1.orig/setup.py
+++ nltk-3.8.1/setup.py
@@ -1,125 +1,126 @@
-#!/usr/bin/env python
-#
-# Setup script for the Natural Language Toolkit
-#
-# Copyright (C) 2001-2023 NLTK Project
-# Author: NLTK Team <nltk.team@gmail.com>
-# URL: <https://www.nltk.org/>
-# For license information, see LICENSE.TXT
-
-# Work around mbcs bug in distutils.
-# https://bugs.python.org/issue10945
-import codecs
-
-try:
-    codecs.lookup("mbcs")
-except LookupError:
-    ascii = codecs.lookup("ascii")
-    func = lambda name, enc=ascii: {True: enc}.get(name == "mbcs")
-    codecs.register(func)
-
-import os
-
-# Use the VERSION file to get NLTK version
-version_file = os.path.join(os.path.dirname(__file__), "nltk", "VERSION")
-with open(version_file) as fh:
-    nltk_version = fh.read().strip()
-
-# setuptools
-from setuptools import find_packages, setup
-
-# Specify groups of optional dependencies
-extras_require = {
-    "machine_learning": [
-        "numpy",
-        "python-crfsuite",
-        "scikit-learn",
-        "scipy",
-    ],
-    "plot": ["matplotlib"],
-    "tgrep": ["pyparsing"],
-    "twitter": ["twython"],
-    "corenlp": ["requests"],
-}
-
-# Add a group made up of all optional dependencies
-extras_require["all"] = {
-    package for group in extras_require.values() for package in group
-}
-
-# Adds CLI commands
-console_scripts = """
-[console_scripts]
-nltk=nltk.cli:cli
-"""
-
-_project_homepage = "https://www.nltk.org/"
-
-setup(
-    name="nltk",
-    description="Natural Language Toolkit",
-    version=nltk_version,
-    url=_project_homepage,
-    project_urls={
-        "Documentation": _project_homepage,
-        "Source Code": "https://github.com/nltk/nltk",
-        "Issue Tracker": "https://github.com/nltk/nltk/issues",
-    },
-    long_description="""\
-The Natural Language Toolkit (NLTK) is a Python package for
-natural language processing. NLTK requires Python 3.7, 3.8, 3.9, 3.10 or 3.11.""",
-    license="Apache License, Version 2.0",
-    keywords=[
-        "NLP",
-        "CL",
-        "natural language processing",
-        "computational linguistics",
-        "parsing",
-        "tagging",
-        "tokenizing",
-        "syntax",
-        "linguistics",
-        "language",
-        "natural language",
-        "text analytics",
-    ],
-    maintainer="NLTK Team",
-    maintainer_email="nltk.team@gmail.com",
-    author="NLTK Team",
-    author_email="nltk.team@gmail.com",
-    classifiers=[
-        "Development Status :: 5 - Production/Stable",
-        "Intended Audience :: Developers",
-        "Intended Audience :: Education",
-        "Intended Audience :: Information Technology",
-        "Intended Audience :: Science/Research",
-        "License :: OSI Approved :: Apache Software License",
-        "Operating System :: OS Independent",
-        "Programming Language :: Python :: 3.7",
-        "Programming Language :: Python :: 3.8",
-        "Programming Language :: Python :: 3.9",
-        "Programming Language :: Python :: 3.10",
-        "Programming Language :: Python :: 3.11",
-        "Topic :: Scientific/Engineering",
-        "Topic :: Scientific/Engineering :: Artificial Intelligence",
-        "Topic :: Scientific/Engineering :: Human Machine Interfaces",
-        "Topic :: Scientific/Engineering :: Information Analysis",
-        "Topic :: Text Processing",
-        "Topic :: Text Processing :: Filters",
-        "Topic :: Text Processing :: General",
-        "Topic :: Text Processing :: Indexing",
-        "Topic :: Text Processing :: Linguistic",
-    ],
-    package_data={"nltk": ["test/*.doctest", "VERSION"]},
-    python_requires=">=3.7",
-    install_requires=[
-        "click",
-        "joblib",
-        "regex>=2021.8.3",
-        "tqdm",
-    ],
-    extras_require=extras_require,
-    packages=find_packages(),
-    zip_safe=False,  # since normal files will be present too?
-    entry_points=console_scripts,
-)
+#!/usr/bin/env python
+#
+# Setup script for the Natural Language Toolkit
+#
+# Copyright (C) 2001-2023 NLTK Project
+# Author: NLTK Team <nltk.team@gmail.com>
+# URL: <https://www.nltk.org/>
+# For license information, see LICENSE.TXT
+
+# Work around mbcs bug in distutils.
+# https://bugs.python.org/issue10945
+import codecs
+
+try:
+    codecs.lookup("mbcs")
+except LookupError:
+    ascii = codecs.lookup("ascii")
+    func = lambda name, enc=ascii: {True: enc}.get(name == "mbcs")
+    codecs.register(func)
+
+import os
+
+# Use the VERSION file to get NLTK version
+version_file = os.path.join(os.path.dirname(__file__), "nltk", "VERSION")
+with open(version_file) as fh:
+    nltk_version = fh.read().strip()
+
+# setuptools
+from setuptools import find_packages, setup
+
+# Specify groups of optional dependencies
+extras_require = {
+    "machine_learning": [
+        "numpy",
+        "python-crfsuite",
+        "scikit-learn",
+        "scipy",
+    ],
+    "plot": ["matplotlib"],
+    "tgrep": ["pyparsing"],
+    "twitter": ["twython"],
+    "corenlp": ["requests"],
+}
+
+# Add a group made up of all optional dependencies
+extras_require["all"] = {
+    package for group in extras_require.values() for package in group
+}
+
+# Adds CLI commands
+console_scripts = """
+[console_scripts]
+nltk=nltk.cli:cli
+"""
+
+_project_homepage = "https://www.nltk.org/"
+
+setup(
+    name="nltk",
+    description="Natural Language Toolkit",
+    version=nltk_version,
+    url=_project_homepage,
+    project_urls={
+        "Documentation": _project_homepage,
+        "Source Code": "https://github.com/nltk/nltk",
+        "Issue Tracker": "https://github.com/nltk/nltk/issues",
+    },
+    long_description="""\
+The Natural Language Toolkit (NLTK) is a Python package for
+natural language processing. NLTK requires Python 3.7, 3.8, 3.9, 3.10, 3.11 or 3.12.""",
+    license="Apache License, Version 2.0",
+    keywords=[
+        "NLP",
+        "CL",
+        "natural language processing",
+        "computational linguistics",
+        "parsing",
+        "tagging",
+        "tokenizing",
+        "syntax",
+        "linguistics",
+        "language",
+        "natural language",
+        "text analytics",
+    ],
+    maintainer="NLTK Team",
+    maintainer_email="nltk.team@gmail.com",
+    author="NLTK Team",
+    author_email="nltk.team@gmail.com",
+    classifiers=[
+        "Development Status :: 5 - Production/Stable",
+        "Intended Audience :: Developers",
+        "Intended Audience :: Education",
+        "Intended Audience :: Information Technology",
+        "Intended Audience :: Science/Research",
+        "License :: OSI Approved :: Apache Software License",
+        "Operating System :: OS Independent",
+        "Programming Language :: Python :: 3.7",
+        "Programming Language :: Python :: 3.8",
+        "Programming Language :: Python :: 3.9",
+        "Programming Language :: Python :: 3.10",
+        "Programming Language :: Python :: 3.11",
+        "Programming Language :: Python :: 3.12",
+        "Topic :: Scientific/Engineering",
+        "Topic :: Scientific/Engineering :: Artificial Intelligence",
+        "Topic :: Scientific/Engineering :: Human Machine Interfaces",
+        "Topic :: Scientific/Engineering :: Information Analysis",
+        "Topic :: Text Processing",
+        "Topic :: Text Processing :: Filters",
+        "Topic :: Text Processing :: General",
+        "Topic :: Text Processing :: Indexing",
+        "Topic :: Text Processing :: Linguistic",
+    ],
+    package_data={"nltk": ["test/*.doctest", "VERSION"]},
+    python_requires=">=3.7",
+    install_requires=[
+        "click",
+        "joblib",
+        "regex>=2021.8.3",
+        "tqdm",
+    ],
+    extras_require=extras_require,
+    packages=find_packages(),
+    zip_safe=False,  # since normal files will be present too?
+    entry_points=console_scripts,
+)
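
The setup.py hunk leaves the optional dependency groups unchanged and only extends the supported Python versions; the "all" extra is still built as the union of every optional group. A standalone sketch of that aggregation, shown here only for illustration (names mirror the patched setup.py, not an additional API):

    # How the patched setup.py builds the "all" extra from the optional groups.
    extras_require = {
        "machine_learning": ["numpy", "python-crfsuite", "scikit-learn", "scipy"],
        "plot": ["matplotlib"],
        "tgrep": ["pyparsing"],
        "twitter": ["twython"],
        "corenlp": ["requests"],
    }
    # Deduplicated union of all optional dependency groups.
    extras_require["all"] = {
        package for group in extras_require.values() for package in group
    }
    print(sorted(extras_require["all"]))
    # ['matplotlib', 'numpy', 'pyparsing', 'python-crfsuite',
    #  'requests', 'scikit-learn', 'scipy', 'twython']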