Morgan committed on
Commit
5d5c713
·
1 Parent(s): baf0ff2

initialised evaluator

Browse files
evaluator/__init__.py ADDED
File without changes
evaluator/bleu.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
2
+
3
def calculate_bleu(reference_sentences, translations):
    """
    Compute a sentence-level BLEU score for each (reference, hypothesis) pair.

    Sentences are tokenized by whitespace; smoothing method 1 from NLTK is
    applied so that short sentences with missing higher-order n-gram matches
    do not collapse to a score of zero.

    :param reference_sentences: List of reference translations.
    :param translations: List of translated sentences (hypotheses).
    :return: List of BLEU scores, one per sentence pair.
    """
    smoother = SmoothingFunction().method1
    scores = []
    for reference, hypothesis in zip(reference_sentences, translations):
        score = sentence_bleu(
            [reference.split()],
            hypothesis.split(),
            smoothing_function=smoother,
        )
        scores.append(score)
    return scores
evaluator/chrf.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import Counter
2
+
3
def calculate_chrf(
    hypothesis: str,
    reference: str,
    char_order: int = 6,
    beta: float = 2.0,
) -> float:
    """
    Compute the character n-gram F-score (ChrF) between a hypothesis and a reference string.

    :param hypothesis:
        A string representing the hypothesis text.
    :param reference:
        A string representing the reference text.
    :param char_order:
        The maximum n-gram order to consider. Default is 6.
        This means that unigrams, bigrams, trigrams, fourgrams, fivegrams and sixgrams will be considered.
    :param beta:
        The weight of recall in the F-score. Default is 2.0.
    :return:
        The ChrF score scaled to [0, 100]. Returns 0.0 when averaged
        precision and recall are both zero (e.g. both inputs empty or no
        shared n-grams).
    """
    # Fixed: removed a stray "...  # TODO" placeholder statement that was
    # left in the body after the docstring.
    def get_ngrams(text: str, n: int) -> Counter:
        """Extract character n-grams of length ``n`` from ``text``."""
        return Counter(text[i:i + n] for i in range(len(text) - n + 1))

    precision_sum = 0.0
    recall_sum = 0.0

    # Accumulate precision and recall over every n-gram order up to char_order.
    # Strings shorter than n contribute 0 for that order.
    for n in range(1, char_order + 1):
        hyp_ngrams = get_ngrams(hypothesis, n)
        ref_ngrams = get_ngrams(reference, n)

        # Clipped match count: multiset intersection of n-gram counts.
        intersection = sum((hyp_ngrams & ref_ngrams).values())

        precision = intersection / sum(hyp_ngrams.values()) if hyp_ngrams else 0.0
        recall = intersection / sum(ref_ngrams.values()) if ref_ngrams else 0.0

        precision_sum += precision
        recall_sum += recall

    # Macro-average precision and recall across all n-gram orders.
    precision_avg = precision_sum / char_order
    recall_avg = recall_sum / char_order

    # Harmonic mean with recall weighted beta^2 times as much as precision.
    beta_squared = beta ** 2
    if precision_avg + recall_avg == 0:
        return 0.0
    chrf = (1 + beta_squared) * (precision_avg * recall_avg) / (beta_squared * precision_avg + recall_avg)
    return chrf * 100  # Scale to percentage
evaluator/comet.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from comet import download_model, load_from_checkpoint
2
+
3
def calculate_comet(source_sentences, translations, references):
    """
    Calculate COMET scores for a list of translations.

    :param source_sentences: List of source sentences.
    :param translations: List of translated sentences (hypotheses).
    :param references: List of reference translations.
    :return: List of COMET scores (one score per sentence pair).
    """
    # Fixed: the model was previously downloaded and loaded from checkpoint on
    # EVERY call, which is very expensive; interface.py calls this function
    # twice per request. Cache the loaded model on the function object so it
    # is initialised at most once per process.
    if getattr(calculate_comet, "_model", None) is None:
        model_path = download_model("Unbabel/wmt20-comet-da")
        calculate_comet._model = load_from_checkpoint(model_path)
    model = calculate_comet._model

    # COMET expects a list of {"src", "mt", "ref"} dicts.
    data = [
        {"src": src, "mt": mt, "ref": ref}
        for src, mt, ref in zip(source_sentences, translations, references)
    ]

    # gpus=0 forces CPU inference; batch_size trades speed against memory.
    results = model.predict(data, batch_size=8, gpus=0)
    return results["scores"]  # One score per sentence triple
evaluator/mt_data/beam_search_translations.txt ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ The merchandise cost less than 20 euros.
2
+ The fee corresponds to 40% of the value of the goods....
3
+ I am # PRS _ ORG # major customer so it is no problem for me.
4
+ All I need is a number or an instruction on what I should put in the package so that it can be tracked by you as a return.
5
+ I ordered the # PRS _ ORG # a few days ago... for 249 €.
6
+ Today it costs €179.
7
+ To be honest, I find that very annoying.
8
+ Do I have to cancel the order and re-order for the better price or is there another way of credit?
9
+ In principle, I also think that children should not be expected to take on the role of parents for their young siblings.
10
+ However, I think it is also good for big siblings to take care of the little ones in moderation.
11
+ Take a quick look, help a little, bring the bottle, etc.
12
+ In this way they learn responsibility, help in the family and are involved.
13
+ As long as they can still be children themselves, I don't see a problem with that.
14
+ If your K1 occupies itself for the short time and it is OK for it, that is good.
15
+ You can judge for yourself whether this is right for your children.
16
+ It's not as if he's supposed to be wrapping K2 and going to sleep for hours on end.
17
+ Legally, it is probably questionable.
18
+ Perhaps this is where the psychologist comes in.
19
+ Because if something should actually happen like a fire, burglary, earthquake, alien invasion etc., it would actually be too much of a responsibility for K1 to take care of K2.
20
+ Whether this is a risk you want to take is of course up to you;)
evaluator/mt_data/reference_translations.txt ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ The goods cost less than 20 euros.
2
+ The fee would equal 40% of the value of the goods...
3
+ I am #PRS_ORG# a serious customer and that is why it is not a problem for me.
4
+ I just need a number or a instructions what I should attach to the package so that it can be traced by you all as a return.
5
+ I ordered the #PRS_ORG# a few days ago...for €249.
6
+ Now it costs €179.
7
+ That really bothers me, I must say.
8
+ Do I need to cancel the order and reorder for the better price, or is there another way of getting the credit?
9
+ I also think that as a general rule, children should not be expected to overtake a parental role for their younger siblings.
10
+ However, I think it is also good for older siblings to care for the younger ones to some degree.
11
+ Sometimes check-up on them, sometimes help, sometimes get the bottle, etc.
12
+ They learn responsibility this way, help within the family and are thus connected with it.
13
+ As long as they can continue to be a kid themselves, I don’t see the problem.
14
+ If your first child looks after themselves alone for a short amount of time, that is OK, that is actually good.
15
+ You can best get a sense of whether or not this works for your children.
16
+ It is not as if they should change the second child’s diaper and sit for hours by them as they fall asleep.
17
+ It is probably questionable from a strictly legal perspective.
18
+ Maybe the psychologist would also have something to say about it.
19
+ Since if something actually does happen, like a fire, break-in, earthquake, alien invasion, etc., it would be too much responsibility for child 1 to take care of child 2.
20
+ If that is a risk that you would like to take that is up to you, of course ;)
evaluator/mt_data/source_sentences.txt ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Die Ware hat unter 20 Euro gekostet.
2
+ Die Gebühr entspricht 40% des Warenwertest....
3
+ Ich bin #PRS_ORG# Großkunde und somit ist es kein Problem für mich.
4
+ Brauche lediglich eine Nummer oder Anweisung was ich den Paket beilegen sollte damit es von Ihnen als Retoure getrackt werden kann.
5
+ Ich habe vor ein paar Tagen die #PRS_ORG# bestellt... für 249€.
6
+ Heute kostet sie 179€.
7
+ Das finde ich ehrlich gesagt sehr ärgerlich.
8
+ Muss ich die Bestellung stornieren und neu bestellen für den besseren Preis oder gibt es eine andere Möglichkeit der Gutschrift?
9
+ Ich finde grundsätzlich auch, dass man Kindern nicht zumuten sollte, eine Elternrolle für ihre kleinen Geschwister zu übernehmen.
10
+ Allerdings finde ich, tut es großen Geschwistern auch gut, sich in Maßen mit um die Kleinen zu kümmern.
11
+ Mal kurz gucken, mal kurz helfen, mal die Flasche bringen etc.
12
+ So lernen sie Verantwortung, helfen in der Familie mit und sind mit eingebunden.
13
+ Solange sie noch selbst genug Kind sein können, sehe ich da kein Problem.
14
+ Wenn dein K1 sich für die kurze Zeit allein beschäftigt und es für ihn OK ist, ist das doch gut.
15
+ Du kannst selbst am besten ermessen, ob das für deine Kinder passt.
16
+ Es ist ja nicht so, dass er K2 wickeln und stundenlang beim Einschlafen begleiten soll.
17
+ Rein rechtlich ist es wahrscheinlich fragwürdig.
18
+ Vielleicht kommt daher auch die Aussage der Psychologin.
19
+ Denn falls tatsächlich etwas passieren sollte wie ein Brand, Einbruch, Erdbeben, Alieninvasion etc. wäre es tatsächlich zu viel Verantwortung für K1, sich um K2 zu kümmern.
20
+ Ob das nun ein Risiko ist, das du eingehen willst, ist natürlich dein Ermessen ;)
evaluator/run_experiment.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json
from pathlib import Path

# Fixed: these imported from ``assignment5.*``, but the package in this
# repository is named ``evaluator`` (interface.py imports evaluator.chrf).
from evaluator.chrf import calculate_chrf
from evaluator.mbr import select_best_hypothesis

# Load data. The files contain German text and "€", so read them explicitly
# as UTF-8 (interface.py already does this) instead of the platform default.
data_dir = Path(__file__).parent / "mt_data"
source_sentences = (data_dir / "source_sentences.txt").read_text(encoding="utf-8").splitlines()
reference_translations = (data_dir / "reference_translations.txt").read_text(encoding="utf-8").splitlines()
beam_search_translations = (data_dir / "beam_search_translations.txt").read_text(encoding="utf-8").splitlines()

# Each line of samples.jsonl is a JSON object whose "samples" field holds the
# candidate translations for one source sentence.
with open(data_dir / "samples.jsonl", encoding="utf-8") as f:
    samples = [json.loads(line)["samples"] for line in f]


# Step 1: Select the best hypothesis for each source sentence using MBR decoding
mbr_translations = [select_best_hypothesis(sample_set) for sample_set in samples]

# Step 2: Calculate ChrF scores for MBR translations
mbr_chrf_scores = [
    calculate_chrf(mbr_translation, reference)
    for mbr_translation, reference in zip(mbr_translations, reference_translations)
]
average_mbr_chrf = sum(mbr_chrf_scores) / len(mbr_chrf_scores)

# Step 3: Calculate ChrF scores for beam search translations
beam_chrf_scores = [
    calculate_chrf(beam_translation, reference)
    for beam_translation, reference in zip(beam_search_translations, reference_translations)
]
average_beam_chrf = sum(beam_chrf_scores) / len(beam_chrf_scores)

# Step 4: Print the results
print(f"Average ChrF score for MBR decoding: {average_mbr_chrf:.2f}")
print(f"Average ChrF score for beam search: {average_beam_chrf:.2f}")

if average_mbr_chrf > average_beam_chrf:
    print("MBR decoding produced better translations.")
else:
    print("Beam search produced better translations.")
interface.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import requests
3
+ import json
4
+ import os
5
+ from evaluator.chrf import calculate_chrf
6
+ from evaluator.comet import calculate_comet # Import the COMET function
7
+ from pathlib import Path
8
+
9
+ # OpenAI API URL and key
10
+ OPENAI_API_URL = "https://api.openai.com/v1/chat/completions"
11
+ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
12
+
13
+ CHATGPT_MODELS = {
14
+ "GPT-4": "gpt-4"
15
+ }
16
+
17
def improve_translations(system_prompt, temperature, top_p):
    """
    Post-edit beam-search translations with GPT-4 and evaluate the result.

    For each (source, beam translation) pair, asks the chat model for an
    improved translation, then scores both the beam output (Draft 1) and the
    improved output (Draft 2) against the references with ChrF and COMET.

    :param system_prompt: System message controlling the assistant's behavior.
    :param temperature: Sampling temperature forwarded to the OpenAI API.
    :param top_p: Nucleus-sampling parameter forwarded to the OpenAI API.
    :return: Tuple of (sentence-pair rows, metric-score rows, summary message).
    """
    # Load data
    data_dir = Path(__file__).parent / "evaluator" / "mt_data"
    source_sentences = (data_dir / "source_sentences.txt").read_text(encoding="utf-8").splitlines()
    beam_search_translations = (data_dir / "beam_search_translations.txt").read_text(encoding="utf-8").splitlines()
    reference_translations = (data_dir / "reference_translations.txt").read_text(encoding="utf-8").splitlines()

    improved_translations = []
    sentence_pairs = []  # Rows of [source, draft 1, draft 2, reference]

    for source, target, reference in zip(source_sentences, beam_search_translations, reference_translations):
        # Construct the prompt
        user_prompt = f"""
        As an expert translation post editor, your task is to improve the English translation (Target) for the below German text (Source)
        Source: {source}
        Target: {target}
        Your output should be your improved version of the target text only. Do not add any comments or explanations before or after the improved version of the target text.
        """

        # Prepare API payload
        payload = {
            "model": CHATGPT_MODELS["GPT-4"],
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            "temperature": temperature,
            "top_p": top_p,
            "max_tokens": 512
        }

        headers = {
            "Authorization": f"Bearer {OPENAI_API_KEY}",
            "Content-Type": "application/json"
        }

        # Call OpenAI API. Fixed: added a timeout so a hung connection cannot
        # block the Gradio worker forever.
        response = requests.post(OPENAI_API_URL, headers=headers, json=payload, timeout=60)
        response.raise_for_status()
        data = response.json()

        # Extract improved translation. The split is a defensive no-op when
        # the model obeys the prompt and returns only the translation.
        output = data["choices"][0]["message"]["content"]
        improved_translation = output.split("Improved Translation:")[-1].strip()
        improved_translations.append(improved_translation)

        # Add sentence pair to the list
        sentence_pairs.append([source, target, improved_translation, reference])

    # Calculate ChrF scores
    beam_chrf_scores = [
        calculate_chrf(beam_translation, reference)
        for beam_translation, reference in zip(beam_search_translations, reference_translations)
    ]
    improved_chrf_scores = [
        calculate_chrf(improved_translation, reference)
        for improved_translation, reference in zip(improved_translations, reference_translations)
    ]

    # Calculate COMET scores
    beam_comet_scores = calculate_comet(source_sentences, beam_search_translations, reference_translations)
    improved_comet_scores = calculate_comet(source_sentences, improved_translations, reference_translations)

    # Calculate average scores
    average_beam_chrf = sum(beam_chrf_scores) / len(beam_chrf_scores)
    average_improved_chrf = sum(improved_chrf_scores) / len(improved_chrf_scores)
    average_beam_comet = sum(beam_comet_scores) / len(beam_comet_scores)
    average_improved_comet = sum(improved_comet_scores) / len(improved_comet_scores)

    # Calculate score changes (Draft 2 minus Draft 1)
    chrf_change = average_improved_chrf - average_beam_chrf
    comet_change = average_improved_comet - average_beam_comet

    # Prepare dataframes
    sentence_pairs_df = sentence_pairs  # Dataframe for sentence pairs
    scores_df = [
        ["ChrF", round(average_beam_chrf, 2), round(average_improved_chrf, 2), round(chrf_change, 2)],
        ["COMET", round(average_beam_comet, 2), round(average_improved_comet, 2), round(comet_change, 2)]
    ]

    # Fixed: the summary previously computed average_improved/<delta>, which is
    # not a percentage change and raises ZeroDivisionError when the scores are
    # unchanged. Report the relative change versus the beam-search baseline.
    # NOTE(review): abs() on the COMET baseline because wmt20-comet-da scores
    # can be negative — confirm the intended presentation.
    chrf_pct = (chrf_change / average_beam_chrf * 100) if average_beam_chrf else 0.0
    comet_pct = (comet_change / abs(average_beam_comet) * 100) if average_beam_comet else 0.0
    evaluation_message = f"ChrF Change: {chrf_pct:.2f}%, COMET Change: {comet_pct:.2f}%"
    return sentence_pairs_df, scores_df, evaluation_message
100
+
101
# Gradio interface: prompt/sampling controls in, sentence pairs plus metric
# tables and a summary string out.
_inputs = [
    gr.Textbox(label="System Prompt", placeholder="Define the assistant's behavior here..."),
    gr.Slider(value=1, minimum=0, maximum=1.9, step=0.1, label="Temperature"),
    gr.Slider(value=1, minimum=0, maximum=1, step=0.01, label="Top P"),
]
_outputs = [
    gr.Dataframe(headers=["Source text", "Draft 1", "Draft 2", "Reference"], label="Sentence Pairs"),
    gr.Dataframe(headers=["Metric", "Draft 1", "Draft 2", "Change"], label="Scores"),
    gr.Textbox(label="Evaluation Results"),
]

demo = gr.Interface(
    fn=improve_translations,
    inputs=_inputs,
    outputs=_outputs,
    title="Translation Post-Editing and Evaluation",
    description="Improve translations using GPT-4 and evaluate the results with ChrF and COMET.",
)

demo.launch(share=True)
setup.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from setuptools import setup, find_packages

setup(
    name="translator_evaluator",
    version="0.1",
    author="Morgan Kavanagh",
    description="A package for machine translation evaluation using BertScore, COMET, BLEU, and ChatGPT integration.",
    packages=find_packages(),
    python_requires=">=3.6",
    install_requires=[
        "torch",
        "transformers",
        "numpy",
        "pytest",
        "gradio",
        "requests",
        "unbabel-comet",
        # nltk is required by evaluator/bleu.py (sentence_bleu, SmoothingFunction).
        "nltk",
        # Fixed: removed "pathlib" — it has been in the standard library since
        # Python 3.4, and installing the abandoned PyPI backport can shadow
        # the stdlib module and break modern interpreters.
    ],
    tests_require=[
        "pytest",
    ]
)