Morgan committed on
Commit
5d5c713
·
1 Parent(s): baf0ff2

initialised evaluator

Browse files
evaluator/__init__.py ADDED
File without changes
evaluator/bleu.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
2
+
3
def calculate_bleu(reference_sentences, translations):
    """
    Compute a sentence-level BLEU score for each (reference, hypothesis) pair.

    Sentences are tokenized by whitespace; smoothing method 1 from NLTK is
    applied so that short sentences with missing higher-order n-gram matches
    do not collapse to a score of zero.

    :param reference_sentences: List of reference translations.
    :param translations: List of translated sentences (hypotheses).
    :return: List of BLEU scores, one per sentence pair.
    """
    smoother = SmoothingFunction().method1
    scores = []
    for reference, hypothesis in zip(reference_sentences, translations):
        score = sentence_bleu(
            [reference.split()],
            hypothesis.split(),
            smoothing_function=smoother,
        )
        scores.append(score)
    return scores
evaluator/chrf.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import Counter
2
+
3
def calculate_chrf(
    hypothesis: str,
    reference: str,
    char_order: int = 6,
    beta: float = 2.0,
) -> float:
    """
    Compute the character n-gram F-score (ChrF) between a hypothesis and a reference string.

    :param hypothesis:
        A string representing the hypothesis text.
    :param reference:
        A string representing the reference text.
    :param char_order:
        The maximum n-gram order to consider. Default is 6.
        This means that unigrams, bigrams, trigrams, fourgrams, fivegrams and sixgrams will be considered.
    :param beta:
        The weight of recall in the F-score. Default is 2.0.
    :return:
        The ChrF score scaled to [0, 100]. Returns 0.0 when averaged
        precision and recall are both zero (e.g. both inputs empty or no
        shared n-grams).
    """
    # Fixed: removed a stray "...  # TODO" placeholder statement that was
    # left in the body after the docstring.
    def get_ngrams(text: str, n: int) -> Counter:
        """Extract character n-grams of length ``n`` from ``text``."""
        return Counter(text[i:i + n] for i in range(len(text) - n + 1))

    precision_sum = 0.0
    recall_sum = 0.0

    # Accumulate precision and recall over every n-gram order up to char_order.
    # Strings shorter than n contribute 0 for that order.
    for n in range(1, char_order + 1):
        hyp_ngrams = get_ngrams(hypothesis, n)
        ref_ngrams = get_ngrams(reference, n)

        # Clipped match count: multiset intersection of n-gram counts.
        intersection = sum((hyp_ngrams & ref_ngrams).values())

        precision = intersection / sum(hyp_ngrams.values()) if hyp_ngrams else 0.0
        recall = intersection / sum(ref_ngrams.values()) if ref_ngrams else 0.0

        precision_sum += precision
        recall_sum += recall

    # Macro-average precision and recall across all n-gram orders.
    precision_avg = precision_sum / char_order
    recall_avg = recall_sum / char_order

    # Harmonic mean with recall weighted beta^2 times as much as precision.
    beta_squared = beta ** 2
    if precision_avg + recall_avg == 0:
        return 0.0
    chrf = (1 + beta_squared) * (precision_avg * recall_avg) / (beta_squared * precision_avg + recall_avg)
    return chrf * 100  # Scale to percentage
evaluator/comet.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from comet import download_model, load_from_checkpoint
2
+
3
def calculate_comet(source_sentences, translations, references):
    """
    Calculate COMET scores for a list of translations.

    :param source_sentences: List of source sentences.
    :param translations: List of translated sentences (hypotheses).
    :param references: List of reference translations.
    :return: List of COMET scores (one score per sentence pair).
    """
    # Fixed: the model was previously downloaded and loaded from checkpoint on
    # EVERY call, which is very expensive; interface.py calls this function
    # twice per request. Cache the loaded model on the function object so it
    # is initialised at most once per process.
    if getattr(calculate_comet, "_model", None) is None:
        model_path = download_model("Unbabel/wmt20-comet-da")
        calculate_comet._model = load_from_checkpoint(model_path)
    model = calculate_comet._model

    # COMET expects a list of {"src", "mt", "ref"} dicts.
    data = [
        {"src": src, "mt": mt, "ref": ref}
        for src, mt, ref in zip(source_sentences, translations, references)
    ]

    # gpus=0 forces CPU inference; batch_size trades speed against memory.
    results = model.predict(data, batch_size=8, gpus=0)
    return results["scores"]  # One score per sentence triple
evaluator/mt_data/beam_search_translations.txt ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ The merchandise cost less than 20 euros.
2
+ The fee corresponds to 40% of the value of the goods....
3
+ I am # PRS _ ORG # major customer so it is no problem for me.
4
+ All I need is a number or an instruction on what I should put in the package so that it can be tracked by you as a return.
5
+ I ordered the # PRS _ ORG # a few days ago... for 249 €.
6
+ Today it costs €179.
7
+ To be honest, I find that very annoying.
8
+ Do I have to cancel the order and re-order for the better price or is there another way of credit?
9
+ In principle, I also think that children should not be expected to take on the role of parents for their young siblings.
10
+ However, I think it is also good for big siblings to take care of the little ones in moderation.
11
+ Take a quick look, help a little, bring the bottle, etc.
12
+ In this way they learn responsibility, help in the family and are involved.
13
+ As long as they can still be children themselves, I don't see a problem with that.
14
+ If your K1 occupies itself for the short time and it is OK for it, that is good.
15
+ You can judge for yourself whether this is right for your children.
16
+ It's not as if he's supposed to be wrapping K2 and going to sleep for hours on end.
17
+ Legally, it is probably questionable.
18
+ Perhaps this is where the psychologist comes in.
19
+ Because if something should actually happen like a fire, burglary, earthquake, alien invasion etc., it would actually be too much of a responsibility for K1 to take care of K2.
20
+ Whether this is a risk you want to take is of course up to you;)
evaluator/mt_data/reference_translations.txt ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ The goods cost less than 20 euros.
2
+ The fee would equal 40% of the value of the goods...
3
+ I am #PRS_ORG# a serious customer and that is why it is not a problem for me.
4
+ I just need a number or a instructions what I should attach to the package so that it can be traced by you all as a return.
5
+ I ordered the #PRS_ORG# a few days ago...for €249.
6
+ Now it costs €179.
7
+ That really bothers me, I must say.
8
+ Do I need to cancel the order and reorder for the better price, or is there another way of getting the credit?
9
+ I also think that as a general rule, children should not be expected to overtake a parental role for their younger siblings.
10
+ However, I think it is also good for older siblings to care for the younger ones to some degree.
11
+ Sometimes check-up on them, sometimes help, sometimes get the bottle, etc.
12
+ They learn responsibility this way, help within the family and are thus connected with it.
13
+ As long as they can continue to be a kid themselves, I don’t see the problem.
14
+ If your first child looks after themselves alone for a short amount of time, that is OK, that is actually good.
15
+ You can best get a sense of whether or not this works for your children.
16
+ It is not as if they should change the second child’s diaper and sit for hours by them as they fall asleep.
17
+ It is probably questionable from a strictly legal perspective.
18
+ Maybe the psychologist would also have something to say about it.
19
+ Since if something actually does happen, like a fire, break-in, earthquake, alien invasion, etc., it would be too much responsibility for child 1 to take care of child 2.
20
+ If that is a risk that you would like to take that is up to you, of course ;)
evaluator/mt_data/source_sentences.txt ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Die Ware hat unter 20 Euro gekostet.
2
+ Die Gebühr entspricht 40% des Warenwertest....
3
+ Ich bin #PRS_ORG# Großkunde und somit ist es kein Problem für mich.
4
+ Brauche lediglich eine Nummer oder Anweisung was ich den Paket beilegen sollte damit es von Ihnen als Retoure getrackt werden kann.
5
+ Ich habe vor ein paar Tagen die #PRS_ORG# bestellt... für 249€.
6
+ Heute kostet sie 179€.
7
+ Das finde ich ehrlich gesagt sehr ärgerlich.
8
+ Muss ich die Bestellung stornieren und neu bestellen für den besseren Preis oder gibt es eine andere Möglichkeit der Gutschrift?
9
+ Ich finde grundsätzlich auch, dass man Kindern nicht zumuten sollte, eine Elternrolle für ihre kleinen Geschwister zu übernehmen.
10
+ Allerdings finde ich, tut es großen Geschwistern auch gut, sich in Maßen mit um die Kleinen zu kümmern.
11
+ Mal kurz gucken, mal kurz helfen, mal die Flasche bringen etc.
12
+ So lernen sie Verantwortung, helfen in der Familie mit und sind mit eingebunden.
13
+ Solange sie noch selbst genug Kind sein können, sehe ich da kein Problem.
14
+ Wenn dein K1 sich für die kurze Zeit allein beschäftigt und es für ihn OK ist, ist das doch gut.
15
+ Du kannst selbst am besten ermessen, ob das für deine Kinder passt.
16
+ Es ist ja nicht so, dass er K2 wickeln und stundenlang beim Einschlafen begleiten soll.
17
+ Rein rechtlich ist es wahrscheinlich fragwürdig.
18
+ Vielleicht kommt daher auch die Aussage der Psychologin.
19
+ Denn falls tatsächlich etwas passieren sollte wie ein Brand, Einbruch, Erdbeben, Alieninvasion etc. wäre es tatsächlich zu viel Verantwortung für K1, sich um K2 zu kümmern.
20
+ Ob das nun ein Risiko ist, das du eingehen willst, ist natürlich dein Ermessen ;)
evaluator/run_experiment.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json
from pathlib import Path

# Fixed: these imported from ``assignment5.*``, but the package in this
# repository is named ``evaluator`` (interface.py imports evaluator.chrf).
from evaluator.chrf import calculate_chrf
from evaluator.mbr import select_best_hypothesis

# Load data. The files contain German text and "€", so read them explicitly
# as UTF-8 (interface.py already does this) instead of the platform default.
data_dir = Path(__file__).parent / "mt_data"
source_sentences = (data_dir / "source_sentences.txt").read_text(encoding="utf-8").splitlines()
reference_translations = (data_dir / "reference_translations.txt").read_text(encoding="utf-8").splitlines()
beam_search_translations = (data_dir / "beam_search_translations.txt").read_text(encoding="utf-8").splitlines()

# Each line of samples.jsonl is a JSON object whose "samples" field holds the
# candidate translations for one source sentence.
with open(data_dir / "samples.jsonl", encoding="utf-8") as f:
    samples = [json.loads(line)["samples"] for line in f]


# Step 1: Select the best hypothesis for each source sentence using MBR decoding
mbr_translations = [select_best_hypothesis(sample_set) for sample_set in samples]

# Step 2: Calculate ChrF scores for MBR translations
mbr_chrf_scores = [
    calculate_chrf(mbr_translation, reference)
    for mbr_translation, reference in zip(mbr_translations, reference_translations)
]
average_mbr_chrf = sum(mbr_chrf_scores) / len(mbr_chrf_scores)

# Step 3: Calculate ChrF scores for beam search translations
beam_chrf_scores = [
    calculate_chrf(beam_translation, reference)
    for beam_translation, reference in zip(beam_search_translations, reference_translations)
]
average_beam_chrf = sum(beam_chrf_scores) / len(beam_chrf_scores)

# Step 4: Print the results
print(f"Average ChrF score for MBR decoding: {average_mbr_chrf:.2f}")
print(f"Average ChrF score for beam search: {average_beam_chrf:.2f}")

if average_mbr_chrf > average_beam_chrf:
    print("MBR decoding produced better translations.")
else:
    print("Beam search produced better translations.")
interface.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import requests
3
+ import json
4
+ import os
5
+ from evaluator.chrf import calculate_chrf
6
+ from evaluator.comet import calculate_comet # Import the COMET function
7
+ from pathlib import Path
8
+
9
+ # OpenAI API URL and key
10
+ OPENAI_API_URL = "https://api.openai.com/v1/chat/completions"
11
+ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
12
+
13
+ CHATGPT_MODELS = {
14
+ "GPT-4": "gpt-4"
15
+ }
16
+
17
def improve_translations(system_prompt, temperature, top_p):
    """
    Post-edit beam-search translations with GPT-4 and evaluate the result.

    For each (source, beam translation) pair, asks the chat model for an
    improved translation, then scores both the beam output (Draft 1) and the
    improved output (Draft 2) against the references with ChrF and COMET.

    :param system_prompt: System message controlling the assistant's behavior.
    :param temperature: Sampling temperature forwarded to the OpenAI API.
    :param top_p: Nucleus-sampling parameter forwarded to the OpenAI API.
    :return: Tuple of (sentence-pair rows, metric-score rows, summary message).
    """
    # Load data
    data_dir = Path(__file__).parent / "evaluator" / "mt_data"
    source_sentences = (data_dir / "source_sentences.txt").read_text(encoding="utf-8").splitlines()
    beam_search_translations = (data_dir / "beam_search_translations.txt").read_text(encoding="utf-8").splitlines()
    reference_translations = (data_dir / "reference_translations.txt").read_text(encoding="utf-8").splitlines()

    improved_translations = []
    sentence_pairs = []  # Rows of [source, draft 1, draft 2, reference]

    for source, target, reference in zip(source_sentences, beam_search_translations, reference_translations):
        # Construct the prompt
        user_prompt = f"""
        As an expert translation post editor, your task is to improve the English translation (Target) for the below German text (Source)
        Source: {source}
        Target: {target}
        Your output should be your improved version of the target text only. Do not add any comments or explanations before or after the improved version of the target text.
        """

        # Prepare API payload
        payload = {
            "model": CHATGPT_MODELS["GPT-4"],
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            "temperature": temperature,
            "top_p": top_p,
            "max_tokens": 512
        }

        headers = {
            "Authorization": f"Bearer {OPENAI_API_KEY}",
            "Content-Type": "application/json"
        }

        # Call OpenAI API. Fixed: added a timeout so a hung connection cannot
        # block the Gradio worker forever.
        response = requests.post(OPENAI_API_URL, headers=headers, json=payload, timeout=60)
        response.raise_for_status()
        data = response.json()

        # Extract improved translation. The split is a defensive no-op when
        # the model obeys the prompt and returns only the translation.
        output = data["choices"][0]["message"]["content"]
        improved_translation = output.split("Improved Translation:")[-1].strip()
        improved_translations.append(improved_translation)

        # Add sentence pair to the list
        sentence_pairs.append([source, target, improved_translation, reference])

    # Calculate ChrF scores
    beam_chrf_scores = [
        calculate_chrf(beam_translation, reference)
        for beam_translation, reference in zip(beam_search_translations, reference_translations)
    ]
    improved_chrf_scores = [
        calculate_chrf(improved_translation, reference)
        for improved_translation, reference in zip(improved_translations, reference_translations)
    ]

    # Calculate COMET scores
    beam_comet_scores = calculate_comet(source_sentences, beam_search_translations, reference_translations)
    improved_comet_scores = calculate_comet(source_sentences, improved_translations, reference_translations)

    # Calculate average scores
    average_beam_chrf = sum(beam_chrf_scores) / len(beam_chrf_scores)
    average_improved_chrf = sum(improved_chrf_scores) / len(improved_chrf_scores)
    average_beam_comet = sum(beam_comet_scores) / len(beam_comet_scores)
    average_improved_comet = sum(improved_comet_scores) / len(improved_comet_scores)

    # Calculate score changes (Draft 2 minus Draft 1)
    chrf_change = average_improved_chrf - average_beam_chrf
    comet_change = average_improved_comet - average_beam_comet

    # Prepare dataframes
    sentence_pairs_df = sentence_pairs  # Dataframe for sentence pairs
    scores_df = [
        ["ChrF", round(average_beam_chrf, 2), round(average_improved_chrf, 2), round(chrf_change, 2)],
        ["COMET", round(average_beam_comet, 2), round(average_improved_comet, 2), round(comet_change, 2)]
    ]

    # Fixed: the summary previously computed average_improved/<delta>, which is
    # not a percentage change and raises ZeroDivisionError when the scores are
    # unchanged. Report the relative change versus the beam-search baseline.
    # NOTE(review): abs() on the COMET baseline because wmt20-comet-da scores
    # can be negative — confirm the intended presentation.
    chrf_pct = (chrf_change / average_beam_chrf * 100) if average_beam_chrf else 0.0
    comet_pct = (comet_change / abs(average_beam_comet) * 100) if average_beam_comet else 0.0
    evaluation_message = f"ChrF Change: {chrf_pct:.2f}%, COMET Change: {comet_pct:.2f}%"
    return sentence_pairs_df, scores_df, evaluation_message
100
+
101
# Gradio interface: prompt/sampling controls in, sentence pairs plus metric
# tables and a summary string out.
_inputs = [
    gr.Textbox(label="System Prompt", placeholder="Define the assistant's behavior here..."),
    gr.Slider(value=1, minimum=0, maximum=1.9, step=0.1, label="Temperature"),
    gr.Slider(value=1, minimum=0, maximum=1, step=0.01, label="Top P"),
]
_outputs = [
    gr.Dataframe(headers=["Source text", "Draft 1", "Draft 2", "Reference"], label="Sentence Pairs"),
    gr.Dataframe(headers=["Metric", "Draft 1", "Draft 2", "Change"], label="Scores"),
    gr.Textbox(label="Evaluation Results"),
]

demo = gr.Interface(
    fn=improve_translations,
    inputs=_inputs,
    outputs=_outputs,
    title="Translation Post-Editing and Evaluation",
    description="Improve translations using GPT-4 and evaluate the results with ChrF and COMET.",
)

demo.launch(share=True)
setup.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from setuptools import setup, find_packages

setup(
    name="translator_evaluator",
    version="0.1",
    author="Morgan Kavanagh",
    description="A package for machine translation evaluation using BertScore, COMET, BLEU, and ChatGPT integration.",
    packages=find_packages(),
    python_requires=">=3.6",
    install_requires=[
        "torch",
        "transformers",
        "numpy",
        "pytest",
        "gradio",
        "requests",
        "unbabel-comet",
        # nltk is required by evaluator/bleu.py (sentence_bleu, SmoothingFunction).
        "nltk",
        # Fixed: removed "pathlib" — it has been in the standard library since
        # Python 3.4, and installing the abandoned PyPI backport can shadow
        # the stdlib module and break modern interpreters.
    ],
    tests_require=[
        "pytest",
    ]
)