Morgan
committed on
Commit
·
5d5c713
1
Parent(s):
baf0ff2
initialised evaluator
Browse files- evaluator/__init__.py +0 -0
- evaluator/bleu.py +15 -0
- evaluator/chrf.py +55 -0
- evaluator/comet.py +24 -0
- evaluator/mt_data/beam_search_translations.txt +20 -0
- evaluator/mt_data/reference_translations.txt +20 -0
- evaluator/mt_data/source_sentences.txt +20 -0
- evaluator/run_experiment.py +42 -0
- interface.py +118 -0
- setup.py +23 -0
evaluator/__init__.py
ADDED
|
File without changes
|
evaluator/bleu.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
|
| 2 |
+
|
| 3 |
+
def calculate_bleu(reference_sentences, translations):
    """
    Compute a sentence-level BLEU score for every (reference, hypothesis) pair.

    :param reference_sentences: List of reference translations.
    :param translations: List of translated sentences (hypotheses).
    :return: List of BLEU scores, one per sentence pair.
    """
    # method1 smoothing keeps scores non-zero when a higher-order n-gram is absent.
    smooth = SmoothingFunction().method1
    scores = []
    for reference, hypothesis in zip(reference_sentences, translations):
        scores.append(
            sentence_bleu(
                [reference.split()],
                hypothesis.split(),
                smoothing_function=smooth,
            )
        )
    return scores
evaluator/chrf.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from collections import Counter
|
| 2 |
+
|
| 3 |
+
def calculate_chrf(
    hypothesis: str,
    reference: str,
    char_order: int = 6,
    beta: float = 2.0,
) -> float:
    """
    Compute the character n-gram F-score (ChrF) between a hypothesis and a reference string.

    :param hypothesis:
        A string representing the hypothesis text.
    :param reference:
        A string representing the reference text.
    :param char_order:
        The maximum n-gram order to consider. Default is 6.
        This means that unigrams, bigrams, trigrams, fourgrams, fivegrams and sixgrams will be considered.
    :param beta:
        The weight of recall in the F-score. Default is 2.0.
    :return:
        The ChrF score scaled to a 0-100 percentage; 0.0 when nothing matches.
    """
    # (A leftover "...  # TODO" placeholder statement was removed here — the
    # implementation below is complete.)

    def get_ngrams(text: str, n: int) -> Counter:
        """Extract character n-grams of length n from a string."""
        return Counter(text[i:i + n] for i in range(len(text) - n + 1))

    precision_sum = 0.0
    recall_sum = 0.0

    # Accumulate precision and recall over every n-gram order up to char_order.
    for n in range(1, char_order + 1):
        hyp_ngrams = get_ngrams(hypothesis, n)
        ref_ngrams = get_ngrams(reference, n)

        # Counter intersection gives the clipped match count per n-gram.
        intersection = sum((hyp_ngrams & ref_ngrams).values())

        # Guard against empty counters (text shorter than n).
        precision = intersection / sum(hyp_ngrams.values()) if hyp_ngrams else 0.0
        recall = intersection / sum(ref_ngrams.values()) if ref_ngrams else 0.0

        precision_sum += precision
        recall_sum += recall

    # Macro-average precision and recall across all n-gram orders.
    precision_avg = precision_sum / char_order
    recall_avg = recall_sum / char_order

    # Harmonic mean weighted by beta (beta > 1 favours recall).
    beta_squared = beta ** 2
    if precision_avg + recall_avg == 0:
        return 0.0
    chrf = (1 + beta_squared) * (precision_avg * recall_avg) / (beta_squared * precision_avg + recall_avg)
    return chrf * 100  # Scale to percentage
evaluator/comet.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from comet import download_model, load_from_checkpoint
|
| 2 |
+
|
| 3 |
+
def calculate_comet(source_sentences, translations, references):
    """
    Calculate COMET scores for a list of translations.

    :param source_sentences: List of source sentences.
    :param translations: List of translated sentences (hypotheses).
    :param references: List of reference translations.
    :return: List of COMET scores (one score per sentence pair).
    """
    # Load the COMET model once per process and cache it on the function
    # object; the original re-downloaded and re-loaded the checkpoint on
    # every call, which is very expensive.
    if getattr(calculate_comet, "_model", None) is None:
        model_path = download_model("Unbabel/wmt20-comet-da")
        calculate_comet._model = load_from_checkpoint(model_path)
    model = calculate_comet._model

    # COMET expects a list of {"src", "mt", "ref"} dicts.
    data = [
        {"src": src, "mt": mt, "ref": ref}
        for src, mt, ref in zip(source_sentences, translations, references)
    ]

    # gpus=0 requests no GPU, i.e. CPU inference; batch_size=8 keeps memory modest.
    results = model.predict(data, batch_size=8, gpus=0)
    return results["scores"]  # Per-sentence scores from the prediction output
evaluator/mt_data/beam_search_translations.txt
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
The merchandise cost less than 20 euros.
|
| 2 |
+
The fee corresponds to 40% of the value of the goods....
|
| 3 |
+
I am # PRS _ ORG # major customer so it is no problem for me.
|
| 4 |
+
All I need is a number or an instruction on what I should put in the package so that it can be tracked by you as a return.
|
| 5 |
+
I ordered the # PRS _ ORG # a few days ago... for 249 €.
|
| 6 |
+
Today it costs €179.
|
| 7 |
+
To be honest, I find that very annoying.
|
| 8 |
+
Do I have to cancel the order and re-order for the better price or is there another way of credit?
|
| 9 |
+
In principle, I also think that children should not be expected to take on the role of parents for their young siblings.
|
| 10 |
+
However, I think it is also good for big siblings to take care of the little ones in moderation.
|
| 11 |
+
Take a quick look, help a little, bring the bottle, etc.
|
| 12 |
+
In this way they learn responsibility, help in the family and are involved.
|
| 13 |
+
As long as they can still be children themselves, I don't see a problem with that.
|
| 14 |
+
If your K1 occupies itself for the short time and it is OK for it, that is good.
|
| 15 |
+
You can judge for yourself whether this is right for your children.
|
| 16 |
+
It's not as if he's supposed to be wrapping K2 and going to sleep for hours on end.
|
| 17 |
+
Legally, it is probably questionable.
|
| 18 |
+
Perhaps this is where the psychologist comes in.
|
| 19 |
+
Because if something should actually happen like a fire, burglary, earthquake, alien invasion etc., it would actually be too much of a responsibility for K1 to take care of K2.
|
| 20 |
+
Whether this is a risk you want to take is of course up to you;)
|
evaluator/mt_data/reference_translations.txt
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
The goods cost less than 20 euros.
|
| 2 |
+
The fee would equal 40% of the value of the goods...
|
| 3 |
+
I am #PRS_ORG# a serious customer and that is why it is not a problem for me.
|
| 4 |
+
I just need a number or a instructions what I should attach to the package so that it can be traced by you all as a return.
|
| 5 |
+
I ordered the #PRS_ORG# a few days ago...for €249.
|
| 6 |
+
Now it costs €179.
|
| 7 |
+
That really bothers me, I must say.
|
| 8 |
+
Do I need to cancel the order and reorder for the better price, or is there another way of getting the credit?
|
| 9 |
+
I also think that as a general rule, children should not be expected to overtake a parental role for their younger siblings.
|
| 10 |
+
However, I think it is also good for older siblings to care for the younger ones to some degree.
|
| 11 |
+
Sometimes check-up on them, sometimes help, sometimes get the bottle, etc.
|
| 12 |
+
They learn responsibility this way, help within the family and are thus connected with it.
|
| 13 |
+
As long as they can continue to be a kid themselves, I don’t see the problem.
|
| 14 |
+
If your first child looks after themselves alone for a short amount of time, that is OK, that is actually good.
|
| 15 |
+
You can best get a sense of whether or not this works for your children.
|
| 16 |
+
It is not as if they should change the second child’s diaper and sit for hours by them as they fall asleep.
|
| 17 |
+
It is probably questionable from a strictly legal perspective.
|
| 18 |
+
Maybe the psychologist would also have something to say about it.
|
| 19 |
+
Since if something actually does happen, like a fire, break-in, earthquake, alien invasion, etc., it would be too much responsibility for child 1 to take care of child 2.
|
| 20 |
+
If that is a risk that you would like to take that is up to you, of course ;)
|
evaluator/mt_data/source_sentences.txt
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Die Ware hat unter 20 Euro gekostet.
|
| 2 |
+
Die Gebühr entspricht 40% des Warenwertest....
|
| 3 |
+
Ich bin #PRS_ORG# Großkunde und somit ist es kein Problem für mich.
|
| 4 |
+
Brauche lediglich eine Nummer oder Anweisung was ich den Paket beilegen sollte damit es von Ihnen als Retoure getrackt werden kann.
|
| 5 |
+
Ich habe vor ein paar Tagen die #PRS_ORG# bestellt... für 249€.
|
| 6 |
+
Heute kostet sie 179€.
|
| 7 |
+
Das finde ich ehrlich gesagt sehr ärgerlich.
|
| 8 |
+
Muss ich die Bestellung stornieren und neu bestellen für den besseren Preis oder gibt es eine andere Möglichkeit der Gutschrift?
|
| 9 |
+
Ich finde grundsätzlich auch, dass man Kindern nicht zumuten sollte, eine Elternrolle für ihre kleinen Geschwister zu übernehmen.
|
| 10 |
+
Allerdings finde ich, tut es großen Geschwistern auch gut, sich in Maßen mit um die Kleinen zu kümmern.
|
| 11 |
+
Mal kurz gucken, mal kurz helfen, mal die Flasche bringen etc.
|
| 12 |
+
So lernen sie Verantwortung, helfen in der Familie mit und sind mit eingebunden.
|
| 13 |
+
Solange sie noch selbst genug Kind sein können, sehe ich da kein Problem.
|
| 14 |
+
Wenn dein K1 sich für die kurze Zeit allein beschäftigt und es für ihn OK ist, ist das doch gut.
|
| 15 |
+
Du kannst selbst am besten ermessen, ob das für deine Kinder passt.
|
| 16 |
+
Es ist ja nicht so, dass er K2 wickeln und stundenlang beim Einschlafen begleiten soll.
|
| 17 |
+
Rein rechtlich ist es wahrscheinlich fragwürdig.
|
| 18 |
+
Vielleicht kommt daher auch die Aussage der Psychologin.
|
| 19 |
+
Denn falls tatsächlich etwas passieren sollte wie ein Brand, Einbruch, Erdbeben, Alieninvasion etc. wäre es tatsächlich zu viel Verantwortung für K1, sich um K2 zu kümmern.
|
| 20 |
+
Ob das nun ein Risiko ist, das du eingehen willst, ist natürlich dein Ermessen ;)
|
evaluator/run_experiment.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
from pathlib import Path

# BUG FIX: these helpers live in the local ``evaluator`` package added in this
# commit (evaluator/chrf.py, evaluator/mbr.py) — the old ``assignment5``
# package name no longer exists, so the previous imports always failed.
from evaluator.chrf import calculate_chrf
from evaluator.mbr import select_best_hypothesis


def main():
    """Compare MBR decoding against the beam-search baseline using average ChrF."""
    # Load data (explicit UTF-8, matching how interface.py reads these files).
    data_dir = Path(__file__).parent / "mt_data"
    source_sentences = (data_dir / "source_sentences.txt").read_text(encoding="utf-8").splitlines()
    reference_translations = (data_dir / "reference_translations.txt").read_text(encoding="utf-8").splitlines()
    beam_search_translations = (data_dir / "beam_search_translations.txt").read_text(encoding="utf-8").splitlines()

    # One JSON object per line, each holding a "samples" list of hypotheses.
    with open(data_dir / "samples.jsonl", encoding="utf-8") as f:
        samples = [json.loads(line)["samples"] for line in f]

    # Step 1: Select the best hypothesis for each source sentence using MBR decoding
    mbr_translations = [select_best_hypothesis(sample_set) for sample_set in samples]

    # Step 2: Calculate ChrF scores for MBR translations
    mbr_chrf_scores = [
        calculate_chrf(mbr_translation, reference)
        for mbr_translation, reference in zip(mbr_translations, reference_translations)
    ]
    average_mbr_chrf = sum(mbr_chrf_scores) / len(mbr_chrf_scores)

    # Step 3: Calculate ChrF scores for beam search translations
    beam_chrf_scores = [
        calculate_chrf(beam_translation, reference)
        for beam_translation, reference in zip(beam_search_translations, reference_translations)
    ]
    average_beam_chrf = sum(beam_chrf_scores) / len(beam_chrf_scores)

    # Step 4: Print the results
    print(f"Average ChrF score for MBR decoding: {average_mbr_chrf:.2f}")
    print(f"Average ChrF score for beam search: {average_beam_chrf:.2f}")

    if average_mbr_chrf > average_beam_chrf:
        print("MBR decoding produced better translations.")
    else:
        print("Beam search produced better translations.")


# Guard so importing this module no longer triggers file I/O and printing.
if __name__ == "__main__":
    main()
interface.py
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import requests
|
| 3 |
+
import json
|
| 4 |
+
import os
|
| 5 |
+
from evaluator.chrf import calculate_chrf
|
| 6 |
+
from evaluator.comet import calculate_comet # Import the COMET function
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
|
| 9 |
+
# OpenAI API URL and key
|
| 10 |
+
OPENAI_API_URL = "https://api.openai.com/v1/chat/completions"
|
| 11 |
+
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
| 12 |
+
|
| 13 |
+
CHATGPT_MODELS = {
|
| 14 |
+
"GPT-4": "gpt-4"
|
| 15 |
+
}
|
| 16 |
+
|
| 17 |
+
def improve_translations(system_prompt, temperature, top_p):
    """Post-edit beam-search translations with GPT-4 and evaluate the results.

    For every source sentence, ask the OpenAI chat API to improve the existing
    beam-search draft, then score both drafts with ChrF and COMET.

    :param system_prompt: System message defining the assistant's behaviour.
    :param temperature: Sampling temperature forwarded to the API.
    :param top_p: Nucleus-sampling value forwarded to the API.
    :return: Tuple of (sentence-pair rows, metric-score rows, summary message).
    """
    # Load data
    data_dir = Path(__file__).parent / "evaluator" / "mt_data"
    source_sentences = (data_dir / "source_sentences.txt").read_text(encoding="utf-8").splitlines()
    beam_search_translations = (data_dir / "beam_search_translations.txt").read_text(encoding="utf-8").splitlines()
    reference_translations = (data_dir / "reference_translations.txt").read_text(encoding="utf-8").splitlines()

    improved_translations = []
    sentence_pairs = []  # To store source, draft 1, draft 2, and reference

    for source, target, reference in zip(source_sentences, beam_search_translations, reference_translations):
        # Construct the prompt
        user_prompt = f"""
        As an expert translation post editor, your task is to improve the English translation (Target) for the below German text (Source)
        Source: {source}
        Target: {target}
        Your output should be your improved version of the target text only. Do not add any comments or explanations before or after the improved version of the target text.
        """

        # Prepare API payload
        payload = {
            "model": CHATGPT_MODELS["GPT-4"],
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            "temperature": temperature,
            "top_p": top_p,
            "max_tokens": 512
        }

        headers = {
            "Authorization": f"Bearer {OPENAI_API_KEY}",
            "Content-Type": "application/json"
        }

        # Call OpenAI API (timeout added so a stalled request cannot hang the UI forever)
        response = requests.post(OPENAI_API_URL, headers=headers, json=payload, timeout=60)
        response.raise_for_status()
        data = response.json()

        # Extract improved translation
        output = data["choices"][0]["message"]["content"]
        improved_translation = output.split("Improved Translation:")[-1].strip()
        improved_translations.append(improved_translation)

        # Add sentence pair to the list
        sentence_pairs.append([source, target, improved_translation, reference])

    # Calculate ChrF scores
    beam_chrf_scores = [
        calculate_chrf(beam_translation, reference)
        for beam_translation, reference in zip(beam_search_translations, reference_translations)
    ]
    improved_chrf_scores = [
        calculate_chrf(improved_translation, reference)
        for improved_translation, reference in zip(improved_translations, reference_translations)
    ]

    # Calculate COMET scores
    beam_comet_scores = calculate_comet(source_sentences, beam_search_translations, reference_translations)
    improved_comet_scores = calculate_comet(source_sentences, improved_translations, reference_translations)

    # Calculate average scores
    average_beam_chrf = sum(beam_chrf_scores) / len(beam_chrf_scores)
    average_improved_chrf = sum(improved_chrf_scores) / len(improved_chrf_scores)
    average_beam_comet = sum(beam_comet_scores) / len(beam_comet_scores)
    average_improved_comet = sum(improved_comet_scores) / len(improved_comet_scores)

    # Absolute score changes (draft 2 minus draft 1)
    chrf_change = average_improved_chrf - average_beam_chrf
    comet_change = average_improved_comet - average_beam_comet

    # Prepare dataframes
    sentence_pairs_df = sentence_pairs  # Dataframe for sentence pairs
    scores_df = [
        ["ChrF", round(average_beam_chrf, 2), round(average_improved_chrf, 2), round(chrf_change, 2)],
        ["COMET", round(average_beam_comet, 2), round(average_improved_comet, 2), round(comet_change, 2)]
    ]

    # BUG FIX: the old message computed average_improved / change, which is a
    # meaningless ratio and raises ZeroDivisionError when the change is zero.
    # Report the relative change against the draft-1 baseline instead, guarding
    # against a zero baseline.
    chrf_pct = (chrf_change / average_beam_chrf * 100) if average_beam_chrf else 0.0
    comet_pct = (comet_change / average_beam_comet * 100) if average_beam_comet else 0.0
    evaluation_message = f"ChrF Change: {chrf_pct:.2f}%, COMET Change: {comet_pct:.2f}%"
    return sentence_pairs_df, scores_df, evaluation_message
| 100 |
+
|
| 101 |
+
# Gradio interface: one text box for the system prompt plus two sliders for
# the OpenAI sampling parameters; outputs are two tables and a summary string,
# matching the 3-tuple returned by improve_translations.
demo = gr.Interface(
    fn=improve_translations,
    inputs=[
        gr.Textbox(label="System Prompt", placeholder="Define the assistant's behavior here..."),
        # Slider ranges mirror the OpenAI API limits for these parameters.
        gr.Slider(value=1, minimum=0, maximum=1.9, step=0.1, label="Temperature"),
        gr.Slider(value=1, minimum=0, maximum=1, step=0.01, label="Top P")
    ],
    outputs=[
        # "Draft 1" = beam-search translation, "Draft 2" = GPT-4 post-edit.
        gr.Dataframe(headers=["Source text", "Draft 1", "Draft 2", "Reference"], label="Sentence Pairs"),
        gr.Dataframe(headers=["Metric", "Draft 1", "Draft 2", "Change"], label="Scores"),
        gr.Textbox(label="Evaluation Results")
    ],
    title="Translation Post-Editing and Evaluation",
    description="Improve translations using GPT-4 and evaluate the results with ChrF and COMET."
)

# NOTE(review): share=True publishes a world-reachable tunnel URL — confirm
# this is intended before deploying.
demo.launch(share=True)
setup.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from setuptools import setup, find_packages

# Packaging metadata for the translator_evaluator project.
setup(
    name="translator_evaluator",
    version="0.1",
    author="Morgan Kavanagh",
    description="A package for machine translation evaluation using BertScore, COMET, BLEU, and ChatGPT integration.",
    packages=find_packages(),
    python_requires=">=3.6",
    install_requires=[
        "torch",
        "transformers",
        "numpy",
        "gradio",
        "requests",
        "unbabel-comet",
        # nltk supplies sentence_bleu/SmoothingFunction used by evaluator/bleu.py.
        "nltk",
        # BUG FIX: the "pathlib" entry was removed — pathlib has been in the
        # standard library since Python 3.4, and the obsolete PyPI backport it
        # pulled in can shadow the stdlib module and break installation.
    ],
    # tests_require is deprecated in setuptools; declare test-only
    # dependencies (pytest) as an extra instead: pip install .[test]
    extras_require={
        "test": ["pytest"],
    },
)
|