# ko-translation-leaderbaord / translation2.py
# davidkim205's picture
# Upload folder using huggingface_hub
# 577164e verified
import argparse
import json
import logging
import os

from tqdm import tqdm

from model import load_model, translate_en2ko, translate_ko2en
from utils.simple_bleu import simple_score
def load_json(filename):
    """Load data from a JSON or JSON Lines file.

    A file whose extension is exactly ``.jsonl`` is parsed line by line, one
    JSON document per line, and blank lines are skipped. Any other extension
    is parsed as a single JSON document.

    Args:
        filename: Path of the UTF-8 encoded file to read.

    Returns:
        For ``.jsonl`` files, a list with one parsed object per non-blank
        line (empty list for an empty file). Otherwise, whatever the single
        JSON document decodes to.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    is_jsonl = os.path.splitext(filename)[1] == ".jsonl"
    with open(filename, "r", encoding="utf-8") as f:
        if not is_jsonl:
            return json.load(f)
        # Skip blank or whitespace-only lines (e.g. an extra newline at the
        # end of the file); json.loads would reject them with an error.
        return [json.loads(line) for line in f if line.strip()]
def save_json(json_data, filename, option="a"):
    """Write data to a JSON or JSON Lines file, creating parent directories.

    Every space in ``filename`` (directory part included) is replaced with an
    underscore before the path is used. If the resulting name ends with
    ``.jsonl`` each item of ``json_data`` is written as one JSON document per
    line; otherwise ``json_data`` is dumped as a single document indented by
    four spaces. Non-ASCII characters are written as-is, not escaped.

    Args:
        json_data: An iterable of JSON-serialisable records for ``.jsonl``
            targets, or a single JSON-serialisable object otherwise.
        filename: Destination path.
        option: Mode string passed to ``open``; the default ``"a"`` appends
            to an existing file.

    Raises:
        OSError: If the directory or file cannot be created or opened.
        TypeError: If the data is not JSON-serialisable.
    """
    # Normalise the path first so the directory that gets created is the
    # same one the file is opened in, even when a directory name has spaces.
    filename = filename.replace(" ", "_")
    directory = os.path.dirname(filename)
    # A bare file name has no directory part, and os.makedirs("") raises.
    if directory:
        os.makedirs(directory, exist_ok=True)

    with open(filename, option, encoding="utf-8") as f:
        if not filename.endswith(".jsonl"):
            json.dump(json_data, f, ensure_ascii=False, indent=4)
        else:
            for data in json_data:
                json.dump(data, f, ensure_ascii=False)
                f.write("\n")
def task_bleu(data):
    """Translate one sample and score the output against its reference.

    The first conversation turn is the prompt (instruction line followed by
    the text to translate) and the second turn is the reference translation.
    A prompt containing the "translate into Korean" instruction is treated as
    English source text and translated to Korean; any other prompt is treated
    as Korean and translated to English.

    Args:
        data: Record with a ``"conversations"`` list whose items carry their
            text under ``"value"``.

    Returns:
        A dict with ``"reference"``, ``"generation"`` (empty string when the
        translation call raised), ``"bleu"`` (score from ``simple_score``
        rounded to three decimals) and ``"lang"`` (``"en"`` or ``"ko"``, the
        detected source language).
    """
    # Korean instruction markers. The first reads "Translate into Korean.";
    # the second ("Translate.") is the tail of the instruction line, after
    # which the text to translate begins. They must be real UTF-8 Korean
    # text: a mis-decoded (mojibake) literal can never match prompts that
    # were read from a UTF-8 file.
    to_korean_marker = "한글로 번역하세요."
    instruction_end = "번역하세요.\n"

    chat = data["conversations"]
    prompt = chat[0]["value"]
    reference = chat[1]["value"]

    cur_lang = "en" if to_korean_marker in prompt else "ko"
    # Keep only what follows the instruction line; a prompt without the
    # marker is used unchanged.
    source_text = prompt.split(instruction_end, 1)[-1]

    try:
        if cur_lang == "en":
            generation = translate_en2ko(source_text)
        else:
            generation = translate_ko2en(source_text)
    except Exception:
        # Best effort: one failing sample must not abort the whole run, but
        # the failure is logged instead of being silently discarded.
        logging.getLogger(__name__).exception(
            "Translation failed (source language %s); scoring empty output",
            cur_lang,
        )
        generation = ""

    return {
        "reference": reference,
        "generation": generation,
        "bleu": round(simple_score(reference, generation), 3),
        "lang": cur_lang,
    }
def task_self_bleu(data):
    """Round-trip translate one sample and score it against its own source.

    The text of the first conversation turn is translated into the other
    language and then translated back; the back-translation is scored against
    the original text. A prompt containing the "translate into Korean"
    instruction is treated as English source text, any other prompt as
    Korean.

    Args:
        data: Record with a ``"conversations"`` list whose first item carries
            the prompt under ``"value"``.

    Returns:
        A dict with ``"reference"`` (the source text without its instruction
        line), ``"generation"`` (the back-translation), ``"generation1"``
        (the intermediate translation), ``"bleu"`` (score from
        ``simple_score`` rounded to three decimals) and ``"lang"`` (``"en"``
        or ``"ko"``, the detected source language). A translation step that
        raises yields an empty string.
    """
    # Korean instruction markers. The first reads "Translate into Korean.";
    # the second ("Translate.") is the tail of the instruction line, after
    # which the text to translate begins. They must be real UTF-8 Korean
    # text: a mis-decoded (mojibake) literal can never match prompts that
    # were read from a UTF-8 file.
    to_korean_marker = "한글로 번역하세요."
    instruction_end = "번역하세요.\n"

    prompt = data["conversations"][0]["value"]
    cur_lang = "en" if to_korean_marker in prompt else "ko"
    # Keep only what follows the instruction line; a prompt without the
    # marker is used unchanged.
    source_text = prompt.split(instruction_end, 1)[-1]
    next_lang = "ko" if cur_lang == "en" else "en"

    def translate(text, lang):
        """Translate ``text`` written in ``lang``; return "" on failure."""
        try:
            if lang == "en":
                return translate_en2ko(text)
            return translate_ko2en(text)
        except Exception:
            # Best effort: one failing sample must not abort the whole run,
            # but the failure is logged instead of being silently discarded.
            logging.getLogger(__name__).exception(
                "Translation failed (source language %s); using empty output",
                lang,
            )
            return ""

    generation1 = translate(source_text, cur_lang)
    # Nothing to translate back if the first step failed or produced no
    # text; skip the second model call and score an empty output.
    generation2 = translate(generation1, next_lang) if generation1 else ""

    return {
        "reference": source_text,
        "generation": generation2,
        "generation1": generation1,
        "bleu": round(simple_score(source_text, generation2), 3),
        "lang": cur_lang,
    }
def main():
    """Run both evaluation tasks over the input file and save the results.

    Parses the command-line arguments, loads the dataset and the model, then
    runs ``task_bleu`` followed by ``task_self_bleu`` on every record. Each
    result is printed as indented JSON and written immediately through
    ``save_json`` so that partial results survive an interrupted run.
    """
    parser = argparse.ArgumentParser("argument")
    parser.add_argument(
        "--input_file",
        default="./data/komt-1810k-test.jsonl",
        type=str,
        help="input_file",
    )
    parser.add_argument(
        "--output",
        default=None,
        type=str,
        help="output file path "
        "(default: results_<task>/<model name>-result.jsonl)",
    )
    parser.add_argument("--model", default="davidkim205/iris-7b", type=str, help="model")
    parser.add_argument("--template_name", default=None, type=str, help="template_name")
    args = parser.parse_args()

    json_data = load_json(args.input_file)
    load_model(args.model, args.template_name)

    def make_output(args, prefix):
        """Return the explicit --output path, or a per-task default path."""
        if args.output:
            return args.output
        filename = args.model.split('/')[-1]
        return f"results_{prefix}/{filename}-result.jsonl"

    def run_task(title, task, prefix):
        """Apply ``task`` to every record, printing and saving each result."""
        print(title)
        # The destination does not change between records; compute it once.
        # NOTE: when --output is given, both tasks write to that same file.
        output = make_output(args, prefix)
        # enumerate(tqdm(...)) rather than tqdm(enumerate(...)): wrapping the
        # list directly lets tqdm see its length and show a total and ETA.
        for index, data in enumerate(tqdm(json_data)):
            result = task(data)
            result['index'] = index
            result['model'] = args.model
            result['src'] = data["src"]
            result['conversations'] = data["conversations"]
            print(json.dumps(result, ensure_ascii=False, indent=2))
            save_json([result], output)

    run_task('task bleu ', task_bleu, 'bleu')
    run_task('task self bleu', task_self_bleu, 'self')
# Script entry point: run the evaluation only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()