live-lm-critic / gec /scripts /get_corr_from_m2.py
Olivia Figueira
Upload code with streamlit addition
b6e5241
raw history blame
No virus
1.23 kB
import argparse
# Apply the edits of a single annotator to generate the corrected sentences.
def main(args):
    """Apply one annotator's M2 edits and write the corrected sentences.

    The input file is read as blank-line-separated blocks. The first line
    of a block is the source sentence (its leading "S" token is dropped) and
    every following line is an edit whose "|||"-separated fields are read as:
    field 0 "A <start> <end>", field 1 the error type, field 2 the
    replacement tokens, and the last field the annotator id.

    Args:
        args: An object with the attributes ``m2_file`` (input path),
            ``out`` (output path) and ``id`` (integer annotator id whose
            edits are applied).

    One corrected sentence is written per non-empty block, tokens joined by
    single spaces. Empty blocks (e.g. from extra blank lines or an empty
    input file) are skipped instead of crashing or emitting a blank line.
    """
    # Do not apply edits with these error types
    skip = {"noop", "UNK", "Um"}

    # Read everything up front and close the input before writing, so the
    # handles never outlive this function.
    with open(args.m2_file, encoding="utf-8") as m2_in:
        blocks = m2_in.read().strip().split("\n\n")

    with open(args.out, "w", encoding="utf-8") as out:
        for block in blocks:
            # Three or more consecutive newlines leave stray newlines (or a
            # wholly empty chunk) after splitting on "\n\n"; normalise them.
            block = block.strip()
            if not block:
                continue
            lines = block.split("\n")
            cor_sent = lines[0].split()[1:]  # Ignore "S "
            # Running difference between original and corrected token
            # positions; edit spans always refer to the original sentence.
            offset = 0
            for line in lines[1:]:
                edit = line.split("|||")
                if edit[1] in skip:
                    continue  # Ignore certain edits
                if int(edit[-1]) != args.id:
                    continue  # Ignore other coders
                span = edit[0].split()[1:]  # Ignore "A "
                start, end = int(span[0]), int(span[1])
                cor = edit[2].split()
                cor_sent[start + offset:end + offset] = cor
                offset += len(cor) - (end - start)
            out.write(" ".join(cor_sent) + "\n")
if __name__ == "__main__":
    # Declare the command-line options, parse them, and run the conversion.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "m2_file",
        help="The path to an input m2 file.",
    )
    cli.add_argument(
        "-out",
        required=True,
        help="A path to where we save the output corrected text file.",
    )
    cli.add_argument(
        "-id",
        type=int,
        default=0,
        help="The id of the target annotator in the m2 file.",
    )
    args = cli.parse_args()
    main(args)