def load_ctc_pred(path_to_code_pred_file):
    """Load per-word CTC predictions from a two-column TSV file.

    Each line is stripped and split on tab characters. Lines that split
    into exactly two fields are stored as ``word -> label``; a later
    occurrence of the same word overwrites an earlier one. Any other line
    (including blank lines) is printed to stdout and skipped.

    Args:
        path_to_code_pred_file: Path of the ``word<TAB>label`` file.

    Returns:
        A dict mapping each word to its label string.
    """
    word_ctc_pred_dict = {}
    with open(path_to_code_pred_file) as pred_file:
        for line in pred_file:
            line = line.strip()
            try:
                word, label = line.split("\t")
            except ValueError:
                # Not exactly two tab-separated fields: echo it and move on.
                print(line)
                continue
            word_ctc_pred_dict[word] = label
    return word_ctc_pred_dict


def _is_skipped_sentence(sentence):
    """Return True for placeholder/metadata sentences that are left out."""
    text = " ".join(word_info[0] for word_info in sentence)
    return (
        ("code omitted for annotation" in text and "CODE_BLOCK :" in text)
        or ("omitted for annotation" in text and "OP_BLOCK :" in text)
        or "Question_URL :" in text
        or "Question_ID :" in text
    )


def _read_gold_sentences(path_main_file):
    """Read the gold/raw file into a list of kept sentences.

    Every word becomes ``[gold_word, raw_label_name, raw_label, gold_label]``.
    """
    sentences = []
    sentence = []
    with open(path_main_file) as main_file:
        for line in main_file:
            line_values = line.split()
            if not line_values:
                # A blank line terminates the current sentence.
                if sentence and not _is_skipped_sentence(sentence):
                    sentences.append(sentence)
                sentence = []
                continue

            # Columns: gold word, gold label, raw word (unused), raw label.
            # Indexing (not unpacking) keeps IndexError for short lines.
            gold_word = line_values[0]
            gold_label = line_values[1]
            raw_label = line_values[3]

            # '-----' stands in for a space inside a multi-token gold word.
            gold_word = " ".join(gold_word.split("-----"))
            # Every entity label is collapsed to the single tag "B-Name".
            if gold_label != "O":
                gold_label = "B-Name"
            raw_label_name = raw_label.replace("B-", "").replace("I-", "")
            sentence.append([gold_word, raw_label_name, raw_label, gold_label])

    # Keep the last sentence when the file does not end with a blank line.
    if sentence and not _is_skipped_sentence(sentence):
        sentences.append(sentence)
    return sentences


def _read_pred_sentences(path_segmenter_pred_file):
    """Read the segmenter output into sentences of ``[pred_word, pred_label]``."""
    sentences_preds = []
    sentence_pred = []
    with open(path_segmenter_pred_file) as pred_file:
        for line in pred_file:
            line_values = line.split()
            if not line_values:
                if sentence_pred:
                    sentences_preds.append(sentence_pred)
                sentence_pred = []
                continue
            # The last column is the predicted label; everything before the
            # last two columns is treated as the (possibly multi-token) word.
            pred_word = " ".join(line_values[:-2])
            pred_label = line_values[-1]
            sentence_pred.append([pred_word, pred_label])

    # Keep the last sentence when the file does not end with a blank line.
    if sentence_pred:
        sentences_preds.append(sentence_pred)
    return sentences_preds


def load_sentences_so_w_pred(path_main_file, path_segmenter_pred_file, word_ctc_pred_dict, op_file):
    """Merge gold sentences with segmenter and CTC predictions and write them out.

    The main file holds one word per line with at least four
    whitespace-separated columns (gold word, gold label, raw word, raw
    label); sentences are separated by empty lines. Sentences whose text
    contains the "omitted for annotation" CODE_BLOCK/OP_BLOCK placeholders,
    "Question_URL :" or "Question_ID :" are left out. Any gold label other
    than "O" is replaced by "B-Name".

    Gold and predicted sentences are paired purely by position (sentence
    index, then word index); extra predicted sentences or words are ignored.

    Each merged sentence is written to ``op_file`` as an empty line followed
    by one line per word:
    ``gold_word<TAB>gold_label<TAB>CTC_PRED:<ctc><TAB>pred_seg_label:<seg>``.
    Every written line is also printed to stdout, as are the numbers of
    predicted and gold sentences.

    Args:
        path_main_file: Path of the gold/raw merged file.
        path_segmenter_pred_file: Path of the segmenter prediction file.
        word_ctc_pred_dict: Mapping from (stripped) gold word to CTC label.
        op_file: Path of the output file; it is overwritten.

    Returns:
        A list of sentences, each a list of
        ``[gold_word, raw_label_name, raw_label, ctc_pred, pred_seg_label,
        gold_label]`` entries.

    Raises:
        IndexError: If a word line in the main file has fewer than four
            columns, or the prediction file has fewer sentences (or fewer
            words in a sentence) than the main file.
        KeyError: If a gold word is missing from ``word_ctc_pred_dict``.
    """
    sentences = _read_gold_sentences(path_main_file)
    sentences_preds = _read_pred_sentences(path_segmenter_pred_file)
    print(len(sentences_preds), len(sentences))

    pred_merged_sentences = []
    for sent_index, main_sent in enumerate(sentences):
        pred_sent = sentences_preds[sent_index]
        new_sent = []
        for word_index, word_info in enumerate(main_sent):
            gold_word, raw_label_name, raw_label, gold_label = word_info
            _pred_word, pred_seg_label = pred_sent[word_index]
            ctc_pred = word_ctc_pred_dict[gold_word.strip()]
            new_sent.append(
                [gold_word, raw_label_name, raw_label, ctc_pred, pred_seg_label, gold_label]
            )
        if new_sent:
            pred_merged_sentences.append(new_sent)

    with open(op_file, 'w') as fout:
        for sent in pred_merged_sentences:
            # Each sentence is preceded (not followed) by an empty line.
            fout.write("\n")
            for word_info in sent:
                gold_word, _raw_label_name, _raw_label, ctc_pred, pred_seg_label, gold_label = word_info
                opline = gold_word+"\t"+gold_label+"\t"+"CTC_PRED:"+ctc_pred+"\t"+"pred_seg_label:"+pred_seg_label+"\n"
                fout.write(opline)
                print(opline)

    return pred_merged_sentences


if __name__ == '__main__':
    path_to_code_pred_file = "auxilary_inputs_ner/ctc_pred.tsv"
    word_ctc_pred_dict = load_ctc_pred(path_to_code_pred_file)

    path_main_file = "CONLL_Data/train_gold_raw_merged_merged_labels.txt"
    path_segmenter_pred_file = "auxilary_inputs_ner/segmenter_pred/segmenter_pred_train.txt"
    op_file = "train_seg.txt"
    load_sentences_so_w_pred(path_main_file, path_segmenter_pred_file, word_ctc_pred_dict, op_file)

    # Only the train split is processed here. The test and dev splits use the
    # same call with:
    #   CONLL_Data/test_gold_raw_merged_merged_labels.txt,
    #   auxilary_inputs_ner/segmenter_pred/segmenter_pred_test.txt -> test.txt
    #   CONLL_Data/dev_gold_raw_merged_merged_labels.txt,
    #   auxilary_inputs_ner/segmenter_pred/segmenter_pred_dev.txt -> dev.txt