def load_ctc_pred(path_to_code_pred_file):
    """Load a word -> label lookup from a two-column, tab-separated file.

    Each line is stripped of surrounding whitespace and split on tabs; it
    must yield exactly two fields (word, label). Any line that does not
    (blank lines, lines with fewer or more fields) is echoed to stdout and
    skipped. When a word occurs more than once, the label from its last
    occurrence wins.

    Args:
        path_to_code_pred_file: Path of the TSV file to read.

    Returns:
        A dict mapping each word to its label string.
    """
    word_ctc_pred_dict = {}
    # The context manager guarantees the handle is closed, even on error.
    with open(path_to_code_pred_file) as pred_file:
        for line in pred_file:
            line = line.strip()
            try:
                # Unpacking is the only step that can fail here, and it
                # fails with ValueError when the field count is not two.
                word, label = line.split("\t")
            except ValueError:
                print(line)
                continue
            word_ctc_pred_dict[word] = label

    return word_ctc_pred_dict
		
		
def _is_skipped_sentence(text):
    """Return True when the joined sentence text is a marker line to drop."""
    if "code omitted for annotation" in text and "CODE_BLOCK :" in text:
        return True
    if "omitted for annotation" in text and "OP_BLOCK :" in text:
        return True
    return "Question_URL :" in text or "Question_ID :" in text


def _read_gold_sentences(path_main_file):
    """Read the main file into sentences of [word, raw name, raw label, gold label]."""
    sentences = []
    sentence = []
    with open(path_main_file) as main_file:
        for line in main_file:
            fields = line.split()
            if not fields:
                # Blank line: close the current sentence, unless it is one
                # of the marker sentences that must not be emitted.
                if sentence:
                    text = " ".join(word_info[0] for word_info in sentence)
                    if not _is_skipped_sentence(text):
                        sentences.append(sentence)
                    sentence = []
                continue

            # Columns are read as: gold word, gold label, raw word, raw label.
            # Indexing (not unpacking) keeps IndexError for short lines.
            gold_word = fields[0]
            gold_label = fields[1]
            raw_label = fields[3]

            # '-----' stands in for a space inside a single token.
            gold_word = gold_word.replace('-----', ' ')

            # Every non-"O" gold label is collapsed to the single tag "B-Name".
            if gold_label != "O":
                gold_label = "B-Name"

            raw_label_name = raw_label.replace("B-", "").replace("I-", "")
            sentence.append([gold_word, raw_label_name, raw_label, gold_label])
    return sentences


def _read_pred_sentences(path_segmenter_pred_file):
    """Read the segmenter prediction file into sentences of [word, label]."""
    sentences_preds = []
    sentence_pred = []
    with open(path_segmenter_pred_file) as pred_file:
        for line in pred_file:
            fields = line.split()
            if not fields:
                if sentence_pred:
                    sentences_preds.append(sentence_pred)
                    sentence_pred = []
                continue
            # The word is everything except the last two columns (it may
            # contain spaces); the predicted label is the last column.
            sentence_pred.append([' '.join(fields[:-2]), fields[-1]])
    return sentences_preds


def load_sentences_so_w_pred(path_main_file, path_segmenter_pred_file,   word_ctc_pred_dict, op_file):
    """
    Merge gold sentences with segmenter and CTC predictions and write them out.

    Both input files hold one whitespace-separated token per line, with
    sentences separated by blank lines. A sentence is only closed by a blank
    line, so a final sentence that is not followed by one is not emitted.

    Main-file lines need at least four columns, read as gold word, gold
    label, raw word, raw label. Sentences whose joined text contains the
    "CODE_BLOCK :" / "OP_BLOCK :" omitted-for-annotation markers,
    "Question_URL :" or "Question_ID :" are dropped. Prediction-file
    sentences are matched to the kept main-file sentences by position, and
    tokens within a sentence are matched by position as well.

    Args:
        path_main_file: Path of the main (gold) file.
        path_segmenter_pred_file: Path of the segmenter prediction file.
        word_ctc_pred_dict: Mapping from stripped gold word to its CTC
            prediction label; every gold word must be present.
        op_file: Path of the output file. Each sentence is written as an
            empty line followed by one tab-separated line per word.

    Returns:
        A list of sentences; each word is
        [gold_word, raw_label_name, raw_label, ctc_pred, pred_seg_label, gold_label].

    Raises:
        IndexError: If a main-file line has fewer than four columns, or the
            prediction file has fewer sentences / tokens than the main file.
        KeyError: If a gold word is missing from word_ctc_pred_dict.
    """
    sentences = _read_gold_sentences(path_main_file)
    sentences_preds = _read_pred_sentences(path_segmenter_pred_file)

    print(len(sentences_preds),len(sentences))

    pred_merged_sentences = []
    for sent_index, main_sent in enumerate(sentences):
        pred_sent = sentences_preds[sent_index]

        new_sent = []
        for word_index, word_info in enumerate(main_sent):
            gold_word, raw_label_name, raw_label, gold_label = word_info
            pred_seg_label = pred_sent[word_index][1]
            ctc_pred = word_ctc_pred_dict[gold_word.strip()]
            new_sent.append([gold_word, raw_label_name, raw_label, ctc_pred, pred_seg_label, gold_label])

        if new_sent:
            pred_merged_sentences.append(new_sent)

    # Opened only after merging succeeded, so a failed merge never truncates
    # an existing output file; the context manager flushes and closes it.
    with open(op_file, 'w') as fout:
        for sent in pred_merged_sentences:
            fout.write("\n")
            for word_info in sent:
                gold_word, _, _, ctc_pred, pred_seg_label, gold_label = word_info
                opline = gold_word+"\t"+gold_label+"\t"+"CTC_PRED:"+ctc_pred+"\t"+"pred_seg_label:"+pred_seg_label+"\n"
                fout.write(opline)
                print(opline)

    return pred_merged_sentences
                

if __name__ == '__main__':
    # Word -> CTC prediction lookup, loaded once and passed to the merge step.
    word_ctc_pred_dict = load_ctc_pred("auxilary_inputs_ner/ctc_pred.tsv")

    # Only the train split is generated. For the test / dev splits, swap
    # "train" for "test" / "dev" in both input paths and write to
    # "test.txt" / "dev.txt" instead.
    load_sentences_so_w_pred(
        path_main_file="CONLL_Data/train_gold_raw_merged_merged_labels.txt",
        path_segmenter_pred_file="auxilary_inputs_ner/segmenter_pred/segmenter_pred_train.txt",
        word_ctc_pred_dict=word_ctc_pred_dict,
        op_file="train_seg.txt",
    )