# clinical_segment_splitter / run_segbot.py
# Author: ando55
# Commit e3ce53f: "Update run_segbot.py"
import re
import pickle
import numpy as np
import random
import torch
from solver import TrainSolver
from model import PointerNetworks
import gensim
import MeCab
import pysbd
import io
class CPU_Unpickler(pickle.Unpickler):
    """Unpickler that forces pickled torch storages to load onto the CPU.

    Every lookup except ``torch.storage._load_from_bytes`` is delegated to
    the standard ``pickle.Unpickler``.
    """

    def find_class(self, module, name):
        """Resolve the global ``module.name`` referenced by the pickle stream.

        Args:
            module: Dotted module name recorded in the pickle.
            name: Attribute name recorded in the pickle.

        Returns:
            For ``torch.storage._load_from_bytes``, a callable that takes the
            raw bytes and loads them with ``torch.load(...,
            map_location='cpu')``; otherwise whatever the base class resolves.
        """
        if module == 'torch.storage' and name == '_load_from_bytes':
            # Replace torch's byte loader with one that pins map_location to
            # the CPU instead of the device recorded in the serialized data.
            return lambda b: torch.load(io.BytesIO(b), map_location='cpu')
        return super().find_class(module, name)
def create_data(doc, fm, split_method):
    """Split a document into sentences and encode each one for the model.

    Each non-empty sentence is tokenised with MeCab (``-Owakati``) and every
    token is mapped to ``fm.vocab[token].index``; tokens missing from the
    vocabulary fall back to the ``"<unk>"`` entry.

    Args:
        doc: Raw document text.
        fm: Object exposing ``vocab[token].index``. It must contain an
            ``"<unk>"`` entry if any token is out of vocabulary.
        split_method: ``"pySBD"`` to split sentences with pySBD; any other
            value splits after every ``。`` and ``.`` and at newlines.

    Returns:
        A tuple ``(altex, allab, fukugenss)``. Each element is a list with
        exactly one entry (for the single input document), and that entry
        holds one item per non-empty sentence:

        * ``altex``: ``np.ndarray`` of vocabulary indices,
        * ``allab``: ``np.ndarray`` of labels, all 0 except 1 on the last
          token of the sentence,
        * ``fukugenss``: list of the token strings themselves.
    """
    wakati = MeCab.Tagger("-Owakati -b 81920 -r /etc/mecabrc -d /home/user/app/mecab-ipadic-neologd")

    if split_method == "pySBD":
        # Build the segmenter only on the path that actually uses it.
        lines = pysbd.Segmenter(language="ja", clean=False).segment(doc)
    else:
        # Insert a line break after every full stop, then collapse runs of
        # newlines. NOTE: the ASCII "." rule also breaks inside text such as
        # "3.5"; kept as-is because it defines this splitter's behaviour.
        doc = doc.strip().replace("。", "。\n").replace(".", ".\n")
        doc = re.sub(r"\n+", "\n", doc)
        lines = doc.split("\n")

    text, labels, fukugens = [], [], []
    for line in lines:
        line = line.strip()
        if not line:
            continue

        # Split the tagger output on spaces and drop the final piece
        # (presumably the trailing newline MeCab emits -- TODO confirm).
        tokens = wakati.parse(line).split(" ")[:-1]
        if not tokens:
            # No tokens to encode; without this guard the last-token label
            # assignment below would raise IndexError.
            continue

        indices = []
        for token in tokens:
            try:
                indices.append(fm.vocab[token].index)
            except KeyError:
                indices.append(fm.vocab["<unk>"].index)

        # Only the final token of a sentence is marked with 1.
        label = [0] * len(tokens)
        label[-1] = 1

        text.append(np.array(indices))
        labels.append(np.array(label))
        fukugens.append(list(tokens))

    # The outer list level is a single-document batch.
    return [text], [labels], [fukugens]
def generate(doc, mymodel, fm, index2word, split_method):
    """Encode ``doc`` with ``create_data`` and pass it to the model.

    Args:
        doc: Raw document text.
        mymodel: Object providing
            ``check_accuracy(X, Y, index2word, fukugen)``.
        fm: Vocabulary/embedding object forwarded to ``create_data``.
        index2word: Index-to-token mapping forwarded to ``check_accuracy``.
        split_method: Sentence splitting mode forwarded to ``create_data``.

    Returns:
        Whatever ``mymodel.check_accuracy`` returns for the encoded document.
    """
    X_tes, Y_tes, fukugen = create_data(doc, fm, split_method)
    return mymodel.check_accuracy(X_tes, Y_tes, index2word, fukugen)
def setup():
    """Load the pickled model, vocabulary object and index-to-word mapping.

    Reads ``index2word.pickle``, ``model.pickle`` and ``fm.pickle`` using
    paths relative to the current working directory.

    Returns:
        A tuple ``(mysolver, fm, index2word)``.

    Raises:
        OSError: If one of the three files cannot be opened.
    """
    # SECURITY: unpickling can execute arbitrary code. These files must come
    # from a trusted source; never point this at user-supplied data.
    with open('index2word.pickle', 'rb') as f:
        index2word = pickle.load(f)
    with open('model.pickle', 'rb') as f:
        # The model pickle embeds torch storages; CPU_Unpickler loads them
        # with map_location='cpu'.
        mysolver = CPU_Unpickler(f).load()
    with open('fm.pickle', 'rb') as f:
        fm = pickle.load(f)
    return mysolver, fm, index2word