import re
import pickle
import numpy as np
import random
import torch
# NOTE(review): TrainSolver / PointerNetworks are not referenced by name in
# this module; presumably they are imported so the pickled solver can be
# resolved when it is loaded in setup() -- confirm before removing.
from solver import TrainSolver
from model import PointerNetworks
import gensim
import MeCab
import pysbd
import io


class CPU_Unpickler(pickle.Unpickler):
    """Unpickler that forces embedded torch storages to be loaded on the CPU."""

    def find_class(self, module, name):
        """Resolve a pickled global, intercepting torch's storage loader.

        When the pickle stream asks for ``torch.storage._load_from_bytes``,
        a replacement is returned that calls ``torch.load`` with
        ``map_location='cpu'``. Every other lookup is delegated to the
        standard ``pickle.Unpickler`` behaviour.
        """
        if module == 'torch.storage' and name == '_load_from_bytes':
            return lambda b: torch.load(io.BytesIO(b), map_location='cpu')
        return super().find_class(module, name)


def create_data(doc, fm, split_method):
    """Split *doc* into sentences, tokenise them and encode the tokens.

    Args:
        doc: The text to process.
        fm: Object whose ``vocab[token].index`` gives the integer id of a
            token and raises ``KeyError`` for unknown tokens.
            NOTE(review): presumably a gensim word-vector model -- confirm.
        split_method: ``"pySBD"`` segments with pysbd; any other value
            splits after every "。" and "." instead.

    Returns:
        A tuple ``(altex, allab, fukugenss)``. Each element is a list with
        exactly one entry, and that entry is a per-sentence list:

        * ``altex[0]``     -- numpy arrays of vocabulary indices,
        * ``allab[0]``     -- numpy arrays of labels, all 0 except a final 1,
        * ``fukugenss[0]`` -- lists of the original token strings.
    """
    wakati = MeCab.Tagger("-Owakati -b 81920 -r /etc/mecabrc -d /home/user/app/mecab-ipadic-neologd")

    if split_method == "pySBD":
        # The segmenter is only needed on this branch, so build it here.
        lines = pysbd.Segmenter(language="ja", clean=False).segment(doc)
    else:
        doc = doc.strip().replace("。","。\n").replace(".",".\n")
        # Collapse runs of newlines so no empty sentences are produced.
        lines = re.sub("(\n)+","\n",doc).split("\n")

    labels, text, fukugens = [], [], []
    for line in lines:
        line = line.strip()
        if line == "":
            continue

        # The last piece of the space-split output is discarded
        # (presumably the trailing newline MeCab emits -- confirm).
        sent = wakati.parse(line).split(" ")[:-1]
        if not sent:
            # Nothing to label; the original code raised IndexError on
            # ``label[-1]`` here. Skip the sentence in all three outputs so
            # they stay aligned.
            continue

        texts = []
        for token in sent:
            try:
                texts.append(fm.vocab[token].index)
            except KeyError:
                # NOTE(review): the fallback key is the empty string; this
                # looks like it is meant to be the vocabulary's
                # unknown-token entry -- verify against the vocabulary.
                texts.append(fm.vocab[""].index)

        # Mark the final token of the sentence with 1, all others with 0.
        label = [0] * len(sent)
        label[-1] = 1

        labels.append(np.array(label))
        text.append(np.array(texts))
        fukugens.append(sent)

    # Callers receive each result wrapped in a single-element outer list.
    return [text], [labels], [fukugens]


def generate(doc, mymodel, fm, index2word, split_method):
    """Encode *doc* with ``create_data`` and pass it to the model.

    Args:
        doc: The text to process.
        mymodel: Object providing ``check_accuracy(X, Y, index2word, tokens)``.
        fm: Vocabulary object forwarded to ``create_data``.
        index2word: Forwarded unchanged to ``mymodel.check_accuracy``.
        split_method: Sentence splitting mode forwarded to ``create_data``.

    Returns:
        Whatever ``mymodel.check_accuracy`` returns.
    """
    X_tes, Y_tes, fukugen = create_data(doc, fm, split_method)
    output_texts = mymodel.check_accuracy(X_tes, Y_tes, index2word, fukugen)
    return output_texts


def setup():
    """Load the pickled solver, vocabulary model and index-to-word mapping.

    The three files are opened by relative path, so they are resolved
    against the current working directory.

    Returns:
        A tuple ``(mysolver, fm, index2word)``.
    """
    # SECURITY: unpickling can execute arbitrary code. These files must come
    # from a trusted source.
    with open('index2word.pickle', 'rb') as f:
        index2word = pickle.load(f)
    with open('model.pickle', 'rb') as f:
        # CPU_Unpickler maps torch storages in the pickle onto the CPU.
        mysolver = CPU_Unpickler(f).load()
    with open('fm.pickle', 'rb') as f:
        fm = pickle.load(f)
    return mysolver, fm, index2word