File size: 4,576 Bytes
671cd2f db031ee 94b0caa 1bdf4e2 3ad54a9 7e7da2b 3ad54a9 1bdf4e2 671cd2f db031ee 671cd2f db031ee 9b6e58a db031ee e63b020 db031ee e63b020 db031ee e63b020 db031ee e63b020 db031ee e63b020 db031ee e63b020 db031ee e63b020 db031ee e63b020 db031ee e63b020 db031ee |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 |
import streamlit as st
from keras.layers import LSTM, Dropout, Bidirectional, Dense,Embedding,Flatten,Maximum,Activation,Conv2D,LayerNormalization,add\
, BatchNormalization, SpatialDropout1D ,Input,Layer,Multiply,Reshape ,Add, GRU,Concatenate,Conv1D,TimeDistributed,ZeroPadding1D,concatenate,MaxPool1D,GlobalMaxPooling1D
import keras.backend as K
from keras import initializers, regularizers, constraints, activations
from keras.initializers import Constant
from keras import Model
import sys
import json
import pandas as pd
import numpy as np
# Lookup tables consumed by create_feature_array():
#   CHARS_MAP         - character -> integer id
#   CHAR_TYPE_FLATTEN - character -> character-type key
#   CHAR_TYPES_MAP    - character-type key -> integer id
# The files are opened explicitly as UTF-8 so that decoding does not depend on
# the platform's default locale encoding.
with open('CHAR_TYPES_MAP.json', encoding='utf-8') as json_file:
    CHAR_TYPES_MAP = json.load(json_file)
with open('CHARS_MAP.json', encoding='utf-8') as json_file:
    CHARS_MAP = json.load(json_file)
with open('CHAR_TYPE_FLATTEN.json', encoding='utf-8') as json_file:
    CHAR_TYPE_FLATTEN = json.load(json_file)
class TimestepDropout(Dropout):
    """Dropout variant that supplies a ``(dim0, dim1, 1)`` noise shape.

    The size-1 last axis means a single keep/drop decision is shared across
    the final axis of the input (per Keras' ``noise_shape`` semantics).

    NOTE(review): assumes rank-3 inputs such as (batch, timesteps, features),
    and relies on the installed Keras ``Dropout`` calling
    ``_get_noise_shape`` - confirm for the Keras version in use.
    """

    def __init__(self, rate, **kwargs):
        super().__init__(rate, **kwargs)

    def _get_noise_shape(self, inputs):
        # Dynamic shape, so the first two dimensions may be unknown at
        # graph-construction time.
        dims = K.shape(inputs)
        return (dims[0], dims[1], 1)
def model_(n_gram = 21):
    """Build the two-input character-window classifier.

    The network takes a window of ``n_gram`` character ids (``char_input``)
    and the matching window of character-type ids (``type_input``) and emits
    a single sigmoid score.

    Args:
        n_gram: Length of the character window fed to both inputs.
            Defaults to 21.
            NOTE(review): the largest convolution kernel below is 12, so
            windows shorter than that are presumably rejected by ``Conv1D``;
            previously saved weights are presumably tied to the window
            length they were trained with - confirm before changing it.

    Returns:
        An uncompiled ``keras.Model`` with inputs ``[char_input, type_input]``
        and one sigmoid output unit.
    """
    # Layers are created in a fixed order on purpose: positional weight
    # loading depends on it, so do not reorder the statements below.
    input1 = Input(shape=(n_gram,), dtype='float32', name='char_input')
    input2 = Input(shape=(n_gram,), dtype='float32', name='type_input')

    # Character branch: 178-entry vocabulary embedded into 32 dimensions.
    a = Embedding(178, 32, input_length=n_gram)(input1)
    a = SpatialDropout1D(0.15)(a)
    char_input = BatchNormalization()(a)

    # Bank of 1-D convolutions, one per [kernel_size, n_filters] pair.
    a_concat = []
    filters = [[1, 200], [2, 200], [3, 200], [4, 200], [5, 200], [6, 200],
               [8, 200], [11, 150], [12, 100]]
    for (window_size, filters_size) in filters:
        convs = Conv1D(filters=filters_size, kernel_size=window_size, strides=1)(char_input)
        convs = Activation('elu')(convs)
        convs = TimeDistributed(Dense(5, input_shape=(n_gram, filters_size)))(convs)
        # The convolution shortens the sequence by window_size - 1 steps;
        # pad the tail so every branch has the same length for Maximum().
        convs = ZeroPadding1D(padding=(0, window_size - 1))(convs)
        a_concat.append(convs)
    token_max = Maximum()(a_concat)

    # Recurrent view of the same character embedding.
    lstm_char = Bidirectional(
        LSTM(
            128,
            return_sequences=True,
            kernel_regularizer=regularizers.L2(0.0000001),
            bias_regularizer=regularizers.L2(0.0000001),
        )
    )(char_input)
    lstm_char = Dense(64, activation='elu')(lstm_char)

    # Character-type branch: 12 type ids embedded into 12 dimensions.
    b = Embedding(12, 12, input_length=n_gram)(input2)
    type_inputs = SpatialDropout1D(0.15)(b)

    # Merge all per-timestep features, then classify the flattened window.
    x = Concatenate()([type_inputs, char_input, lstm_char, token_max])
    x = BatchNormalization()(x)
    x = Flatten()(x)
    x = Dense(100, activation='elu')(x)
    x = Dropout(0.2)(x)
    out = Dense(
        1,
        activation='sigmoid',
        dtype='float32',
        kernel_regularizer=regularizers.L2(0.01),
        bias_regularizer=regularizers.L2(0.01),
    )(x)

    model = Model(inputs=[input1, input2], outputs=out)
    return model
def create_feature_array(text, n_pad=21):
    """Turn *text* into per-character context windows of ids.

    For every character a window of ``n_pad`` symbols is assembled in this
    order: the characters that follow it, the characters that precede it
    (nearest first), and finally the character itself. The text is padded
    with spaces on both sides so that edge characters get full windows.

    Args:
        text: Sequence of characters to featurise.
        n_pad: Window width; half of ``n_pad - 1`` is taken from each side.

    Returns:
        A pair ``(x_char, x_type)`` of float NumPy arrays holding, per
        character of *text*, the character ids and character-type ids of
        its window.
    """
    half = int((n_pad - 1) / 2)
    margin = [' '] * half
    padded = margin + list(text) + margin

    char_rows = []
    type_rows = []
    for center in range(half, half + len(text)):
        following = padded[center + 1: center + half + 1]
        preceding = padded[center - half: center][::-1]
        window = following + preceding + [padded[center]]

        # NOTE(review): unknown characters map to id 179, while model_()
        # declares Embedding(178, ...) - verify this id is within range.
        char_rows.append([CHARS_MAP.get(ch, 179) for ch in window])
        # Characters without a known type fall back to type key 'o';
        # unknown type keys fall back to id 4.
        type_rows.append(
            [CHAR_TYPES_MAP.get(CHAR_TYPE_FLATTEN.get(ch, 'o'), 4) for ch in window]
        )

    x_char = np.array(char_rows).astype(float)
    x_type = np.array(type_rows).astype(float)
    return x_char, x_type
def tokenize(text):
    """Split *text* into tokens using the module-level ``model``.

    Each character is featurised with ``create_feature_array`` (window of
    21), scored by ``model.predict``, and thresholded. A positive prediction
    at position ``i + 1`` closes the token ending at position ``i``; the last
    character always closes a token.

    Args:
        text: The string to segment.

    Returns:
        A list of token strings whose concatenation equals *text*. Falsy
        input (empty string or ``None``) yields ``['']``.
    """
    n_pad = 21
    if not text:
        return ['']

    x_char, x_type = create_feature_array(text, n_pad=n_pad)
    y_predict = model.predict([x_char, x_type], batch_size = 512)
    # Decision threshold applied to the model's per-character score.
    y_predict = (y_predict.ravel() > 0.46542968749999997).astype(int)
    # Shift by one so each flag says "a token ends at this character";
    # force a final flag so the trailing token is always emitted.
    word_end = y_predict[1:].tolist() + [1]

    tokens = []
    pending = []
    for char, w_e in zip(text, word_end):
        pending.append(char)
        if w_e:
            tokens.append(''.join(pending))
            pending = []
    return tokens
# Build the network and restore its weights; tokenize() reads this global.
# NOTE(review): Streamlit re-executes the whole script on each interaction,
# so the model is presumably rebuilt and reloaded every time - consider
# wrapping construction in Streamlit's resource cache (confirm the API
# available in the installed Streamlit version).
model = model_()
model.load_weights("cutto_tf2.h5")

# UI: read the text, segment it, and show the tokens separated by '|'.
text = st.text_area("Enter original text!")
words = tokenize(text)
st.write('|'.join(words))