Cutto

Running

File size: 4,576 Bytes

671cd2f
db031ee
 
 
 
 
94b0caa
1bdf4e2
3ad54a9
7e7da2b
 
3ad54a9
 
 
 
 
 
 
1bdf4e2
671cd2f
db031ee
671cd2f
db031ee
 
9b6e58a
db031ee
 
 
 
 
 
 
 
 
 
 
e63b020
 
 
db031ee
 
 
e63b020
 
db031ee
 
 
 
 
 
 
 
e63b020
 
 
 
db031ee
 
e63b020
 
db031ee
e63b020
db031ee
e63b020
db031ee
e63b020
db031ee
e63b020
db031ee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e63b020
db031ee

import streamlit as st
from keras.layers import LSTM, Dropout, Bidirectional, Dense,Embedding,Flatten,Maximum,Activation,Conv2D,LayerNormalization,add\
, BatchNormalization, SpatialDropout1D ,Input,Layer,Multiply,Reshape ,Add, GRU,Concatenate,Conv1D,TimeDistributed,ZeroPadding1D,concatenate,MaxPool1D,GlobalMaxPooling1D
import keras.backend as K
from keras import initializers, regularizers, constraints, activations
from keras.initializers import Constant
from keras import Model
import sys
import json
import pandas as pd
import numpy as np

# Lookup tables consumed by ``create_feature_array``:
#   CHAR_TYPES_MAP    - character-type label -> integer id
#   CHARS_MAP         - character -> integer id
#   CHAR_TYPE_FLATTEN - character -> character-type label
# The files are opened explicitly as UTF-8 (the JSON interchange encoding)
# so that decoding does not depend on the platform's default locale.
with open('CHAR_TYPES_MAP.json', encoding='utf-8') as json_file:
    CHAR_TYPES_MAP = json.load(json_file)
with open('CHARS_MAP.json', encoding='utf-8') as json_file:
    CHARS_MAP = json.load(json_file)
with open('CHAR_TYPE_FLATTEN.json', encoding='utf-8') as json_file:
    CHAR_TYPE_FLATTEN = json.load(json_file)


class TimestepDropout(Dropout):
    """Dropout subclass that overrides the shape of the dropout noise mask.

    The noise shape returned is ``(dim0, dim1, 1)`` of the input, i.e. the
    last axis has size 1.
    NOTE(review): presumably the base ``Dropout`` broadcasts this mask over
    the last axis so whole timesteps are dropped together -- confirm that the
    installed Keras version still calls ``_get_noise_shape``.
    """

    def __init__(self, rate, **kwargs):
        # No extra state; forward everything to the stock Dropout layer.
        super().__init__(rate, **kwargs)

    def _get_noise_shape(self, inputs):
        """Return ``(inputs.shape[0], inputs.shape[1], 1)`` as the mask shape."""
        dims = K.shape(inputs)
        return (dims[0], dims[1], 1)


def model_(n_gram=21):
    """Build the two-input character-level Keras model used by ``tokenize``.

    The network combines, along the feature axis, four per-timestep views of
    the input window: the embedded character types, the normalised character
    embedding, a bidirectional LSTM over the characters, and the element-wise
    maximum of several 1-D convolutions with different kernel sizes. The
    concatenation is flattened and reduced to one sigmoid unit.

    Args:
        n_gram: Number of timesteps in each input window. Previously this
            argument was accepted but ignored and 21 was hard-coded; it is now
            used for the input shapes. The default (21) builds exactly the
            same architecture as before. Must be at least 12, the largest
            convolution kernel size.
            NOTE(review): weights saved for the 21-step model presumably only
            load into a model built with the default -- confirm before using
            another value.

    Returns:
        An uncompiled ``keras.Model`` taking ``[char_input, type_input]``
        (each of shape ``(batch, n_gram)``) and producing a ``(batch, 1)``
        sigmoid output.
    """
    # NOTE: the order in which layers are created is kept exactly as in the
    # original implementation; reordering may change how saved weights are
    # matched to layers -- verify before restructuring.
    input1 = Input(shape=(n_gram,), dtype='float32', name='char_input')
    input2 = Input(shape=(n_gram,), dtype='float32', name='type_input')

    # Character branch: embedding -> spatial dropout -> batch norm.
    a = Embedding(178, 32, input_length=n_gram)(input1)
    a = SpatialDropout1D(0.15)(a)
    char_input = BatchNormalization()(a)

    # (kernel_size, n_filters) for each parallel convolution.
    filters = [
        (1, 200), (2, 200), (3, 200), (4, 200), (5, 200),
        (6, 200), (8, 200), (11, 150), (12, 100),
    ]

    a_concat = []
    for window_size, filters_size in filters:
        convs = Conv1D(filters=filters_size, kernel_size=window_size, strides=1)(char_input)
        convs = Activation('elu')(convs)
        convs = TimeDistributed(Dense(5, input_shape=(n_gram, filters_size)))(convs)
        # The convolution shortens the sequence by window_size - 1 steps;
        # pad at the end so every branch is n_gram steps long again.
        convs = ZeroPadding1D(padding=(0, window_size - 1))(convs)
        a_concat.append(convs)
    token_max = Maximum()(a_concat)

    lstm_char = Bidirectional(
        LSTM(
            128,
            return_sequences=True,
            kernel_regularizer=regularizers.L2(0.0000001),
            bias_regularizer=regularizers.L2(0.0000001),
        )
    )(char_input)
    lstm_char = Dense(64, activation='elu')(lstm_char)

    # Character-type branch.
    b = Embedding(12, 12, input_length=n_gram)(input2)
    type_inputs = SpatialDropout1D(0.15)(b)

    x = Concatenate()([type_inputs, char_input, lstm_char, token_max])
    x = BatchNormalization()(x)

    x = Flatten()(x)
    x = Dense(100, activation='elu')(x)
    x = Dropout(0.2)(x)
    out = Dense(
        1,
        activation='sigmoid',
        dtype='float32',
        kernel_regularizer=regularizers.L2(0.01),
        bias_regularizer=regularizers.L2(0.01),
    )(x)

    model = Model(inputs=[input1, input2], outputs=out)

    return model


def create_feature_array(text, n_pad=21):
    """Encode every character of ``text`` as a fixed-size context window.

    The text is padded on both sides with ``int((n_pad - 1) / 2)`` spaces.
    For each original character the window is laid out as:

    1. the characters that follow it (nearest first),
    2. the characters that precede it (nearest first),
    3. the character itself.

    Each window is encoded twice: once through ``CHARS_MAP`` (unknown
    characters -> 179) and once through ``CHAR_TYPE_FLATTEN`` followed by
    ``CHAR_TYPES_MAP`` (unknown character -> type ``'o'``, unknown type -> 4).

    NOTE(review): the fallback character id 179 is outside the range of the
    ``Embedding(178, ...)`` layer declared in ``model_`` -- confirm the
    intended "unknown character" id against CHARS_MAP.json.

    Args:
        text: Sequence of characters to encode.
        n_pad: Nominal window width; half of ``n_pad - 1`` is taken on each
            side of the centre character.

    Returns:
        Tuple ``(x_char, x_type)`` of float NumPy arrays, one row per
        character of ``text``.
    """
    half = int((n_pad - 1) / 2)
    edge = [' '] * half
    padded = edge + list(text) + edge

    char_rows = []
    type_rows = []
    for pos in range(half, half + len(text)):
        following = padded[pos + 1: pos + half + 1]
        preceding = padded[pos - half: pos][::-1]
        window = following + preceding + [padded[pos]]

        char_rows.append([CHARS_MAP.get(ch, 179) for ch in window])
        type_rows.append([
            CHAR_TYPES_MAP.get(CHAR_TYPE_FLATTEN.get(ch, 'o'), 4)
            for ch in window
        ])

    x_char = np.array(char_rows).astype(float)
    x_type = np.array(type_rows).astype(float)
    return x_char, x_type

def tokenize(text):
    """Split ``text`` into tokens using the module-level ``model``.

    Each character is turned into a 21-wide feature window by
    ``create_feature_array`` and scored by ``model``. A character closes the
    current token when the score of the *next* character exceeds the
    threshold; the final character always closes a token.

    Args:
        text: The string to segment.

    Returns:
        A list of token strings whose concatenation is ``text``. Falsy input
        (empty string, ``None``) yields ``['']``.
    """
    if not text:
        return ['']

    window = 21

    # Legacy Python 2 path: byte strings are decoded before processing.
    if sys.version_info.major == 2 and isinstance(text, str):
        text = text.decode('utf-8')

    char_features, type_features = create_feature_array(text, n_pad=window)

    scores = model.predict([char_features, type_features], batch_size=512)
    flags = (scores.ravel() > 0.46542968749999997).astype(int)
    # Shift the flags left by one so that flag i describes the character
    # after position i, then force a cut after the last character.
    closes_token = flags[1:].tolist() + [1]

    tokens = []
    pending = []
    for ch, is_end in zip(text, closes_token):
        pending.append(ch)
        if is_end:
            tokens.append(''.join(pending))
            pending = []
    return tokens


# Build the network and restore its trained weights from the HDF5 file.
# ``tokenize`` reads this module-level ``model`` at call time, so the name
# must stay as is.
model = model_()
model.load_weights("cutto_tf2.h5")

# Streamlit UI: read the text to segment from a text area ...
text = st.text_area("Enter original text!")
# ... tokenize it (empty input yields ``['']``) ...
words = tokenize(text)

# ... and display the tokens separated by '|'.
st.write('|'.join(words))