import streamlit as st
from keras.layers import (LSTM, Dropout, Bidirectional, Dense, Embedding,
                          Flatten, Maximum, Activation, BatchNormalization,
                          SpatialDropout1D, Input, Concatenate, Conv1D,
                          TimeDistributed, ZeroPadding1D)
import keras.backend as K
from keras import regularizers
from keras import Model
import json
import numpy as np
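
# Lookup tables used by create_feature_array (roles inferred from usage below):
#   CHARS_MAP          character -> integer index for the character embedding
#   CHAR_TYPE_FLATTEN  character -> coarse character-type label ('o' = other)
#   CHAR_TYPES_MAP     character-type label -> integer index for the type embedding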

with open('CHAR_TYPES_MAP.json') as json_file:
    CHAR_TYPES_MAP = json.load(json_file)
with open('CHARS_MAP.json') as json_file:
    CHARS_MAP = json.load(json_file)
with open('CHAR_TYPE_FLATTEN.json') as json_file:
    CHAR_TYPE_FLATTEN = json.load(json_file)


class TimestepDropout(Dropout):
    """Dropout that zeroes whole timesteps (every feature of a character
    position at once). Unused in the active model, but kept for the
    commented-out experiments in model_ below."""

    def _get_noise_shape(self, inputs):
        input_shape = K.shape(inputs)
        # Broadcast one dropout decision across the feature axis.
        return (input_shape[0], input_shape[1], 1)

def model_(n_gram=21):
    # Two parallel inputs: character indices and character-type indices,
    # each an n_gram-wide window centred on the character being classified.
    input1 = Input(shape=(n_gram,), dtype='float32', name='char_input')
    input2 = Input(shape=(n_gram,), dtype='float32', name='type_input')

    a = Embedding(178, 32)(input1)
    a = SpatialDropout1D(0.15)(a)
    #a = TimestepDropout(0.05)(a)
    char_input = BatchNormalization()(a)

    # Multi-width 1-D convolutions over the character embeddings;
    # each entry is (window_size, number_of_filters).
    a_concat = []
    filters = [[1, 200], [2, 200], [3, 200], [4, 200], [5, 200], [6, 200],
               [8, 200], [11, 150], [12, 100]]
    #filters = [[1,200],[2,200],[3,200],[4,200],[5,200],[6,200],[7,200],[8,200],[9,150],[10,150],[11,150],[12,100]]

    for (window_size, filters_size) in filters:
        convs = Conv1D(filters=filters_size, kernel_size=window_size, strides=1)(char_input)
        convs = Activation('elu')(convs)
        convs = TimeDistributed(Dense(5))(convs)
        # Pad the end so every branch is back to n_gram timesteps before
        # the element-wise maximum across branches.
        convs = ZeroPadding1D(padding=(0, window_size - 1))(convs)
        a_concat.append(convs)
    token_max = Maximum()(a_concat)

    lstm_char = Bidirectional(LSTM(128, return_sequences=True,
                                   kernel_regularizer=regularizers.L2(1e-7),
                                   bias_regularizer=regularizers.L2(1e-7)))(char_input)
    lstm_char = Dense(64, activation='elu')(lstm_char)
    #lstm_char = Bidirectional(LSTM(64, return_sequences=True))(lstm_char)
    #lstm_char = Attention(return_sequences=True)(lstm_char)

    b = Embedding(12, 12)(input2)
    type_inputs = SpatialDropout1D(0.15)(b)
    #type_inputs = TimestepDropout(0.05)(b)

    # Fuse type embeddings, character embeddings, BiLSTM features and the
    # max-pooled convolution features, then score the centre character as
    # word boundary / not word boundary.
    x = Concatenate()([type_inputs, char_input, lstm_char, token_max])
    x = BatchNormalization()(x)
    x = Flatten()(x)
    x = Dense(100, activation='elu')(x)
    x = Dropout(0.2)(x)
    out = Dense(1, activation='sigmoid', dtype='float32',
                kernel_regularizer=regularizers.L2(0.01),
                bias_regularizer=regularizers.L2(0.01))(x)
    model = Model(inputs=[input1, input2], outputs=out)
    return model

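# Training is not part of this app. If you wanted to retrain or fine-tune,
# a minimal compile sketch would look like the following (optimizer, loss
# and metric are assumptions; the original training setup is not shown here):
#
#   model = model_()
#   model.compile(optimizer='adam', loss='binary_crossentropy',
#                 metrics=['accuracy'])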

def create_feature_array(text, n_pad=21):
    """Build the character-index and character-type windows for every
    character position in `text`."""
    n = len(text)
    n_pad_2 = (n_pad - 1) // 2
    # Pad both ends with spaces so edge characters still get full windows.
    text_pad = [' '] * n_pad_2 + list(text) + [' '] * n_pad_2
    x_char, x_type = [], []
    for i in range(n_pad_2, n_pad_2 + n):
        # Window layout: the n_pad_2 following characters, the n_pad_2
        # preceding characters in reverse order, then the centre character.
        char_list = text_pad[i + 1: i + n_pad_2 + 1] + \
                    list(reversed(text_pad[i - n_pad_2: i])) + \
                    [text_pad[i]]
        # Characters missing from the maps fall back to fixed default indices.
        char_map = [CHARS_MAP.get(c, 179) for c in char_list]
        char_type = [CHAR_TYPES_MAP.get(CHAR_TYPE_FLATTEN.get(c, 'o'), 4)
                     for c in char_list]
        x_char.append(char_map)
        x_type.append(char_type)
    x_char = np.array(x_char).astype(float)
    x_type = np.array(x_type).astype(float)
    return x_char, x_type
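
# Shape sketch (illustrative): each character position yields one window of
# n_pad indices, so for a 5-character string
#   x_char, x_type = create_feature_array('ทดสอบ', n_pad=21)
#   x_char.shape == (5, 21) and x_type.shape == (5, 21)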


def tokenize(text):
    """Segment `text` into a list of words using the globally loaded model."""
    n_pad = 21
    if not text:
        return ['']
    x_char, x_type = create_feature_array(text, n_pad=n_pad)
    y_predict = model.predict([x_char, x_type], batch_size=512)
    # Binarise the per-character scores with the tuned decision threshold.
    y_predict = (y_predict.ravel() > 0.46542968749999997).astype(int)
    # Shift by one so each flag marks the last character of a word
    # (character i ends a word when character i+1 starts one), and force
    # the final character to close the last word.
    word_end = y_predict[1:].tolist() + [1]
    tokens = []
    word = ''
    for char, w_e in zip(text, word_end):
        word += char
        if w_e:
            tokens.append(word)
            word = ''
    return tokens
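
# Minimal usage sketch (hypothetical output shown; `model` must be built and
# its weights loaded, as done below, before tokenize is called):
#   >>> tokenize('ตัดคำภาษาไทย')
#   ['ตัด', 'คำ', 'ภาษา', 'ไทย']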

model = model_()
model.load_weights("cutto_tf2.h5")
st.title("Cutto Thai word segmentation.")
text = st.text_area("Enter original text!")
if st.button("cut it!!"):
    if text:
        words = tokenize(text)
        st.subheader("Answer:")
        st.write('|'.join(words))
    else:
        st.warning("Please enter some text to seggmentation")

multi = '''### Score
Model performance was evaluated on a held-out test set comprising 10 percent of the BEST 2009 corpus:
- F1-score: 98.37
- Precision: 98.02
- Recall: 98.67

### Resource Funding
We thank the NSTDA Supercomputer Center (ThaiSC) and the National e-Science Infrastructure Consortium for their support of computing facilities.

### Citation
If you use cutto in your project or publication, please cite the model as follows:
'''
st.markdown(multi)
st.code(f"""
ปรีชานนท์ ชาติไทย และ สัจจวัจน์ ส่งเสริม. (2567),
การสรุปข้อความข่าวภาษาไทยด้วยโครงข่ายประสาทเทียม (Thai News Text Summarization Using Neural Network),
วิทยาศาสตรบัณฑิต (วทบ.):ขอนแก่น, มหาวิทยาลัยขอนแก่น)
""")