import re
import nltk
# Downloading everything is heavy; the pipeline only needs the punkt
# tokenizer, stopwords, WordNet, and the averaged perceptron tagger data.
nltk.download('all')
import keras
import string
import pickle
import gradio as gr
import contractions
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from tensorflow.keras.layers import Layer
from tensorflow.keras import backend as K
from tensorflow.keras.preprocessing.sequence import pad_sequences


class Attention(Layer):
    """Simple additive attention over the timestep axis of an RNN output."""

    def __init__(self, return_sequences=True, **kwargs):
        self.return_sequences = return_sequences
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        # One scoring weight per feature and one bias per timestep.
        self.W = self.add_weight(name="att_weight", shape=(input_shape[-1], 1),
                                 initializer="normal")
        self.b = self.add_weight(name="att_bias", shape=(input_shape[1], 1),
                                 initializer="zeros")

        super(Attention, self).build(input_shape)

    def call(self, x):
        # Score each timestep, normalise the scores, and weight the inputs.
        e = K.tanh(K.dot(x, self.W) + self.b)
        a = K.softmax(e, axis=1)
        output = x * a

        if self.return_sequences:
            return output

        # Collapse the timestep axis into a single context vector.
        return K.sum(output, axis=1)
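
    # Not part of the original file: a minimal get_config sketch. If the saved
    # model stores a non-default return_sequences value in its config, this
    # override lets keras.models.load_model rebuild the layer with it.
    def get_config(self):
        config = super(Attention, self).get_config()
        config['return_sequences'] = self.return_sequences
        return config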
    


def load_tokenizer(path):
    with open(path, 'rb') as f:
        tokenizer = pickle.load(f)
    return tokenizer


def cleaning(text):
    # Punctuation symbols to remove
    exclude = string.punctuation

    # Expand contractions (e.g. "can't" -> "cannot") before lowercasing.
    def expand_contractions(text): return contractions.fix(text)
    text = expand_contractions(text)

    text = text.lower()

    # Strip @mentions.
    def remove_tags(text): return re.sub(r'@\w*', ' ', text)
    text = remove_tags(text)

    # Strip #hashtags.
    def remove_hashtags(text): return re.sub(r'#\w*', ' ', text)
    text = remove_hashtags(text)

    # Drop possessive "'s".
    def remove_apostrophe(text): return re.sub(r"'s\b", "", text)
    text = remove_apostrophe(text)

    # Replace anything that is not a letter, digit, or whitespace.
    def remove_special_chars(text): return re.sub(r"[^a-zA-Z0-9\s]", ' ', text)
    text = remove_special_chars(text)

    # Drop digits.
    def remove_number(text): return re.sub(r'\d', ' ', text)
    text = remove_number(text)

    # Redundant safety net: punctuation was already replaced above.
    def remove_punc(text): return ''.join([c for c in text if c not in exclude])
    text = remove_punc(text)

    # Collapse runs of whitespace into single spaces and trim the ends.
    def remove_extra_spaces(text): return re.sub(r'\s+', ' ', text).strip()
    text = remove_extra_spaces(text)
    
    def map_pos_tags(pos_tags):
    # Map NLTK POS tags to WordNet tags
        tag_map = {
            'N': wordnet.NOUN,
            'V': wordnet.VERB,
            'R': wordnet.ADV,
            'J': wordnet.ADJ
        }
        
        mapped_tags = []
        for token, tag in pos_tags:
            mapped_tag = tag[0].upper()
            if mapped_tag in tag_map:
                mapped_tag = tag_map[mapped_tag]
            else:
                mapped_tag = wordnet.NOUN  # Default to noun if no mapping found
            mapped_tags.append(mapped_tag)

        return mapped_tags
    
    def remove_stopwords(text):
        stop_words = set(stopwords.words('english'))
        tokens = word_tokenize(text)
        filtered_text = [word for word in tokens if word.lower() not in stop_words]
        return ' '.join(filtered_text)
    text = remove_stopwords(text)
    
    def pos_tag_and_lemmatize(text):
        tokens = word_tokenize(text)
        pos_tags = nltk.pos_tag(tokens)
        
        # Map POS tags to WordNet tags
        wordnet_tags = map_pos_tags(pos_tags)

        # Lemmatize based on POS tags
        lemmatizer = WordNetLemmatizer()
        lemmas = " ".join([lemmatizer.lemmatize(token, tag) for token, tag in zip(tokens, wordnet_tags)])

        return lemmas
    text = pos_tag_and_lemmatize(text)

    return text
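
# Illustrative only (not in the original file): for a raw tweet such as
# "@user I can't believe this is #awesome 100%", cleaning() yields roughly
# "believe awesome" after mention/hashtag removal, stopword filtering, and
# lemmatization; the exact tokens depend on the installed NLTK data.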


def label_tweet(test_review):
    # Convert the cleaned text to a padded token sequence; maxlen must match
    # the sequence length the model was trained with.
    token_list = tokenizer.texts_to_sequences([test_review])[0]
    token_list = pad_sequences([token_list], maxlen=44, padding='post')
    # The model outputs a single sigmoid probability; threshold it at 0.5.
    predicted = model.predict(token_list, verbose=0)
    if predicted[0][0] >= 0.5:
        return 1
    else:
        return 0


def analyze_text(comment):
    # Clean the raw comment, classify it, and map the label to a readable string.
    comment = cleaning(comment)
    result = label_tweet(comment)
    if result == 0:
        text = "Negative"
    else:
        text = "Positive"
    return text


# Reconstruct the trained model; the custom Attention layer must be
# registered via custom_objects so Keras can rebuild it.
model = keras.models.load_model("twitter_sentiment.keras",
                                custom_objects={'Attention': Attention})

# Load tokenizer
tokenizer = load_tokenizer('tokenizer.pkl')

# Build and launch the Gradio UI (uses the current gr.Textbox component
# instead of the deprecated gr.inputs namespace).
interface = gr.Interface(fn=analyze_text,
                         inputs=gr.Textbox(lines=2, placeholder="Enter the text here."),
                         outputs='text',
                         title='Twitter Sentiment Analysis')
interface.launch(inline=False)