File size: 5,524 Bytes
455ab40
 
272f043
 
 
db551a2
73f115b
 
0c3ea8f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77f8a57
 
0c3ea8f
77f8a57
 
 
aad5244
77f8a57
 
8ad5809
0c3ea8f
77f8a57
 
 
0c3ea8f
88924af
39ab779
c4d34b8
 
 
 
 
39ab779
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77f8a57
39ab779
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77f8a57
39ab779
 
 
 
 
 
 
 
 
 
 
77f8a57
39ab779
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from transformers import AutoTokenizer
from transformers import TFAutoModelForQuestionAnswering
from datasets import Dataset
import streamlit as st




# Checkpoint of the RoBERTa model fine-tuned for tweet-sentiment span extraction.
model_checkpoint = "Modfiededition/roberta-fine-tuned-tweet-sentiment-extractor"

@st.cache(allow_output_mutation=True, suppress_st_warning=True)
def load_tokenizer():
    """Load (and cache across reruns) the tokenizer for the fine-tuned checkpoint."""
    return AutoTokenizer.from_pretrained(model_checkpoint)
tokenizer = load_tokenizer()

@st.cache(allow_output_mutation=True, suppress_st_warning=True)
def load_model():
    """Load (and cache across reruns) the TF question-answering model."""
    return TFAutoModelForQuestionAnswering.from_pretrained(model_checkpoint)
model = load_model()


#prompts
# Page title.
st.title("Tweet Sentiment Extractor...")

# take text/tweet input (capped at 500 characters) and the target sentiment.
textbox = st.text_area('Write your text in this box:', '',height=100,  max_chars=500 )
option = st.selectbox(
     'Pick the sentiment',
         ('positive', 'negative', 'neutral'))

# Wrap the single (text, sentiment) pair as one-element columns so it can be
# turned into a HuggingFace Dataset.
python_dict = {"text":[textbox], "sentiment":[option]}

dataset = Dataset.from_dict(python_dict)

# Tokenized sequence length used when padding/tokenizing below.
MAX_LENGTH = 105

# Clicking triggers the extraction pipeline further down the script.
button = st.button('Click here to extract the word/phrase from the text with the given sentiment: {0}..'.format(option))


if button:
    if not textbox:
        st.markdown("#### " + "Please write something in the above textbox..")
    else:
        with st.spinner('In progress.......'):

            def process_data(examples):
                """Tokenize (sentiment, text) pairs as a QA-style input.

                Offset mappings for every token that is NOT part of the
                context (sequence id != 1) are replaced with None, so the
                answer span can only be drawn from the tweet text itself.
                """
                questions = examples["sentiment"]
                context = examples["text"]
                inputs = tokenizer(
                    questions,
                    context,
                    max_length=MAX_LENGTH,
                    # FIX: the original omitted truncation, so max_length was
                    # never enforced on long inputs (textbox allows 500 chars).
                    truncation=True,
                    padding="max_length",
                    return_offsets_mapping=True,
                )
                # Null out offsets of question/special tokens.
                for i in range(len(inputs["input_ids"])):
                    offset = inputs["offset_mapping"][i]
                    sequence_ids = inputs.sequence_ids(i)
                    inputs["offset_mapping"][i] = [
                        o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
                    ]
                return inputs

            processed_raw_data = dataset.map(
                process_data,
                batched=True,
            )
            tf_raw_dataset = processed_raw_data.to_tf_dataset(
                columns=["input_ids", "attention_mask"],
                shuffle=False,
                batch_size=1,
            )

            # Final predictions: per-token start/end scores for the answer span.
            outputs = model.predict(tf_raw_dataset)
            start_logits = outputs.start_logits
            end_logits = outputs.end_logits

            # Post-processing: consider the n_best highest-scoring start and
            # end positions and pick the first valid combination.
            n_best = 20

            def predict_answers(inputs):
                """Select the best valid (start, end) span per example.

                A span is valid when both endpoints lie inside the context
                (offset is not None) and end >= start. Falls back to the
                whole context when no valid span is found.
                """
                predicted_answer = []
                for i in range(len(inputs["offset_mapping"])):
                    start_logit = inputs["start_logits"][i]
                    end_logit = inputs["end_logits"][i]
                    context = inputs["text"][i]
                    offset = inputs["offset_mapping"][i]
                    # Indexes of the n_best largest logits, best first.
                    start_indexes = np.argsort(start_logit)[-1: -n_best - 1: -1].tolist()
                    end_indexes = np.argsort(end_logit)[-1: -n_best - 1: -1].tolist()

                    answer = None
                    for start_index in start_indexes:
                        for end_index in end_indexes:
                            # Skip answers that are not in the context.
                            if offset[start_index] is None or offset[end_index] is None:
                                continue
                            # Skip spans with negative length.
                            if end_index < start_index:
                                continue
                            answer = context[offset[start_index][0]: offset[end_index][1]]
                            break
                        if answer is not None:
                            break
                    # BUG FIX: the original appended the local `answer` here
                    # even when no valid span was found, which raised
                    # NameError on the first example (unbound) or reused a
                    # stale answer from a previous example. Fall back to the
                    # full context instead.
                    predicted_answer.append(answer if answer is not None else context)
                return {"predicted_answer": predicted_answer}

            # Join the logits back onto the tokenized data via pandas, then
            # run span selection with Dataset.map.
            processed_raw_data.set_format("pandas")

            processed_raw_df = processed_raw_data[:]
            processed_raw_df["start_logits"] = start_logits.tolist()
            processed_raw_df["end_logits"] = end_logits.tolist()
            processed_raw_df["text"] = python_dict["text"]

            final_data = Dataset.from_pandas(processed_raw_df)
            final_data = final_data.map(predict_answers, batched=True)

        # Show the extracted word/phrase once the spinner completes.
        st.markdown("## " + final_data["predicted_answer"][0])