Modfiededition committed
Commit: 0c3ea8f
Parent: 2427070

Update app.py

Files changed (1): app.py (+99, -4)
app.py CHANGED
@@ -12,11 +12,106 @@ st.title("Tweet Sentiment Extractor...")
 textbox = st.text_area('Write your text in this box:', '', height=100, max_chars=500)
 option = st.selectbox(
     'How would you like to be contacted?',
-    ('Email', 'Home phone', 'Mobile phone'))
-
-st.write('You selected:', option)
-
-#MAX_LENGTH = 105
-
-#dataset = Dataset.from_dict(X)
+    ('positive', 'negative', 'neutral'))
+
+# Dataset.from_dict expects column values as lists, so wrap the single example.
+python_dict = {"text": [textbox], "sentiment": [option]}
+
+dataset = Dataset.from_dict(python_dict)
+
+MAX_LENGTH = 105
+
+# Load the saved roberta-base tokenizer to turn the text into input IDs the model can make sense of.
+model_checkpoint = "Modfiededition/roberta-fine-tuned-tweet-sentiment-extractor"
+
+# Cache the tokenizer and model across Streamlit reruns.
+@st.cache(allow_output_mutation=True, suppress_st_warning=True)
+def load_tokenizer():
+    return AutoTokenizer.from_pretrained(model_checkpoint)
+tokenizer = load_tokenizer()
+
+@st.cache(allow_output_mutation=True, suppress_st_warning=True)
+def load_model():
+    return TFAutoModelForQuestionAnswering.from_pretrained(model_checkpoint)
+model = load_model()
+
+def process_data(examples):
+    questions = examples["sentiment"]
+    context = examples["text"]
+    # Tokenize the sentiment (question) and the tweet (context) as a SQuAD-style pair.
+    inputs = tokenizer(
+        questions,
+        context,
+        max_length=MAX_LENGTH,
+        truncation="only_second",
+        padding="max_length",
+        return_offsets_mapping=True,
+    )
+    # Assign None to the offset mapping of every token that is not a context token.
+    for i in range(len(inputs["input_ids"])):
+        offset = inputs["offset_mapping"][i]
+        sequence_ids = inputs.sequence_ids(i)
+        inputs["offset_mapping"][i] = [
+            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
+        ]
+    return inputs
+
+processed_raw_data = dataset.map(
+    process_data,
+    batched=True,
+)
+tf_raw_dataset = processed_raw_data.to_tf_dataset(
+    columns=["input_ids", "attention_mask"],
+    shuffle=False,
+    batch_size=1,
+)
+
+# Final predictions.
+outputs = model.predict(tf_raw_dataset)
+start_logits = outputs.start_logits
+end_logits = outputs.end_logits
+
+# Post-processing: use start_logits and end_logits to extract the final answer span from the context.
+n_best = 20
+
+def predict_answers(inputs):
+    predicted_answer = []
+    for i in range(len(inputs["offset_mapping"])):
+        start_logit = inputs["start_logits"][i]
+        end_logit = inputs["end_logits"][i]
+        context = inputs["text"][i]
+        offset = inputs["offset_mapping"][i]
+        # Consider the n_best highest-scoring start and end positions.
+        start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
+        end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
+
+        flag = False
+        for start_index in start_indexes:
+            for end_index in end_indexes:
+                # Skip answers that are not fully inside the context.
+                if offset[start_index] is None or offset[end_index] is None:
+                    continue
+                # Skip answers with negative length.
+                if end_index < start_index:
+                    continue
+                flag = True
+                answer = context[offset[start_index][0] : offset[end_index][1]]
+                predicted_answer.append(answer)
+                break
+            if flag:
+                break
+        # Fall back to the whole context when no valid span is found.
+        if not flag:
+            predicted_answer.append(context)
+    return {"predicted_answer": predicted_answer}
+
+processed_raw_data.set_format("pandas")
+
+processed_raw_df = processed_raw_data[:]
+processed_raw_df["start_logits"] = start_logits.tolist()
+processed_raw_df["end_logits"] = end_logits.tolist()
+processed_raw_df["text"] = dataset["text"]
+
+final_data = Dataset.from_pandas(processed_raw_df)
+final_data = final_data.map(predict_answers, batched=True)
+
+st.markdown("## " + final_data["predicted_answer"][0])
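
For reference, a minimal standalone sketch (not part of the commit) of what the offset-mapping masking in process_data produces, assuming the stock roberta-base fast tokenizer; the variable names here are illustrative:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("roberta-base")
enc = tok("positive", "I love this new phone", return_offsets_mapping=True)

# sequence_ids(): None for special tokens, 0 for question tokens, 1 for context tokens.
seq_ids = enc.sequence_ids(0)

# Keep character offsets only for context tokens, as process_data does.
masked = [o if s == 1 else None for s, o in zip(seq_ids, enc["offset_mapping"])]
print(masked)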
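And a toy, self-contained illustration of the span selection done in predict_answers. The logits and offsets below are made up for illustration; note that the commit takes the first valid (start, end) pair in n-best order, while this sketch also scores pairs by start_logit + end_logit, the more common recipe:

import numpy as np

context = "I love this new phone"
# Pretend offsets for 6 tokens; offset[i] = (char_start, char_end), None = not a context token.
offsets = [(0, 1), (2, 6), (7, 11), (12, 15), (16, 21), None]
start_logits = np.array([0.1, 3.2, 0.3, 0.2, 0.1, 0.0])
end_logits = np.array([0.0, 0.2, 0.1, 0.3, 2.9, 0.1])

n_best = 3
start_indexes = np.argsort(start_logits)[-1 : -n_best - 1 : -1]
end_indexes = np.argsort(end_logits)[-1 : -n_best - 1 : -1]

best = None
for s in start_indexes:
    for e in end_indexes:
        if offsets[s] is None or offsets[e] is None or e < s:
            continue  # invalid span
        score = start_logits[s] + end_logits[e]
        if best is None or score > best[0]:
            best = (score, offsets[s][0], offsets[e][1])

if best is not None:
    print(context[best[1]:best[2]])  # -> "love this new phone"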