Modfiededition committed on
Commit
77f8a57
1 Parent(s): 881c1d5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +97 -90
app.py CHANGED
@@ -5,21 +5,7 @@ from transformers import TFAutoModelForQuestionAnswering
5
  from datasets import Dataset
6
  import streamlit as st
7
 
8
- #prompts
9
- st.title("Tweet Sentiment Extractor...")
10
-
11
- # take text/tweet input
12
- textbox = st.text_area('Write your text in this box:', '',height=100, max_chars=500 )
13
- option = st.selectbox(
14
- 'How would you like to be contacted?',
15
- ('positive', 'negative', 'neutral'))
16
-
17
-
18
- python_dict = {"text":textbox, "sentiment":option}
19
 
20
- dataset = Dataset.from_dict(python_dict)
21
-
22
- MAX_LENGTH = 105
23
 
24
 
25
  # loading saved roberta-base tokenizer to tokenize the text into input IDs that model can make sense of.
@@ -35,83 +21,104 @@ def load_model():
35
  return TFAutoModelForQuestionAnswering.from_pretrained(model_checkpoint)
36
  model = load_model()
37
 
38
- def process_data(examples):
39
- questions = examples["sentiment"]
40
- context = examples["text"]
41
- inputs = tokenizer(
42
- questions,
43
- context,
44
- max_length = MAX_LENGTH,
45
- padding="max_length",
46
- return_offsets_mapping = True,
47
- )
48
- # Assigning None values to all offset mapping of tokens which are not the context tokens.
49
- for i in range(len(inputs["input_ids"])):
50
- offset = inputs["offset_mapping"][i]
51
- sequence_ids = inputs.sequence_ids(i)
52
- inputs["offset_mapping"][i] = [
53
- o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
54
- ]
55
- return inputs
56
-
57
- processed_raw_data = dataset.map(
58
- process_data,
59
- batched = True
60
- )
61
- tf_raw_dataset = processed_raw_data.to_tf_dataset(
62
- columns=["input_ids", "attention_mask"],
63
- shuffle=False,
64
- batch_size=1,
65
- )
66
 
67
- # final predictions.
68
- outputs = model.predict(tf_raw_dataset)
69
- start_logits = outputs.start_logits
70
- end_logits = outputs.end_logits
71
-
72
- # Post Processing.
73
- # Using start_logits and end_logits to generate the final answer from the given context.
74
- n_best = 20
75
 
76
- def predict_answers(inputs):
77
- predicted_answer = []
78
- for i in range(len(inputs["offset_mapping"])):
79
- start_logit = inputs["start_logits"][i]
80
- end_logit = inputs["end_logits"][i]
81
- context = inputs["text"][i]
82
- offset = inputs["offset_mapping"][i]
83
- start_indexes = np.argsort(start_logit)[-1: -n_best - 1:-1].tolist()
84
- end_indexes = np.argsort(end_logit)[-1: -n_best - 1: -1].tolist()
85
 
86
- flag = False
87
- for start_index in start_indexes:
88
- for end_index in end_indexes:
89
- # skip answer that are not in the context.
90
- if offset[start_index] is None or offset[end_index] is None:
91
- continue
92
- # skip answer with length that is either < 0
93
- if end_index < start_index:
94
- continue
95
- flag = True
96
- answer = context[offset[start_index][0]: offset[end_index][1]]
97
- predicted_answer.append(answer)
98
- break
99
- if flag:
100
- break
101
- if not flag:
102
- predicted_answer.append(answer)
103
- return {"predicted_answer":predicted_answer}
104
-
105
- processed_raw_data.set_format("pandas")
106
-
107
- processed_raw_df = processed_raw_data[:]
108
- processed_raw_df["start_logits"] = start_logits.tolist()
109
- processed_raw_df["end_logits"] = end_logits.tolist()
110
- processed_raw_df["text"] = X["text"]
111
-
112
- final_data = Dataset.from_pandas(processed_raw_df)
113
- final_data = final_data.map(predict_answers,batched=True)
114
-
115
-
116
 
117
- st.markdown("## " +final_data["predicted_answer"] )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  from datasets import Dataset
6
  import streamlit as st
7
 
 
 
 
 
 
 
 
 
 
 
 
8
 
 
 
 
9
 
10
 
11
  # loading saved roberta-base tokenizer to tokenize the text into input IDs that model can make sense of.
 
21
  return TFAutoModelForQuestionAnswering.from_pretrained(model_checkpoint)
22
  model = load_model()
23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
# Page title.
st.title("Tweet Sentiment Extractor...")

# Collect the tweet text and the sentiment whose supporting phrase should be
# extracted from it.
textbox = st.text_area('Write your text in this box:', '', height=100, max_chars=500)
option = st.selectbox(
    # The previous label ("How would you like to be contacted?") was unrelated
    # to the sentiment choices offered below.
    'Select the sentiment to extract:',
    ('positive', 'negative', 'neutral'))

# Dataset.from_dict expects every column to be a sequence of rows, so the
# single text/sentiment pair is wrapped in one-element lists.
python_dict = {"text": [textbox], "sentiment": [option]}

dataset = Dataset.from_dict(python_dict)

# Length, in tokens, that each tokenized (sentiment, text) pair is padded to.
MAX_LENGTH = 105
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
 
41
+ button = st.button('Extract text of the given sentiment..')
42
+ if button:
43
+ with st.spinner('In progress.......'):
44
+
45
def process_data(examples):
    """Tokenize a batch of (sentiment, text) pairs for the QA model.

    The sentiment is passed to the tokenizer as the first sequence and the
    text as the second; every encoding is padded to MAX_LENGTH and carries
    its offset mapping.

    Returns the tokenizer output, with the offset mapping of every token that
    does not belong to the second sequence (the text) replaced by None.
    """
    encoded = tokenizer(
        examples["sentiment"],
        examples["text"],
        max_length=MAX_LENGTH,
        padding="max_length",
        return_offsets_mapping=True,
    )
    # Keep character offsets only for tokens of the text (sequence id 1) so
    # that later span selection can recognise and skip everything else.
    for row, offsets in enumerate(encoded["offset_mapping"]):
        seq_ids = encoded.sequence_ids(row)
        encoded["offset_mapping"][row] = [
            span if seq_id == 1 else None
            for span, seq_id in zip(offsets, seq_ids)
        ]
    return encoded
63
+
64
# Tokenize the dataset in batches.
processed_raw_data = dataset.map(process_data, batched=True)

# Feed only the model inputs to TensorFlow, one example per batch, in order.
tf_raw_dataset = processed_raw_data.to_tf_dataset(
    columns=["input_ids", "attention_mask"],
    batch_size=1,
    shuffle=False,
)

# Final predictions: per-token start and end logits.
outputs = model.predict(tf_raw_dataset)
start_logits, end_logits = outputs.start_logits, outputs.end_logits

# Post-processing: the start/end logits are turned into an answer span taken
# from the text; only the n_best highest-scoring start and end positions are
# considered.
n_best = 20
82
+
83
def predict_answers(inputs):
    """Pick the answer span for every example in a batch.

    ``inputs`` is a batch (mapping of column name to list) providing
    "start_logits", "end_logits", "text" and "offset_mapping". For each
    example the ``n_best`` highest-scoring start and end positions are tried
    in descending score order (start positions in the outer loop), and the
    first pair that lies inside the text and does not end before it starts is
    used to slice the answer out of the text.

    Returns ``{"predicted_answer": [...]}`` with one string per example. When
    no candidate pair is valid the answer is the empty string; previously the
    fallback reused the prior example's answer, or raised because ``answer``
    was unbound for the first example.
    """
    predicted_answer = []
    for start_logit, end_logit, context, offset in zip(
        inputs["start_logits"],
        inputs["end_logits"],
        inputs["text"],
        inputs["offset_mapping"],
    ):
        # Indices of the n_best largest logits, best first.
        start_indexes = np.argsort(start_logit)[-1:-n_best - 1:-1].tolist()
        end_indexes = np.argsort(end_logit)[-1:-n_best - 1:-1].tolist()

        answer = next(
            (
                context[offset[start_index][0]:offset[end_index][1]]
                for start_index in start_indexes
                for end_index in end_indexes
                # Both ends must fall on text tokens (offset not None) ...
                if offset[start_index] is not None
                and offset[end_index] is not None
                # ... and the span must not have negative length.
                and end_index >= start_index
            ),
            "",  # no valid span among the candidates
        )
        predicted_answer.append(answer)
    return {"predicted_answer": predicted_answer}
111
+
112
# Switch to pandas so the logits can be attached as extra columns.
processed_raw_data.set_format("pandas")

processed_raw_df = processed_raw_data[:]
processed_raw_df["start_logits"] = start_logits.tolist()
processed_raw_df["end_logits"] = end_logits.tolist()
# The original text comes from the input dataset; the previously referenced
# name `X` is not defined anywhere in this script.
processed_raw_df["text"] = dataset["text"]

final_data = Dataset.from_pandas(processed_raw_df)
final_data = final_data.map(predict_answers, batched=True)

# "predicted_answer" is a column (one entry per example). Only one text is
# submitted at a time, so show its single answer; concatenating the whole
# list to a string would raise TypeError.
st.markdown("## " + final_data["predicted_answer"][0])