MadhuP committed on
Commit
ee0bad3
·
1 Parent(s): 70f7158

Update app.py

Browse files

Add per-word classification feature

Files changed (1) hide show
  1. app.py +116 -70
app.py CHANGED
@@ -13,9 +13,7 @@ import nltk
13
 
14
  def check_by_url(txt_url):
15
  parsed_url = urlparse(txt_url)
16
- url = (
17
- f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path.rsplit('/', 1)[0]}/"
18
- )
19
  print(url)
20
 
21
  new_data = []
@@ -28,9 +26,7 @@ def check_by_url(txt_url):
28
  punctuationfree = "".join([i for i in title if i not in string.punctuation])
29
  return punctuationfree
30
 
31
- css_class_to_remove = (
32
- "dp-highlighter" # Replace with the CSS class you want to remove
33
- )
34
  # Find <div> tags with the specified CSS class and remove their content
35
  div_tags = soup.find_all(["code", "pre"])
36
  for div_tag in div_tags:
@@ -55,38 +51,30 @@ def check_by_url(txt_url):
55
  content_without_style += p_content
56
 
57
  # Replace Unicode characters in the content and remove duplicates
58
- normalized_content_with_style = re.sub(
59
- r"\s+", " ", content_with_style
60
- ) # Remove extra spaces
61
- normalized_content_with_style = normalized_content_with_style.replace(
62
- "\r", ""
63
- ) # Replace '\r' characters
64
- normalized_content_with_style = unicodedata.normalize(
65
- "NFKD", normalized_content_with_style
66
- )
67
  normalized_content_with_style = unidecode.unidecode(normalized_content_with_style)
68
 
69
- normalized_content_without_style = re.sub(
70
- r"\s+", " ", content_without_style
71
- ) # Remove extra spaces
72
- normalized_content_without_style = normalized_content_without_style.replace(
73
- "\r", ""
74
- ) # Replace '\r' characters
75
- normalized_content_without_style = unicodedata.normalize(
76
- "NFKD", normalized_content_without_style
77
- )
78
- normalized_content_without_style = unidecode.unidecode(
79
- normalized_content_without_style
80
- )
81
 
82
  normalized_content_with_style += normalized_content_without_style
83
  new_data = {"title": title, "content": normalized_content_with_style}
 
84
 
85
- model = DistilBertForSequenceClassification.from_pretrained(".")
86
- tokenizer = DistilBertTokenizer.from_pretrained(".")
87
 
 
88
  test_encodings = tokenizer.encode_plus(
89
- title, truncation=True, padding=True, max_length=512, return_tensors="pt"
 
 
 
 
90
  )
91
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
92
  test_input_ids = test_encodings["input_ids"].to(device)
@@ -98,9 +86,9 @@ def check_by_url(txt_url):
98
  logits = outputs.logits
99
  predicted_labels = torch.argmax(logits, dim=1)
100
  probabilities = F.softmax(logits, dim=1)
101
- confidence_score_title = torch.max(probabilities, dim=1).values.tolist()
102
- predicted_label_title = predicted_labels.item()
103
-
104
  test_encodings = tokenizer.encode_plus(
105
  normalized_content_with_style,
106
  truncation=True,
@@ -116,11 +104,7 @@ def check_by_url(txt_url):
116
  predicted_labels = torch.argmax(logits, dim=1)
117
  probabilities = F.softmax(logits, dim=1)
118
  confidence_scores_content = torch.max(probabilities, dim=1).values.tolist()
119
- predicted_label_content = predicted_labels.item()
120
-
121
- label_mapping = {1: "SFW", 0: "NSFW"} # 1:True 0:false
122
- predicted_label_title = label_mapping[predicted_label_title]
123
- predicted_label_content = label_mapping[predicted_label_content]
124
 
125
  return (
126
  predicted_label_title,
@@ -128,35 +112,35 @@ def check_by_url(txt_url):
128
  predicted_label_content,
129
  confidence_scores_content,
130
  new_data,
 
131
  )
132
 
133
-
134
  label_mapping = {1: "SFW", 0: "NSFW"} # 1:True 0:false
135
-
136
-
137
  def predict_2(txt_url, normalized_content_with_style):
138
- (
139
  predicted_label_title,
140
  confidence_score_title,
141
  predicted_label_content,
142
  confidence_scores_content,
143
- new_data,
144
  ) = (None, None, None, None, None)
145
- predicted_label_text, confidence_score_text = None, None
146
 
147
- if txt_url.startswith("http://") or txt_url.startswith("https://"):
148
- (
 
 
149
  predicted_label_title,
150
  confidence_score_title,
151
  predicted_label_content,
152
  confidence_scores_content,
153
  new_data,
154
- ) = check_by_url(txt_url)
155
- elif txt_url.startswith(""):
156
- model = DistilBertForSequenceClassification.from_pretrained(".")
157
- tokenizer = DistilBertTokenizer.from_pretrained(".")
158
 
159
- test_encodings = tokenizer.encode_plus(
160
  normalized_content_with_style,
161
  truncation=True,
162
  padding=True,
@@ -164,11 +148,11 @@ def predict_2(txt_url, normalized_content_with_style):
164
  return_tensors="pt",
165
  )
166
 
167
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
168
- test_input_ids = test_encodings["input_ids"].to(device)
169
- test_attention_mask = test_encodings["attention_mask"].to(device)
170
 
171
- with torch.no_grad():
172
  model = model.to(device)
173
  model.eval()
174
  outputs = model(test_input_ids, attention_mask=test_attention_mask)
@@ -178,22 +162,83 @@ def predict_2(txt_url, normalized_content_with_style):
178
  confidence_score_text = torch.max(probabilities, dim=1).values.tolist()
179
  predicted_label_text = label_mapping[predicted_labels.item()]
180
 
181
- else:
182
- print("Done")
 
 
 
 
 
 
 
 
183
 
184
- return (
185
- predicted_label_title,
186
- confidence_score_title,
187
- predicted_label_content,
188
- confidence_scores_content,
189
- new_data,
190
- predicted_label_text,
191
- confidence_score_text,
192
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
 
194
 
195
  demo = gr.Interface(
196
- fn=predict_2,
197
  inputs=[
198
  gr.inputs.Textbox(label="URL", placeholder="Enter URL"),
199
  gr.inputs.Textbox(label="Text", placeholder="Enter Text"),
@@ -204,9 +249,10 @@ demo = gr.Interface(
204
  gr.outputs.Textbox(label="Content_prediction"),
205
  gr.outputs.Textbox(label="Content_confidence_score"),
206
  gr.outputs.Textbox(label="Description").style(show_copy_button=True),
207
- gr.outputs.Textbox(label="Text_prediction"),
208
  gr.outputs.Textbox(label="Text_confidence_score"),
 
209
  ],
210
- )
211
 
212
- demo.launch()
 
13
 
14
  def check_by_url(txt_url):
15
  parsed_url = urlparse(txt_url)
16
+ url = (f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path.rsplit('/', 1)[0]}/")
 
 
17
  print(url)
18
 
19
  new_data = []
 
26
  punctuationfree = "".join([i for i in title if i not in string.punctuation])
27
  return punctuationfree
28
 
29
+ css_class_to_remove = ("dp-highlighter") # Replace with the CSS class you want to remove
 
 
30
  # Find <div> tags with the specified CSS class and remove their content
31
  div_tags = soup.find_all(["code", "pre"])
32
  for div_tag in div_tags:
 
51
  content_without_style += p_content
52
 
53
  # Replace Unicode characters in the content and remove duplicates
54
+ normalized_content_with_style = re.sub(r"\s+", " ", content_with_style) # Remove extra spaces
55
+ normalized_content_with_style = normalized_content_with_style.replace("\r", "") # Replace '\r' characters
56
+ normalized_content_with_style = unicodedata.normalize("NFKD", normalized_content_with_style)
 
 
 
 
 
 
57
  normalized_content_with_style = unidecode.unidecode(normalized_content_with_style)
58
 
59
+ normalized_content_without_style = re.sub(r"\s+", " ", content_without_style) # Remove extra spaces
60
+ normalized_content_without_style = normalized_content_without_style.replace("\r", "") # Replace '\r' characters
61
+ normalized_content_without_style = unicodedata.normalize("NFKD", normalized_content_without_style)
62
+ normalized_content_without_style = unidecode.unidecode(normalized_content_without_style)
 
 
 
 
 
 
 
 
63
 
64
  normalized_content_with_style += normalized_content_without_style
65
  new_data = {"title": title, "content": normalized_content_with_style}
66
+ # return new_data
67
 
68
+ model = DistilBertForSequenceClassification.from_pretrained("/content/LoadModel")
69
+ tokenizer = DistilBertTokenizer.from_pretrained("/content/LoadModel")
70
 
71
+ label_mapping = {1: "SFW", 0: "NSFW"}
72
  test_encodings = tokenizer.encode_plus(
73
+ title,
74
+ truncation=True,
75
+ padding=True,
76
+ max_length=512,
77
+ return_tensors="pt"
78
  )
79
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
80
  test_input_ids = test_encodings["input_ids"].to(device)
 
86
  logits = outputs.logits
87
  predicted_labels = torch.argmax(logits, dim=1)
88
  probabilities = F.softmax(logits, dim=1)
89
+ confidence_score_title = torch.max(probabilities, dim=1).values.tolist()
90
+ predicted_label_title = label_mapping[predicted_labels.item()]
91
+
92
  test_encodings = tokenizer.encode_plus(
93
  normalized_content_with_style,
94
  truncation=True,
 
104
  predicted_labels = torch.argmax(logits, dim=1)
105
  probabilities = F.softmax(logits, dim=1)
106
  confidence_scores_content = torch.max(probabilities, dim=1).values.tolist()
107
+ predicted_label_content = label_mapping[predicted_labels.item()]
 
 
 
 
108
 
109
  return (
110
  predicted_label_title,
 
112
  predicted_label_content,
113
  confidence_scores_content,
114
  new_data,
115
+ #new1,
116
  )
117
 
 
118
  label_mapping = {1: "SFW", 0: "NSFW"} # 1:True 0:false
119
+
 
120
  def predict_2(txt_url, normalized_content_with_style):
121
+ (
122
  predicted_label_title,
123
  confidence_score_title,
124
  predicted_label_content,
125
  confidence_scores_content,
126
+ new_data,
127
  ) = (None, None, None, None, None)
 
128
 
129
+ predicted_label_text, confidence_score_text = None, None
130
+
131
+ if txt_url.startswith("http://") or txt_url.startswith("https://"):
132
+ (
133
  predicted_label_title,
134
  confidence_score_title,
135
  predicted_label_content,
136
  confidence_scores_content,
137
  new_data,
138
+ ) = check_by_url(txt_url)
139
+ elif txt_url.startswith(""):
140
+ model = DistilBertForSequenceClassification.from_pretrained("/content/LoadModel")
141
+ tokenizer = DistilBertTokenizer.from_pretrained("/content/LoadModel")
142
 
143
+ test_encodings = tokenizer.encode_plus(
144
  normalized_content_with_style,
145
  truncation=True,
146
  padding=True,
 
148
  return_tensors="pt",
149
  )
150
 
151
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
152
+ test_input_ids = test_encodings["input_ids"].to(device)
153
+ test_attention_mask = test_encodings["attention_mask"].to(device)
154
 
155
+ with torch.no_grad():
156
  model = model.to(device)
157
  model.eval()
158
  outputs = model(test_input_ids, attention_mask=test_attention_mask)
 
162
  confidence_score_text = torch.max(probabilities, dim=1).values.tolist()
163
  predicted_label_text = label_mapping[predicted_labels.item()]
164
 
165
+ return (
166
+ predicted_label_title,
167
+ confidence_score_title,
168
+ predicted_label_content,
169
+ confidence_scores_content,
170
+ new_data,
171
+ predicted_label_text,
172
+ confidence_score_text,
173
+ #new,
174
+ )
175
 
176
def word_by_word(txt_url, normalized_content_with_style):
    """Run the whole-input prediction, then list the individual words scored NSFW.

    The first seven values are exactly what ``predict_2`` returns for the same
    arguments.  Per-word scoring only happens when ``predict_2`` labelled the
    page content or the free text as ``"NSFW"``.

    Args:
        txt_url: Text of the "URL" input.  When it starts with ``http://`` or
            ``https://`` the words come from ``new_data["content"]``;
            otherwise they come from ``normalized_content_with_style``.
        normalized_content_with_style: Text of the "Text" input.

    Returns:
        An 8-tuple ``(predicted_label_title, confidence_score_title,
        predicted_label_content, confidence_scores_content, new_data,
        predicted_label_text, confidence_score_text, new_word)``.
        ``new_word`` is always a JSON string: a list of
        ``{"Word", "Label", "Confidence"}`` objects, one per word whose
        predicted label is ``"NSFW"`` (``"[]"`` when nothing was flagged).
    """
    # NOTE(review): the original wrapped everything in
    # `startswith("http://") or startswith("https://") or startswith("")`.
    # `str.startswith("")` is always True, so that guard never filtered
    # anything and has been dropped.
    (
        predicted_label_title,
        confidence_score_title,
        predicted_label_content,
        confidence_scores_content,
        new_data,
        predicted_label_text,
        confidence_score_text,
    ) = predict_2(txt_url, normalized_content_with_style)

    is_url = txt_url.startswith(("http://", "https://"))
    flagged_words = []

    if predicted_label_content == "NSFW" or predicted_label_text == "NSFW":
        source_text = new_data["content"] if is_url else normalized_content_with_style
        words = source_text.split()

        # Only pay for loading the model when there is something to score.
        if words:
            model = DistilBertForSequenceClassification.from_pretrained("/content/LoadModel")
            tokenizer = DistilBertTokenizer.from_pretrained("/content/LoadModel")
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            model = model.to(device)
            model.eval()

            # Inference only: one no_grad scope for the whole loop.
            with torch.no_grad():
                for word in words:
                    encoding = tokenizer.encode_plus(
                        word,
                        truncation=True,
                        padding=True,
                        max_length=512,
                        return_tensors="pt",
                    )
                    input_ids = encoding["input_ids"].to(device)
                    attention_mask = encoding["attention_mask"].to(device)

                    logits = model(input_ids, attention_mask=attention_mask).logits
                    probabilities = F.softmax(logits, dim=1)
                    predicted_label_word = label_mapping[torch.argmax(logits, dim=1).item()]

                    if predicted_label_word == "NSFW":
                        flagged_words.append(
                            {
                                "Word": word,
                                "Label": predicted_label_word,
                                # Highest class probability for this word.
                                "Confidence": torch.max(probabilities, dim=1).values.item(),
                            }
                        )

    # Serialise once, so the last output is a JSON string in every case.
    new_word = json.dumps(flagged_words)

    return (
        predicted_label_title,
        confidence_score_title,
        predicted_label_content,
        confidence_scores_content,
        new_data,
        predicted_label_text,
        confidence_score_text,
        new_word,
    )
238
 
239
 
240
  demo = gr.Interface(
241
+ fn=word_by_word,
242
  inputs=[
243
  gr.inputs.Textbox(label="URL", placeholder="Enter URL"),
244
  gr.inputs.Textbox(label="Text", placeholder="Enter Text"),
 
249
  gr.outputs.Textbox(label="Content_prediction"),
250
  gr.outputs.Textbox(label="Content_confidence_score"),
251
  gr.outputs.Textbox(label="Description").style(show_copy_button=True),
252
+ gr.outputs.Textbox(label="Text_prediction_score"),
253
  gr.outputs.Textbox(label="Text_confidence_score"),
254
+ gr.outputs.Textbox(label="word-by-word").style(show_copy_button=True),
255
  ],
256
+ )
257
 
258
+ demo.launch(debug=True, share= True)