RohitGuptaAI committed
Commit 43cdd6d · 1 Parent(s): 6ad29ed

Update app.py

Files changed (1):
app.py +157 -115
app.py CHANGED
@@ -8,124 +8,166 @@ from sklearn.metrics import confusion_matrix, accuracy_score
 import torch.nn.functional as F
 import gradio as gr
 import torch
+import nltk
 
 def check_by_url(txt_url):
-    #txt_url = "https://www.c-sharpcorner.com/article/how-to-add-multimedia-content-with-html/default.txt"
-    parsed_url = urlparse(txt_url)
-    url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path.rsplit('/', 1)[0]}/"
-    print(url)
-
-    new_data = []
-    page = urlopen(url=url).read().decode("utf-8")
-    soup = BeautifulSoup(page, 'html.parser')
-    title = soup.find('title').get_text()
-
-    css_class_to_remove = "dp-highlighter"  # Replace with the CSS class you want to remove
-    # Find <div> tags with the specified CSS class and remove their content
-    div_tags = soup.find_all(['code', 'pre'])
-    for div_tag in div_tags:
-        div_tag.clear()
-
-    div_tags = soup.find_all('div', class_=css_class_to_remove)
-    for div_tag in div_tags:
-        div_tag.clear()
-
-    # Fetch content of remaining tags
-    content_with_style = ""
-    p_tags_with_style = soup.find_all('p', style=True)
-    for p_tag in p_tags_with_style:
-        p_content = re.sub(r'\n', '', p_tag.get_text())
-        content_with_style += p_content
-
-    # Fetch content of <p> tags without style
-    content_without_style = ""
-    p_tags_without_style = soup.find_all('p', style=False)
-    for p_tag in p_tags_without_style:
-        p_content = re.sub(r'\n', '', p_tag.get_text())
-        content_without_style += p_content
-
-    # Replace Unicode characters in the content and remove duplicates
-    normalized_content_with_style = re.sub(r'\s+', ' ', content_with_style)  # Remove extra spaces
-    normalized_content_with_style = normalized_content_with_style.replace('\r', '')  # Remove '\r' characters
-    normalized_content_with_style = unicodedata.normalize('NFKD', normalized_content_with_style)
-    normalized_content_with_style = unidecode.unidecode(normalized_content_with_style)
-
-    normalized_content_without_style = re.sub(r'\s+', ' ', content_without_style)  # Remove extra spaces
-    normalized_content_without_style = normalized_content_without_style.replace('\r', '')  # Remove '\r' characters
-    normalized_content_without_style = unicodedata.normalize('NFKD', normalized_content_without_style)
-    normalized_content_without_style = unidecode.unidecode(normalized_content_without_style)
-
-    normalized_content_with_style += normalized_content_without_style
-    new_data = {"title": title, "content": normalized_content_with_style}
-
-    model = DistilBertForSequenceClassification.from_pretrained(".")
-    tokenizer = DistilBertTokenizer.from_pretrained(".")
-
-    test_encodings = tokenizer.encode_plus(
-        title,
-        truncation=True,
-        padding=True,
-        max_length=512,
-        return_tensors="pt"
-    )
-    model1 = []
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    test_input_ids = test_encodings["input_ids"].to(device)
-    test_attention_mask = test_encodings["attention_mask"].to(device)
-    with torch.no_grad():
-        model1 = model.to(device)
-        model1.eval()
-        outputs = model1(test_input_ids, attention_mask=test_attention_mask)
-        logits = outputs.logits
-        predicted_labels = torch.argmax(logits, dim=1)
-        probabilities = F.softmax(logits, dim=1)
-        confidence_score_title = torch.max(probabilities, dim=1).values.tolist()
-    predicted_labels = torch.argmax(outputs.logits, dim=1)
-    label_mapping = {1: "SFW", 0: "NSFW"}  # 1: True, 0: False
-    predicted_label_title = label_mapping[predicted_labels.item()]
-
-    test_encodings = tokenizer.encode_plus(
-        normalized_content_with_style,
-        truncation=True,
-        padding=True,
-        max_length=512,
-        return_tensors="pt"
-    )
-    model1 = []
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    test_input_ids = test_encodings["input_ids"].to(device)
-    test_attention_mask = test_encodings["attention_mask"].to(device)
-    with torch.no_grad():
-        model1 = model.to(device)
-        model1.eval()
-        outputs = model1(test_input_ids, attention_mask=test_attention_mask)
-        logits = outputs.logits
-        predicted_labels = torch.argmax(logits, dim=1)
-        probabilities = F.softmax(logits, dim=1)
-        confidence_scores_content = torch.max(probabilities, dim=1).values.tolist()
-    label_mapping = {1: "SFW", 0: "NSFW"}  # 1: True, 0: False
-    predicted_label_content = label_mapping[predicted_labels.item()]
-
-    return predicted_label_title, confidence_score_title, predicted_label_content, confidence_scores_content, new_data
-
-def predict_2(url):
-    predicted_label_title, confidence_score_title, predicted_label_content, confidence_scores_content, new_data = check_by_url(url)
-    return predicted_label_title, confidence_score_title, predicted_label_content, confidence_scores_content, new_data
+    #if txt_url.startswith("http://") or txt_url.startswith("https://"):
+    parsed_url = urlparse(txt_url)
+    url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path.rsplit('/', 1)[0]}/"
+    print(url)
+
+    new_data = []
+    page = urlopen(url=url).read().decode("utf-8")
+    soup = BeautifulSoup(page, 'html.parser')
+    title = soup.find('title').get_text()
+
+    # Remove punctuation from the title
+    def remove_punctuation(title):
+        punctuationfree = "".join([i for i in title if i not in string.punctuation])
+        return punctuationfree
+
+    css_class_to_remove = "dp-highlighter"  # Replace with the CSS class you want to remove
+    # Find <div> tags with the specified CSS class and remove their content
+    div_tags = soup.find_all(['code', 'pre'])
+    for div_tag in div_tags:
+        div_tag.clear()
+
+    div_tags = soup.find_all('div', class_=css_class_to_remove)
+    for div_tag in div_tags:
+        div_tag.clear()
+
+    # Fetch content of remaining tags
+    content_with_style = ""
+    p_tags_with_style = soup.find_all('p', style=True)
+    for p_tag in p_tags_with_style:
+        p_content = re.sub(r'\n', '', p_tag.get_text())
+        content_with_style += p_content
+
+    # Fetch content of <p> tags without style
+    content_without_style = ""
+    p_tags_without_style = soup.find_all('p', style=False)
+    for p_tag in p_tags_without_style:
+        p_content = re.sub(r'\n', '', p_tag.get_text())
+        content_without_style += p_content
+
+    # Replace Unicode characters in the content and remove duplicates
+    normalized_content_with_style = re.sub(r'\s+', ' ', content_with_style)  # Remove extra spaces
+    normalized_content_with_style = normalized_content_with_style.replace('\r', '')  # Remove '\r' characters
+    normalized_content_with_style = unicodedata.normalize('NFKD', normalized_content_with_style)
+    normalized_content_with_style = unidecode.unidecode(normalized_content_with_style)
+
+    normalized_content_without_style = re.sub(r'\s+', ' ', content_without_style)  # Remove extra spaces
+    normalized_content_without_style = normalized_content_without_style.replace('\r', '')  # Remove '\r' characters
+    normalized_content_without_style = unicodedata.normalize('NFKD', normalized_content_without_style)
+    normalized_content_without_style = unidecode.unidecode(normalized_content_without_style)
+
+    normalized_content_with_style += normalized_content_without_style
+    new_data = {"title": title, "content": normalized_content_with_style}
+
+    Save_model = "/content/G:\Model_SAVE"  # Replace with your saved model name
+
+    model = DistilBertForSequenceClassification.from_pretrained(Save_model)
+    tokenizer = DistilBertTokenizer.from_pretrained(Save_model)
+
+    test_encodings = tokenizer.encode_plus(
+        title,
+        truncation=True,
+        padding=True,
+        max_length=512,
+        return_tensors="pt"
+    )
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    test_input_ids = test_encodings["input_ids"].to(device)
+    test_attention_mask = test_encodings["attention_mask"].to(device)
+    with torch.no_grad():
+        model = model.to(device)
+        model.eval()
+        outputs = model(test_input_ids, attention_mask=test_attention_mask)
+        logits = outputs.logits
+        predicted_labels = torch.argmax(logits, dim=1)
+        probabilities = F.softmax(logits, dim=1)
+        confidence_score_title = torch.max(probabilities, dim=1).values.tolist()
+    predicted_label_title = predicted_labels.item()
+
+    test_encodings = tokenizer.encode_plus(
+        normalized_content_with_style,
+        truncation=True,
+        padding=True,
+        max_length=512,
+        return_tensors="pt"
+    )
+    test_input_ids = test_encodings["input_ids"].to(device)
+    test_attention_mask = test_encodings["attention_mask"].to(device)
+    with torch.no_grad():
+        outputs = model(test_input_ids, attention_mask=test_attention_mask)
+        logits = outputs.logits
+        predicted_labels = torch.argmax(logits, dim=1)
+        probabilities = F.softmax(logits, dim=1)
+        confidence_scores_content = torch.max(probabilities, dim=1).values.tolist()
+    predicted_label_content = predicted_labels.item()
+
+    label_mapping = {1: "SFW", 0: "NSFW"}  # 1: True, 0: False
+    predicted_label_title = label_mapping[predicted_label_title]
+    predicted_label_content = label_mapping[predicted_label_content]
+
+    return predicted_label_title, confidence_score_title, predicted_label_content, confidence_scores_content, new_data
+
+label_mapping = {1: "SFW", 0: "NSFW"}  # 1: True, 0: False
+def predict_2(txt_url, normalized_content_with_style):
+    predicted_label_title, confidence_score_title, predicted_label_content, confidence_scores_content, new_data = None, None, None, None, None
+    predicted_label_text, confidence_score_text = None, None
+
+    if txt_url.startswith("http://") or txt_url.startswith("https://"):
+        predicted_label_title, confidence_score_title, predicted_label_content, confidence_scores_content, new_data = check_by_url(txt_url)
+    elif text.startswith(""):
+        model = DistilBertForSequenceClassification.from_pretrained(Save_model)
+        tokenizer = DistilBertTokenizer.from_pretrained(Save_model)
+
+        test_encodings = tokenizer.encode_plus(
+            normalized_content_with_style,
+            truncation=True,
+            padding=True,
+            max_length=512,
+            return_tensors="pt"
+        )
+
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        test_input_ids = test_encodings["input_ids"].to(device)
+        test_attention_mask = test_encodings["attention_mask"].to(device)
+
+        with torch.no_grad():
+            model = model.to(device)
+            model.eval()
+            outputs = model(test_input_ids, attention_mask=test_attention_mask)
+            logits = outputs.logits
+            predicted_labels = torch.argmax(logits, dim=1)
+            probabilities = F.softmax(logits, dim=1)
+            confidence_score_text = torch.max(probabilities, dim=1).values.tolist()
+            predicted_label_text = label_mapping[predicted_labels.item()]
+
+        #predicted_label_text, confidence_score_text = check_by_text(normalized_content_with_style)
+    else:
+        print("Done")
+
+    return predicted_label_title, confidence_score_title, predicted_label_content, confidence_scores_content, new_data, predicted_label_text, confidence_score_text
 
 demo = gr.Interface(
-    fn=predict_2,
-    inputs=[
-        gr.inputs.Textbox(label="Enter URL"),
-    ],
-    outputs=[
-        gr.outputs.Textbox(label="Title_prediction"),
-        gr.outputs.Textbox(label="Title_confidence_score"),
-        gr.outputs.Textbox(label="Content_prediction"),
-        gr.outputs.Textbox(label="content_confidence_score"),
-        gr.outputs.Textbox(label="new_data").style(show_copy_button=True)
-    ],
+    fn=predict_2,
+    inputs=[
+        gr.inputs.Textbox(label="URL", placeholder="Enter URL"),
+        gr.inputs.Textbox(label="Text", placeholder="Enter Text"),
+        #gr.inputs.Textbox(label="Content", placeholder="Enter Content"),
+    ],
+    outputs=[
+        gr.outputs.Textbox(label="Title_prediction"),
+        gr.outputs.Textbox(label="Title_confidence_score"),
+        gr.outputs.Textbox(label="Content_prediction"),
+        gr.outputs.Textbox(label="Content_confidence_score"),
+        gr.outputs.Textbox(label="Description").style(show_copy_button=True),
+        gr.outputs.Textbox(label="Tex_prediction"),
+        gr.outputs.Textbox(label="Text_confidence_score"),
+    ],
 )
+
 demo.launch()
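
A note on the new checkpoint path: Save_model = "/content/G:\Model_SAVE" splices a Colab-style /content/ prefix onto a Windows drive path, and the backslash survives only because \M is not a recognized escape sequence (recent Python versions warn about such strings). A raw string or forward slashes sidesteps the problem; both values below are illustrative, not from the commit:

    Save_model = r"G:\Model_SAVE"   # raw string keeps the backslash literal on Windows
    Save_model = "./model_save"     # or a relative path inside the Space itself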
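
The text branch of predict_2 cannot run as committed: text is never defined (the second parameter is named normalized_content_with_style), Save_model exists only in the local scope of check_by_url, and elif text.startswith("") is true for every string, so it is not a useful guard. import string is also missing for the new remove_punctuation helper, and both that helper and the new import nltk are currently unused. A minimal sketch of the dispatch with the scope issues fixed, assuming the two Gradio textboxes map to txt_url and text; check_by_text is a hypothetical helper (one version is sketched in the next note):

    import string  # needed once remove_punctuation is actually called

    SAVE_MODEL = "./model_save"  # hypothetical module-level path, visible to every function

    def predict_2(txt_url, text):
        # URL branch: scrape the page, then classify its title and content.
        if txt_url.startswith(("http://", "https://")):
            return (*check_by_url(txt_url), None, None)
        # Text branch: classify pasted text directly; test for non-empty input
        # instead of startswith(""), which every string satisfies.
        if text.strip():
            predicted_label_text, confidence_score_text = check_by_text(text)
            return None, None, None, None, None, predicted_label_text, confidence_score_text
        return (None,) * 7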
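
The tokenize/forward/softmax/argmax sequence now appears three times (title, content, pasted text) with only the input string changing. One way to collapse the duplication, keeping the same label mapping; classify and check_by_text are illustrative names, not part of the commit:

    import torch
    import torch.nn.functional as F
    from transformers import DistilBertForSequenceClassification, DistilBertTokenizer

    label_mapping = {1: "SFW", 0: "NSFW"}

    def classify(text, model, tokenizer, device):
        # Returns (label, confidence) for a single string.
        enc = tokenizer(text, truncation=True, padding=True,
                        max_length=512, return_tensors="pt")
        model.to(device).eval()
        with torch.no_grad():
            logits = model(enc["input_ids"].to(device),
                           attention_mask=enc["attention_mask"].to(device)).logits
        probabilities = F.softmax(logits, dim=1)
        label = label_mapping[logits.argmax(dim=1).item()]
        return label, probabilities.max(dim=1).values.item()

    def check_by_text(text):
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = DistilBertForSequenceClassification.from_pretrained(SAVE_MODEL)
        tokenizer = DistilBertTokenizer.from_pretrained(SAVE_MODEL)
        return classify(text, model, tokenizer, device)

With a helper like this, check_by_url reduces to the scraping code plus two calls, e.g. classify(title, model, tokenizer, device) and classify(normalized_content_with_style, model, tokenizer, device).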
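
Finally, gr.inputs.Textbox, gr.outputs.Textbox, and .style(show_copy_button=True) are the legacy Gradio 2.x/3.x API; they were deprecated in Gradio 3 and removed in Gradio 4 (the "Tex_prediction" label also looks like a typo for "Text_prediction"). On current Gradio releases the interface would look roughly like:

    demo = gr.Interface(
        fn=predict_2,
        inputs=[
            gr.Textbox(label="URL", placeholder="Enter URL"),
            gr.Textbox(label="Text", placeholder="Enter Text"),
        ],
        outputs=[
            gr.Textbox(label="Title_prediction"),
            gr.Textbox(label="Title_confidence_score"),
            gr.Textbox(label="Content_prediction"),
            gr.Textbox(label="Content_confidence_score"),
            gr.Textbox(label="Description", show_copy_button=True),
            gr.Textbox(label="Text_prediction"),
            gr.Textbox(label="Text_confidence_score"),
        ],
    )
    demo.launch()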