import re
import string
import unicodedata

import unidecode
import torch
import torch.nn.functional as F
import gradio as gr
from bs4 import BeautifulSoup
from urllib.request import urlopen
from urllib.parse import urlparse
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer
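# Module-level configuration. The original script defined Save_model inside
# check_by_url, which left predict_2 with a NameError when it tried to load
# the model; defining it once here fixes the scoping. The path is a
# placeholder -- point it at your own fine-tuned DistilBERT checkpoint.
Save_model = "/content/Model_SAVE"  # Replace with the path to your saved model
label_mapping = {1: "SFW", 0: "NSFW"}  # 1 -> SFW, 0 -> NSFW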

def check_by_url(txt_url):
    # Strip the last path segment so the page is fetched from its base URL.
    parsed_url = urlparse(txt_url)
    url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path.rsplit('/', 1)[0]}/"
    print(url)

    page = urlopen(url=url).read().decode("utf-8")
    soup = BeautifulSoup(page, 'html.parser')
    title = soup.find('title').get_text()

    # Remove punctuation from a string (kept as an optional title-cleaning helper).
    def remove_punctuation(text):
        return "".join(ch for ch in text if ch not in string.punctuation)

    # Drop code snippets so they do not pollute the extracted text.
    for tag in soup.find_all(['code', 'pre']):
        tag.clear()

    css_class_to_remove = "dp-highlighter"  # Replace with the CSS class you want to remove
    for div_tag in soup.find_all('div', class_=css_class_to_remove):
        div_tag.clear()

    # Fetch content of <p> tags with inline styles.
    content_with_style = ""
    for p_tag in soup.find_all('p', style=True):
        content_with_style += re.sub(r'\n', '', p_tag.get_text())

    # Fetch content of <p> tags without inline styles.
    content_without_style = ""
    for p_tag in soup.find_all('p', style=False):
        content_without_style += re.sub(r'\n', '', p_tag.get_text())

    # Collapse whitespace, drop '\r', normalize Unicode, and transliterate to ASCII.
    def normalize(text):
        text = re.sub(r'\s+', ' ', text).replace('\r', '')
        text = unicodedata.normalize('NFKD', text)
        return unidecode.unidecode(text)

    normalized_content_with_style = normalize(content_with_style)
    normalized_content_without_style = normalize(content_without_style)

    # Combine both variants into a single content string.
    normalized_content_with_style += normalized_content_without_style
    new_data = {"title": title, "content": normalized_content_with_style}

    model = DistilBertForSequenceClassification.from_pretrained(Save_model)
    tokenizer = DistilBertTokenizer.from_pretrained(Save_model)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.eval()

    # Classify the page title.
    test_encodings = tokenizer.encode_plus(
        title,
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors="pt"
    )
    test_input_ids = test_encodings["input_ids"].to(device)
    test_attention_mask = test_encodings["attention_mask"].to(device)
    with torch.no_grad():
        outputs = model(test_input_ids, attention_mask=test_attention_mask)
        logits = outputs.logits
        predicted_labels = torch.argmax(logits, dim=1)
        probabilities = F.softmax(logits, dim=1)
        confidence_score_title = torch.max(probabilities, dim=1).values.tolist()
        predicted_label_title = predicted_labels.item()

    # Classify the page content.
    test_encodings = tokenizer.encode_plus(
        normalized_content_with_style,
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors="pt"
    )
    test_input_ids = test_encodings["input_ids"].to(device)
    test_attention_mask = test_encodings["attention_mask"].to(device)
    with torch.no_grad():
        outputs = model(test_input_ids, attention_mask=test_attention_mask)
        logits = outputs.logits
        predicted_labels = torch.argmax(logits, dim=1)
        probabilities = F.softmax(logits, dim=1)
        confidence_scores_content = torch.max(probabilities, dim=1).values.tolist()
        predicted_label_content = predicted_labels.item()

    # Map numeric labels to readable names (1 -> SFW, 0 -> NSFW).
    predicted_label_title = label_mapping[predicted_label_title]
    predicted_label_content = label_mapping[predicted_label_content]

    return predicted_label_title, confidence_score_title, predicted_label_content, confidence_scores_content, new_data
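
# Example call (hypothetical URL), assuming Save_model points at a valid
# checkpoint directory:
#   title_label, title_conf, content_label, content_conf, page_data = \
#       check_by_url("https://example.com/blog/some-post")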
    
def predict_2(txt_url, normalized_content_with_style):
    predicted_label_title, confidence_score_title, predicted_label_content, confidence_scores_content, new_data = None, None, None, None, None
    predicted_label_text, confidence_score_text = None, None

    if txt_url.startswith("http://") or txt_url.startswith("https://"):
        predicted_label_title, confidence_score_title, predicted_label_content, confidence_scores_content, new_data = check_by_url(txt_url)
    elif normalized_content_with_style.strip():
        # No URL was given, so classify the raw text directly.
        model = DistilBertForSequenceClassification.from_pretrained(Save_model)
        tokenizer = DistilBertTokenizer.from_pretrained(Save_model)

        test_encodings = tokenizer.encode_plus(
            normalized_content_with_style,
            truncation=True,
            padding=True,
            max_length=512,
            return_tensors="pt"
        )

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        test_input_ids = test_encodings["input_ids"].to(device)
        test_attention_mask = test_encodings["attention_mask"].to(device)

        with torch.no_grad():
            model = model.to(device)
            model.eval()
            outputs = model(test_input_ids, attention_mask=test_attention_mask)
            logits = outputs.logits
            predicted_labels = torch.argmax(logits, dim=1)
            probabilities = F.softmax(logits, dim=1)
            confidence_score_text = torch.max(probabilities, dim=1).values.tolist()
            predicted_label_text = label_mapping[predicted_labels.item()]
    else:
        print("No URL or text provided")

    return predicted_label_title, confidence_score_title, predicted_label_content, confidence_scores_content, new_data, predicted_label_text, confidence_score_text
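
# Direct (non-UI) usage sketch, with hypothetical input text -- pass an empty
# URL to route to the text branch:
#   results = predict_2("", "Some article text to classify")
#   print(results[5], results[6])  # text prediction and its confidence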

demo = gr.Interface(
    fn=predict_2,
    inputs=[
        gr.Textbox(label="URL", placeholder="Enter URL"),
        gr.Textbox(label="Text", placeholder="Enter Text"),
    ],
    outputs=[
        gr.Textbox(label="Title_prediction"),
        gr.Textbox(label="Title_confidence_score"),
        gr.Textbox(label="Content_prediction"),
        gr.Textbox(label="Content_confidence_score"),
        gr.Textbox(label="Description", show_copy_button=True),
        gr.Textbox(label="Text_prediction"),
        gr.Textbox(label="Text_confidence_score"),
    ],
)

demo.launch()
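
# demo.launch() serves the app locally; in a notebook environment such as
# Colab you can pass share=True to get a temporary public link:
#   demo.launch(share=True)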