File size: 5,421 Bytes
a32ba3c
371710a
a32ba3c
 
 
 
 
 
 
21537e2
a32ba3c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a076467
b01ccca
a32ba3c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
from transformers.pipelines.image_segmentation import Predictions
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer
import unidecode, re, unicodedata
from bs4 import BeautifulSoup
from urllib.request import urlopen
from urllib.parse import urlparse
from sklearn.metrics import confusion_matrix, accuracy_score
import torch.nn.functional as F
import gradio as gr
import torch

# Maps the classifier's output index to the label shown to the user.
_LABEL_MAPPING = {1: "SFW", 0: "NSFW"}

# CSS class of the <div> blocks whose content is dropped before extraction.
_CSS_CLASS_TO_REMOVE = "dp-highlighter"

# Only these URL schemes are fetched; the URL comes straight from user input.
_ALLOWED_SCHEMES = ("http", "https")

# Filled on first use by _load_classifier() so the weights are read from disk
# once per process instead of once per request.
_CLASSIFIER_CACHE = {}


def _load_classifier():
    """Return ``(model, tokenizer, device)``, loading them from "." on first use."""
    if not _CLASSIFIER_CACHE:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = DistilBertForSequenceClassification.from_pretrained(".").to(device)
        model.eval()
        tokenizer = DistilBertTokenizer.from_pretrained(".")
        _CLASSIFIER_CACHE["loaded"] = (model, tokenizer, device)
    return _CLASSIFIER_CACHE["loaded"]


def _page_url(txt_url):
    """Validate *txt_url* and return it with its last path segment removed."""
    parsed = urlparse(txt_url)
    if parsed.scheme not in _ALLOWED_SCHEMES:
        raise ValueError(
            f"Unsupported URL scheme {parsed.scheme!r}; expected http or https"
        )
    parent_path = parsed.path.rsplit("/", 1)[0]
    return f"{parsed.scheme}://{parsed.netloc}{parent_path}/"


def _fetch_soup(url):
    """Download *url* and parse the response body with BeautifulSoup's html.parser."""
    # The context manager closes the HTTP response even if read/decode fails.
    with urlopen(url=url) as response:
        # errors="replace" keeps one stray byte from aborting the whole request.
        page = response.read().decode("utf-8", errors="replace")
    return BeautifulSoup(page, "html.parser")


def _paragraph_text(soup, styled):
    """Concatenate the newline-stripped text of the <p> tags selected by ``style=styled``."""
    return "".join(
        p_tag.get_text().replace("\n", "")
        for p_tag in soup.find_all("p", style=styled)
    )


def _normalize_text(text):
    """Collapse whitespace runs to one space, then NFKD-normalize and transliterate."""
    collapsed = re.sub(r"\s+", " ", text)
    return unidecode.unidecode(unicodedata.normalize("NFKD", collapsed))


def _classify_text(text):
    """Run the classifier on *text* and return ``(label, confidence_list)``."""
    model, tokenizer, device = _load_classifier()
    encodings = tokenizer.encode_plus(
        text,
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors="pt",
    )
    input_ids = encodings["input_ids"].to(device)
    attention_mask = encodings["attention_mask"].to(device)
    with torch.no_grad():
        logits = model(input_ids, attention_mask=attention_mask).logits
        predicted = torch.argmax(logits, dim=1)
        confidence = torch.max(F.softmax(logits, dim=1), dim=1).values.tolist()
    return _LABEL_MAPPING[predicted.item()], confidence


def check_by_url(txt_url):
    """Fetch the page behind *txt_url* and classify its title and body text.

    The last path segment of *txt_url* is dropped before fetching, so for
    ``https://host/article/slug/default.txt`` the page requested is
    ``https://host/article/slug/``.

    Args:
        txt_url: An http(s) URL whose parent path is the page to inspect.

    Returns:
        A 5-tuple ``(title_label, title_confidence, content_label,
        content_confidence, new_data)``. The labels are ``"SFW"`` or
        ``"NSFW"``, each confidence is the list produced by ``.tolist()`` on
        the maximum softmax probability, and ``new_data`` is a dict with the
        ``"title"`` and ``"content"`` strings that were classified.

    Raises:
        ValueError: If the URL scheme is not http or https.
    """
    url = _page_url(txt_url)
    print(url)

    soup = _fetch_soup(url)

    # A page without a <title> element is classified with an empty title
    # instead of failing with AttributeError on None.
    title_tag = soup.find("title")
    title = title_tag.get_text() if title_tag is not None else ""

    # Empty out code samples so they do not end up in the classified text.
    for tag in soup.find_all(["code", "pre"]):
        tag.clear()
    for tag in soup.find_all("div", class_=_CSS_CLASS_TO_REMOVE):
        tag.clear()

    # Text from the style=True selection comes first, then the style=False
    # selection; each part is normalized on its own before concatenation.
    content = (
        _normalize_text(_paragraph_text(soup, True))
        + _normalize_text(_paragraph_text(soup, False))
    )
    new_data = {"title": title, "content": content}

    predicted_label_title, confidence_score_title = _classify_text(title)
    predicted_label_content, confidence_scores_content = _classify_text(content)

    return (
        predicted_label_title,
        confidence_score_title,
        predicted_label_content,
        confidence_scores_content,
        new_data,
    )

def predict_2(url):
    """Gradio callback: classify the page at *url* via ``check_by_url``.

    Returns the five values from ``check_by_url`` unchanged and in the same
    order: title label, title confidence, content label, content confidence,
    and the extracted page data.
    """
    (
        title_label,
        title_confidence,
        content_label,
        content_confidence,
        page_data,
    ) = check_by_url(url)
    return (
        title_label,
        title_confidence,
        content_label,
        content_confidence,
        page_data,
    )

# Single text input: the URL passed to predict_2.
interface_inputs = [gr.inputs.Textbox(label="Enter URL")]

# One textbox per value returned by predict_2, in the same order.
interface_outputs = [
    gr.outputs.Textbox(label=caption)
    for caption in (
        "Title_prediction",
        "Title_confidence_score",
        "Content_prediction",
        "content_confidence_score",
    )
]
# The last output shows the extracted page data and gets a copy button.
interface_outputs.append(
    gr.outputs.Textbox(label="new_data").style(show_copy_button=True)
)

demo = gr.Interface(
    fn=predict_2,
    inputs=interface_inputs,
    outputs=interface_outputs,
)
demo.launch()