import re
import unicodedata
from urllib.parse import urlparse
from urllib.request import urlopen

import unidecode
import torch
import torch.nn.functional as F
import gradio as gr
from bs4 import BeautifulSoup
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer
def check_by_url(txt_url):
    # Example input: "https://www.c-sharpcorner.com/article/how-to-add-multimedia-content-with-html/default.txt"
    # Derive the article URL by dropping the final path segment of the .txt URL.
    parsed_url = urlparse(txt_url)
    url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path.rsplit('/', 1)[0]}/"
    print(url)

    page = urlopen(url=url).read().decode("utf-8")
    soup = BeautifulSoup(page, 'html.parser')
    title = soup.find('title').get_text()
css_class_to_remove = "dp-highlighter" # Replace with the CSS class you want to remove
#Find <div> tags with the specified CSS class and remove their content
div_tags = soup.find_all(['code', 'pre'])
for div_tag in div_tags:
div_tag.clear()
div_tags = soup.find_all('div', class_=css_class_to_remove)
for div_tag in div_tags:
div_tag.clear()
    # Collect the text of <p> tags that carry an inline style attribute
    content_with_style = ""
    for p_tag in soup.find_all('p', style=True):
        content_with_style += re.sub(r'\n', '', p_tag.get_text())

    # Collect the text of <p> tags without a style attribute
    content_without_style = ""
    for p_tag in soup.find_all('p', style=False):
        content_without_style += re.sub(r'\n', '', p_tag.get_text())
    # Normalize both chunks: collapse whitespace, drop '\r', apply Unicode NFKD
    # normalization, and transliterate to ASCII; then concatenate them.
    normalized_content_with_style = re.sub(r'\s+', ' ', content_with_style)
    normalized_content_with_style = normalized_content_with_style.replace('\r', '')
    normalized_content_with_style = unicodedata.normalize('NFKD', normalized_content_with_style)
    normalized_content_with_style = unidecode.unidecode(normalized_content_with_style)

    normalized_content_without_style = re.sub(r'\s+', ' ', content_without_style)
    normalized_content_without_style = normalized_content_without_style.replace('\r', '')
    normalized_content_without_style = unicodedata.normalize('NFKD', normalized_content_without_style)
    normalized_content_without_style = unidecode.unidecode(normalized_content_without_style)

    normalized_content_with_style += normalized_content_without_style
    new_data = {"title": title, "content": normalized_content_with_style}
    # Load the fine-tuned DistilBERT classifier and tokenizer from the repo root.
    model = DistilBertForSequenceClassification.from_pretrained(".")
    tokenizer = DistilBertTokenizer.from_pretrained(".")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.eval()

    label_mapping = {1: "SFW", 0: "NSFW"}  # label 1 = safe for work, label 0 = not safe for work

    # Classify the page title.
    test_encodings = tokenizer.encode_plus(
        title,
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors="pt"
    )
    test_input_ids = test_encodings["input_ids"].to(device)
    test_attention_mask = test_encodings["attention_mask"].to(device)
    with torch.no_grad():
        outputs = model(test_input_ids, attention_mask=test_attention_mask)
        logits = outputs.logits
        probabilities = F.softmax(logits, dim=1)
        predicted_labels = torch.argmax(logits, dim=1)
        confidence_score_title = torch.max(probabilities, dim=1).values.tolist()
    predicted_label_title = label_mapping[predicted_labels.item()]
    # Classify the cleaned article content.
    test_encodings = tokenizer.encode_plus(
        normalized_content_with_style,
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors="pt"
    )
    test_input_ids = test_encodings["input_ids"].to(device)
    test_attention_mask = test_encodings["attention_mask"].to(device)
    with torch.no_grad():
        outputs = model(test_input_ids, attention_mask=test_attention_mask)
        logits = outputs.logits
        probabilities = F.softmax(logits, dim=1)
        predicted_labels = torch.argmax(logits, dim=1)
        confidence_scores_content = torch.max(probabilities, dim=1).values.tolist()
    predicted_label_content = label_mapping[predicted_labels.item()]

    return predicted_label_title, confidence_score_title, predicted_label_content, confidence_scores_content, new_data
def predict_2(url):
    predicted_label_title, confidence_score_title, predicted_label_content, confidence_scores_content, new_data = check_by_url(url)
    return predicted_label_title, confidence_score_title, predicted_label_content, confidence_scores_content, new_data
demo = gr.Interface(
    fn=predict_2,
    inputs=[
        gr.Textbox(label="Enter URL"),
    ],
    outputs=[
        gr.Textbox(label="Title_prediction"),
        gr.Textbox(label="Title_confidence_score"),
        gr.Textbox(label="Content_prediction"),
        gr.Textbox(label="Content_confidence_score"),
        gr.Textbox(label="new_data", show_copy_button=True),
    ],
)

demo.launch()
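
# Quick local check without the Gradio UI (a sketch; it assumes the fine-tuned
# DistilBERT checkpoint and tokenizer files are present in the current directory,
# and uses the example URL from the comment in check_by_url):
#
#     label_t, conf_t, label_c, conf_c, data = check_by_url(
#         "https://www.c-sharpcorner.com/article/how-to-add-multimedia-content-with-html/default.txt"
#     )
#     print(label_t, conf_t, label_c, conf_c)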