Commit 14b8594 · Parent(s): 310f1bb

Upload 2 files

Files changed:
- app.py +131 -0
- requirements.txt +7 -0
app.py
ADDED
@@ -0,0 +1,131 @@
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer
from bs4 import BeautifulSoup
from urllib.request import urlopen
from urllib.parse import urlparse
import torch
import torch.nn.functional as F
import gradio as gr
import re
import unicodedata
import unidecode


def check_by_url(txt_url):
    # Example input:
    # txt_url = "https://www.c-sharpcorner.com/article/how-to-add-multimedia-content-with-html/default.txt"
    # Drop the last path segment (e.g. "default.txt") so the article page itself is fetched.
    parsed_url = urlparse(txt_url)
    url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path.rsplit('/', 1)[0]}/"
    print(url)

    page = urlopen(url).read().decode("utf-8")
    soup = BeautifulSoup(page, "html.parser")
    title = soup.find("title").get_text()

    # Strip code samples and syntax-highlighted blocks so only prose is classified.
    for tag in soup.find_all(["code", "pre"]):
        tag.clear()

    css_class_to_remove = "dp-highlighter"  # CSS class whose <div> content should be dropped
    for div_tag in soup.find_all("div", class_=css_class_to_remove):
        div_tag.clear()

    # Collect the text of <p> tags that carry an inline style attribute.
    content_with_style = ""
    for p_tag in soup.find_all("p", style=True):
        content_with_style += re.sub(r"\n", "", p_tag.get_text())

    # Collect the text of <p> tags without one.
    content_without_style = ""
    for p_tag in soup.find_all("p", style=False):
        content_without_style += re.sub(r"\n", "", p_tag.get_text())

    # Collapse whitespace, strip '\r', normalize Unicode, and transliterate to ASCII.
    normalized_content_with_style = re.sub(r"\s+", " ", content_with_style).replace("\r", "")
    normalized_content_with_style = unicodedata.normalize("NFKD", normalized_content_with_style)
    normalized_content_with_style = unidecode.unidecode(normalized_content_with_style)

    normalized_content_without_style = re.sub(r"\s+", " ", content_without_style).replace("\r", "")
    normalized_content_without_style = unicodedata.normalize("NFKD", normalized_content_without_style)
    normalized_content_without_style = unidecode.unidecode(normalized_content_without_style)

    normalized_content_with_style += normalized_content_without_style
    new_data = {"title": title, "content": normalized_content_with_style}

    # The fine-tuned DistilBERT checkpoint and tokenizer files are expected in the repo root.
    model = DistilBertForSequenceClassification.from_pretrained(".")
    tokenizer = DistilBertTokenizer.from_pretrained(".")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.eval()

    label_mapping = {1: "SFW", 0: "NSFW"}

    # Classify the page title.
    test_encodings = tokenizer.encode_plus(
        title,
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors="pt",
    )
    test_input_ids = test_encodings["input_ids"].to(device)
    test_attention_mask = test_encodings["attention_mask"].to(device)
    with torch.no_grad():
        outputs = model(test_input_ids, attention_mask=test_attention_mask)
        logits = outputs.logits
        predicted_labels = torch.argmax(logits, dim=1)
        probabilities = F.softmax(logits, dim=1)
        confidence_score_title = torch.max(probabilities, dim=1).values.tolist()
    predicted_label_title = label_mapping[predicted_labels.item()]

    # Classify the normalized page content the same way.
    test_encodings = tokenizer.encode_plus(
        normalized_content_with_style,
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors="pt",
    )
    test_input_ids = test_encodings["input_ids"].to(device)
    test_attention_mask = test_encodings["attention_mask"].to(device)
    with torch.no_grad():
        outputs = model(test_input_ids, attention_mask=test_attention_mask)
        logits = outputs.logits
        predicted_labels = torch.argmax(logits, dim=1)
        probabilities = F.softmax(logits, dim=1)
        confidence_scores_content = torch.max(probabilities, dim=1).values.tolist()
    predicted_label_content = label_mapping[predicted_labels.item()]

    return predicted_label_title, confidence_score_title, predicted_label_content, confidence_scores_content, new_data


def predict_2(url):
    return check_by_url(url)


# gr.inputs/gr.outputs are deprecated namespaces; gr.Textbox works on Gradio 3.x and later.
demo = gr.Interface(
    fn=predict_2,
    inputs=[
        gr.Textbox(label="Enter URL"),
    ],
    outputs=[
        gr.Textbox(label="Title_prediction"),
        gr.Textbox(label="Title_confidence_score"),
        gr.Textbox(label="Content_prediction"),
        gr.Textbox(label="content_confidence_score"),
        gr.Textbox(label="new_data", show_copy_button=True),
    ],
)
demo.launch()
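A note on the two inference blocks above: the predicted label is the argmax over the logits, and the reported confidence is the maximum softmax probability, which belongs to that same class. A minimal, self-contained sketch of the pattern with made-up toy logits (no model or checkpoint needed):

    import torch
    import torch.nn.functional as F

    # Toy logits for one example over two classes (0: NSFW, 1: SFW); values are illustrative only.
    logits = torch.tensor([[0.3, 2.1]])

    predicted_labels = torch.argmax(logits, dim=1)       # tensor([1])
    probabilities = F.softmax(logits, dim=1)             # each row sums to 1
    confidence = torch.max(probabilities, dim=1).values  # probability assigned to the argmax class

    label_mapping = {1: "SFW", 0: "NSFW"}
    print(label_mapping[predicted_labels.item()], confidence.tolist())  # SFW [0.858...]

Because torch.max(probabilities, dim=1) and torch.argmax(logits, dim=1) select the same index, the confidence score always refers to the label actually returned.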
    	
requirements.txt
ADDED
@@ -0,0 +1,7 @@
transformers
torch
numpy
unidecode
gradio
BeautifulSoup4
scikit-learn
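None of these dependencies are pinned, so every Space rebuild pulls the latest releases; Gradio in particular has had breaking API changes across major versions (the deprecated gr.inputs/gr.outputs namespaces were removed in 4.x). If reproducible builds matter, a pinned variant along these lines could be used; the version numbers below are illustrative assumptions, not tested pins:

    # requirements.txt with illustrative pins -- verify against versions the Space actually ran with
    transformers==4.30.*
    torch==2.0.*
    numpy==1.24.*
    unidecode==1.3.*
    gradio==3.35.*
    beautifulsoup4==4.12.*
    scikit-learn==1.2.*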
