Spaces:
Sleeping
Sleeping
File size: 4,793 Bytes
77f904f 69a88fc e024b69 ba5f8d5 84d8e35 f8f4b5f b71cd64 d79611d 75dc5f6 b71cd64 c4db18d f8f4b5f 3c7207a 22da72a af31e8a 3c7207a 192d8ff 3c7207a b71cd64 d6a49a1 b71cd64 9ee5788 d6a49a1 84d8e35 d6a49a1 84d8e35 d6a49a1 84d8e35 d6a49a1 84d8e35 d6a49a1 84d8e35 d6a49a1 84d8e35 d6a49a1 b71cd64 d6a49a1 1548124 b71cd64 d6a49a1 3c7207a b71cd64 d6a49a1 22da72a 52f1084 d6a49a1 22da72a 7e83a81 d6a49a1 a42e0f5 fd5cbde d6a49a1 b71cd64 7bf92fa 192d8ff f8f4b5f 6d50c62 af31e8a 22da72a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 |
import joblib
import streamlit as st
import json
import requests
from bs4 import BeautifulSoup
# load all the models and vectorizer (global vocabulary)
# Seq_model = load_model("LSTM.h5") # Sequential
# NOTE: these loads run once at module import (Streamlit re-runs the script per
# interaction but caches module state per session); the .joblib files must sit
# next to this script in the working directory.
SVM_model = joblib.load("SVM_Linear_Kernel.joblib") # SVM classifier (linear kernel)
logistic_model = joblib.load("Logistic_Model.joblib") # Logistic-regression classifier
vectorizer = joblib.load("vectorizer.joblib") # shared text vectorizer (global vocabulary, used for Logistic, SVC)
# tokenizer = joblib.load("tokenizer.joblib") # used for LSTM
# Decode label function
# {'business': 0, 'entertainment': 1, 'health': 2, 'politics': 3, 'sport': 4}
def decodedLabel(input_number):
    """Map a numeric class label to its human-readable category name.

    Returns None for any number outside the known label set.
    """
    print('receive label encoded', input_number)
    # label encoding fixed at training time:
    # {'business': 0, 'entertainment': 1, 'health': 2, 'politics': 3, 'sport': 4}
    label_names = {
        0: 'Business',
        1: 'Entertainment',
        2: 'Health',
        3: 'Politics',
        4: 'Sport',
    }
    decoded = label_names.get(input_number)
    print('decoded result', decoded)
    return decoded
# Web Crawler function
# Web Crawler function
def crawURL(url):
    """Fetch a CNN article page and return the concatenated body text.

    Parameters:
        url: absolute URL of a CNN article.

    Returns:
        The article body text as a single string, or None if the page
        could not be fetched or parsed (missing metadata raises inside
        the try and is caught).
    """
    try:
        print(f"Crawling page: {url}")
        # Single fetch with a timeout so a stalled server cannot hang the app.
        # (The original code fetched the URL twice and scraped sitemap links
        # into an unused variable — removed as dead code.)
        page_response = requests.get(url, timeout=30)
        page_content = page_response.content
        soup = BeautifulSoup(page_content, 'html.parser')
        # Extract article metadata; these raise AttributeError when a tag is
        # missing, which deliberately routes non-article pages to the except
        # branch below. Only full_text is returned to the caller.
        author = soup.find("meta", {"name": "author"}).attrs['content'].strip()
        date_published = soup.find("meta", {"property": "article:published_time"}).attrs['content'].strip()
        article_section = soup.find("meta", {"name": "meta-section"}).attrs['content']
        url = soup.find("meta", {"property": "og:url"}).attrs['content']
        headline = soup.find("h1", {"data-editable": "headlineText"}).text.strip()
        description = soup.find("meta", {"name": "description"}).attrs['content'].strip()
        keywords = soup.find("meta", {"name": "keywords"}).attrs['content'].strip()
        # The article body lives under itemprop="articleBody" as a series of
        # <p class="paragraph inline-placeholder"> tags.
        text = soup.find(itemprop="articleBody")
        paragraphs = text.find_all('p', class_="paragraph inline-placeholder")
        paragraph_texts = [paragraph.text.strip() for paragraph in paragraphs]
        # NOTE(review): paragraphs are joined with no separator, fusing the
        # last word of one paragraph to the first of the next — kept as-is to
        # preserve the exact text the models were evaluated on; confirm before
        # changing to ' '.join.
        full_text = ''.join(paragraph_texts)
        return full_text
    except Exception as e:
        print(f"Failed to crawl page: {url}, Error: {str(e)}")
        # Bug fix: the original returned the undefined name `null` (NameError);
        # Python's null value is None.
        return None
# Predict for text category using Models
def process_api(text):
    """Classify article text with both loaded models.

    Parameters:
        text: raw article body text.

    Returns:
        dict with the decoded category predicted by each model plus the
        original text under 'Article_Content'.
    """
    # Vectorize the text data using the shared training vocabulary
    processed_text = vectorizer.transform([text])
    # sequence = tokenizer.texts_to_sequences([text])
    # padded_sequence = pad_sequences(sequence, maxlen=1000, padding='post')
    # Get the predicted result from models
    Logistic_Predicted = logistic_model.predict(processed_text).tolist() # Logistic Model
    SVM_Predicted = SVM_model.predict(processed_text).tolist() # SVC Model
    # Seq_Predicted = Seq_model.predict(padded_sequence)
    # predicted_label_index = np.argmax(Seq_Predicted)
    # Decode once and reuse for both the debug logs and the response
    # (the original decoded each label twice).
    logistic_label = decodedLabel(int(Logistic_Predicted[0]))
    svm_label = decodedLabel(int(SVM_Predicted[0]))
    # ----------- Debug Logs -----------
    print('Logistic', int(Logistic_Predicted[0]), logistic_label)
    print('SVM', int(SVM_Predicted[0]), svm_label)
    return {
        'Logistic_Predicted': logistic_label,
        'SVM_Predicted': svm_label,
        'Article_Content': text
    }
# Using Model to handle and return Category Route
def categorize(url):
    """Crawl the given URL and classify its article content.

    Returns the prediction dict from process_api, or on any failure a
    dict of the form {"error_message": <string>}.
    """
    try:
        article_content = crawURL(url)
        result = process_api(article_content)
        return result
    except Exception as error:
        # Bug fix: always return a string. The original returned the raw
        # exception object when no `.message` attribute existed (the common
        # case in Python 3), which is not JSON-serializable for st.json.
        message = error.message if hasattr(error, 'message') else str(error)
        return {"error_message": message}
# Main App
url = st.text_input("enter your CNN's URL here")
if url:
    result = categorize(url)
    if "error_message" in result:
        # Bug fix: surface failures instead of silently rendering None in the
        # textarea and JSON widgets.
        st.error(result["error_message"])
    else:
        article_content = result.get('Article_Content')
        st.text_area("Article Content", value=article_content, height=400) # render the article content as textarea element
        st.json({
            "Logistic": result.get("Logistic_Predicted"),
            "SVC": result.get("SVM_Predicted")
        })