Spaces:

MINHCT
/

Classification

Sleeping

File size: 5,781 Bytes

77f904f
69a88fc
e024b69
ba5f8d5
 
84d8e35
f8f4b5f
b71cd64
d79611d
75dc5f6
b71cd64
c4db18d
f8f4b5f
3c7207a
 
22da72a
af31e8a
3c7207a
 
 
 
 
 
 
192d8ff
3c7207a
 
 
b71cd64
d6a49a1
b71cd64
9ee5788
d6a49a1
 
 
 
 
 
 
84d8e35
 
 
 
 
d6a49a1
84d8e35
 
d6a49a1
84d8e35
 
 
 
 
 
 
 
 
 
 
d6a49a1
84d8e35
 
d6a49a1
84d8e35
 
 
d6a49a1
84d8e35
 
 
 
 
 
 
d6a49a1
b71cd64
d6a49a1
 
 
1548124
 
b71cd64
d6a49a1
3c7207a
 
b71cd64
d6a49a1
22da72a
 
 
 
 
 
52f1084
d6a49a1
22da72a
 
7e83a81
d6a49a1
 
 
 
 
 
 
 
a42e0f5
fd5cbde
 
 
 
d6a49a1
b71cd64
4604f63
 
 
 
 
 
 
 
 
 
 
192d8ff
f8f4b5f
6d50c62
af31e8a
 
4604f63
22da72a
 
 
4604f63

import joblib
import streamlit as st
import json
import requests
from bs4 import BeautifulSoup

# Load the trained classifiers and the shared vectorizer at module level so the
# functions below can use them as globals.
# NOTE(review): joblib.load unpickles these files, so they must come from a
# trusted source.
# Seq_model = load_model("LSTM.h5") # Sequential (currently disabled)
SVM_model = joblib.load("SVM_Linear_Kernel.joblib") # SVM
logistic_model = joblib.load("Logistic_Model.joblib") # Logistic
vectorizer = joblib.load("vectorizer.joblib") # global vocabulary (used for Logistic, SVC)
# tokenizer = joblib.load("tokenizer.joblib") # used for LSTM (currently disabled)

# Decode label function
# {'business': 0, 'entertainment': 1, 'health': 2, 'politics': 3, 'sport': 4}
def decodedLabel(input_number):
    """Translate an encoded class index into its display name.

    The index-to-name table is 0 -> Business, 1 -> Entertainment,
    2 -> Health, 3 -> Politics, 4 -> Sport.

    Args:
        input_number: Encoded label to look up.

    Returns:
        The matching category name, or None when the index is not in the
        table.
    """
    print('receive label encoded', input_number)
    label_names = ('Business', 'Entertainment', 'Health', 'Politics', 'Sport')
    # Position in the tuple is the encoded label.
    lookup = dict(enumerate(label_names))
    decoded = lookup.get(input_number)  # Ex: Health
    print('decoded result', decoded)
    return decoded

# Web Crawler function
def crawURL(url):
    """Fetch an article page and return the text of its body paragraphs.

    The page is downloaded once and parsed with BeautifulSoup. The article
    text is taken from the element with itemprop="articleBody", using only
    its <p> tags with class "paragraph inline-placeholder".

    Args:
        url: Address of the article page to download.

    Returns:
        The stripped paragraph texts concatenated into one string, or None
        when the page cannot be fetched or has no article body element.
    """
    print(f"Crawling page: {url}")
    try:
        # A timeout keeps an unresponsive server from hanging the app.
        page_response = requests.get(url, timeout=30)
        # Treat HTTP error statuses as failures instead of parsing an error page.
        page_response.raise_for_status()
    except requests.RequestException as e:
        print(f"Failed to crawl page: {url}, Error: {str(e)}")
        return None

    soup = BeautifulSoup(page_response.content, 'html.parser')

    article_body = soup.find(itemprop="articleBody")
    if article_body is None:
        print(f"Failed to crawl page: {url}, Error: no articleBody element found")
        return None

    # Find all <p> tags with class "paragraph inline-placeholder"
    paragraphs = article_body.find_all('p', class_="paragraph inline-placeholder")

    # Join the text content of all paragraphs into a single string
    return ''.join(paragraph.text.strip() for paragraph in paragraphs)

# Predict for text category using Models
def process_api(text):
    """Predict the category of an article with the Logistic and SVM models.

    The text is transformed with the module-level ``vectorizer`` and the
    result is passed to ``logistic_model`` and ``SVM_model``. The first
    prediction of each model is converted to an int and decoded with
    ``decodedLabel``.

    Args:
        text: Article content to classify.

    Returns:
        A dict with keys 'Logistic_Predicted' and 'SVM_Predicted' (decoded
        category names) and 'Article_Content' (the input text).

    Raises:
        ValueError: If ``text`` is None, i.e. there is no content to classify.
    """
    if text is None:
        # Fail with a clear message instead of an opaque vectorizer error.
        raise ValueError("No article content to classify")

    # Vectorize the text data
    processed_text = vectorizer.transform([text])

    # Take the first (only) prediction from each model as an int label.
    logistic_index = int(logistic_model.predict(processed_text).tolist()[0])
    svm_index = int(SVM_model.predict(processed_text).tolist()[0])

    # Decode each label once and reuse it for both the debug log and the result.
    logistic_label = decodedLabel(logistic_index)
    svm_label = decodedLabel(svm_index)

    # ----------- Debug Logs -----------
    print('Logistic', logistic_index, logistic_label)
    print('SVM', svm_index, svm_label)

    return {
        'Logistic_Predicted': logistic_label,
        'SVM_Predicted': svm_label,
        'Article_Content': text,
    }

# Using Model to handle and return Category Route
def categorize(url):
    """Crawl an article URL and classify its content.

    Args:
        url: Address of the article to crawl and classify.

    Returns:
        The dict produced by ``process_api`` on success. If anything raises,
        a dict with the single key 'error_message' holding the exception's
        ``message`` attribute when it has one, otherwise its string form.
    """
    try:
        article_content = crawURL(url)
        return process_api(article_content)
    except Exception as error:
        # Return text rather than the exception object so the result can be
        # displayed or serialized.
        if hasattr(error, 'message'):
            return {"error_message": error.message}
        return {"error_message": str(error)}

# Main App
with st.expander("See explanation"): # Explain to user why this project is only worked for CNN domain
    st.write(
        '''
            This project works best with CNN articles right now. 
            Our web crawler is like a special tool for CNN's website. 
            It can't quite understand other websites because they're built differently
        '''
    )

url = st.text_input("Paste your CNN Article's URL Here")


if url:
    result = categorize(url)
    error_message = result.get("error_message")
    if error_message is not None:
        # categorize() reports failures through the 'error_message' key; show
        # it instead of rendering an empty article and null predictions.
        st.error(f"Could not categorize this article: {error_message}")
    else:
        article_content = result.get('Article_Content')
        st.text_area("Article Content", value=article_content, height=400) # render the article content as textarea element
        st.divider()  # 👈 Draws a horizontal rule
        st.json({
            "Logistic": result.get("Logistic_Predicted"),
            "SVC": result.get("SVM_Predicted")
        })
st.divider()  # 👈 Draws a horizontal rule

# Training-set categories and the number of articles in each one
categories = ["Sport", "Health", "Entertainment", "Politics", "Business"]
counts = [5638, 4547, 2658, 2461, 1362]

# Pair each category with its count once; the chart and the list below share it
category_counts = dict(zip(categories, counts))

# Bar chart of the distribution
st.bar_chart(data=category_counts)

# Heading for the distribution section
st.title("Training Data Category Distribution")

# Per-category breakdown as a text list
st.write("Here's a breakdown of the number of articles in each category:")
for category, count in category_counts.items():
    st.write(f"- {category}: {count}")