import joblib  # load serialized scikit-learn artifacts
import json  # load JSON files
import requests  # fetch article pages over HTTP
from bs4 import BeautifulSoup  # parse the fetched HTML
from sklearn.feature_extraction.text import CountVectorizer  # convert text to bag-of-words (BOW) features
from tensorflow.keras.preprocessing.text import Tokenizer  # tokenize documents into token sequences (sequence model)
from tensorflow.keras.preprocessing.sequence import pad_sequences  # pad all sequences to the same length (sequence model)
from tensorflow.keras.models import load_model  # load a pre-trained Keras model saved in HDF5 format
import numpy as np  # scientific computing in Python
import streamlit as st  # web UI

# from . import SVM_Linear_Model
from . import Logistic_Model
from . import vectorizer
# from . import tokenizer

# load all the models and vectorizer (global vocabulary)
# Seq_model = load_model('./LSTM.h5') # Sequential
# SVM_Linear_model = joblib.load(SVM_Linear_Model) # SVM
logistic_model = joblib.load(Logistic_Model) # Logistic
vectorizer = joblib.load(vectorizer) # global vocabulary
# tokenizer = joblib.load(tokenizer)
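
# For reference, a minimal sketch of how artifacts like these could be produced
# (assumed workflow; names, data, and hyperparameters below are illustrative and
# not taken from this repository):
#
#   from sklearn.feature_extraction.text import CountVectorizer
#   from sklearn.linear_model import LogisticRegression
#   import joblib
#
#   vec = CountVectorizer()
#   X = vec.fit_transform(train_texts)                 # train_texts: list of article strings
#   clf = LogisticRegression(max_iter=1000).fit(X, train_labels)
#   joblib.dump(clf, 'Logistic_Model')                 # loaded above as logistic_model
#   joblib.dump(vec, 'vectorizer')                     # loaded above as the global vocabulary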

def crawURL(url):
    """Fetch a CNN article page and return the full text of its body."""
    print(f"Crawling page: {url}")

    # # Left over from sitemap crawling (sitemap_url is not defined in this module):
    # # fetch the sitemap, parse it, and collect the article URLs it links to.
    # response = requests.get(sitemap_url)
    # soup = BeautifulSoup(response.content, 'html.parser')
    # urls = [span.a['href'] for span in soup.find_all('span', class_='sitemap-link') if span.a]

    # Crawl the page and extract data
    try:
        # Fetch page content
        page_response = requests.get(url)
        page_content = page_response.content

        # Parse page content with BeautifulSoup
        soup = BeautifulSoup(page_content, 'html.parser')

        # Extract the metadata we need from the page
        author = soup.find("meta", {"name": "author"}).attrs['content'].strip()
        date_published = soup.find("meta", {"property": "article:published_time"}).attrs['content'].strip()
        article_section = soup.find("meta", {"name": "meta-section"}).attrs['content']
        url = soup.find("meta", {"property": "og:url"}).attrs['content']
        headline = soup.find("h1", {"data-editable": "headlineText"}).text.strip()
        description = soup.find("meta", {"name": "description"}).attrs['content'].strip()
        keywords = soup.find("meta", {"name": "keywords"}).attrs['content'].strip()

        # Locate the article body and collect its paragraphs
        text = soup.find(itemprop="articleBody")
        paragraphs = text.find_all('p', class_="paragraph inline-placeholder")

        # Extract the text content of each paragraph
        paragraph_texts = [paragraph.text.strip() for paragraph in paragraphs]

        # Join the paragraphs into a single string, keeping word boundaries
        full_text = ' '.join(paragraph_texts)
        return full_text

    except Exception as e:
        print(f"Failed to crawl page: {url}, Error: {str(e)}")
        return None
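
# Note: the selectors above (the og/meta tags, the data-editable="headlineText" <h1>,
# and the "paragraph inline-placeholder" class) assume CNN's article markup at the
# time this was written; if CNN changes its page layout, crawURL fails inside the
# try block and returns None.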

def process_api(text):
    """Vectorize the article text and return the models' predictions."""
    # Vectorize the text data
    processed_text = vectorizer.transform([text])
    # sequence = tokenizer.texts_to_sequences([text])
    # padded_sequence = pad_sequences(sequence, maxlen=1000, padding='post')
    # Get the predicted result from models
    # Seq_Predicted = Seq_model.predict(padded_sequence)
    # SVM_Predicted = SVM_model.predict(processed_text).tolist()
    Logistic_Predicted = logistic_model.predict(processed_text).tolist()

    # predicted_label_index = np.argmax(Seq_Predicted)
    return {
            'Article_Content': text,
            # 'SVM_Predicted': int(SVM_Predicted[0]),
            'Logistic_Predicted': int(Logistic_Predicted[0])
        }
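
# Illustrative only: for an input string, process_api returns a dict of the form
#   {'Article_Content': <the input text>, 'Logistic_Predicted': <predicted class index>}
# The mapping from class index to category name is not defined in this file.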

# Use the models to categorize the article behind a URL
def categorize(url):
    try:
        article_content = crawURL(url)
        result = process_api(article_content)
        return result
    except Exception:
        return "No text found in the response body"

url = st.text_input("Enter a CNN article URL here")
# Test
x = st.slider('Select a value')
st.write(x, 'squared is', x * x)

if url:
    result = categorize(url)
    st.json(result)
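
# Note: this module uses relative imports (from . import ...), so it is assumed to be
# run as a page of the surrounding Streamlit package (e.g. `streamlit run <entry_script>.py`
# from the project root) rather than executed directly as a standalone script.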