import joblib
import requests
import streamlit as st
from bs4 import BeautifulSoup
#from . import SVM_Linear_Model
#import Logistic_Model
#from . import vectorizer
# from . import tokenizer

# load all the models and vectorizer (global vocabulary)
# Seq_model = load_model('./LSTM.h5') # Sequential
# SVM_Linear_model = joblib.load(SVM_Linear_Model) # SVM
logistic_model = joblib.load("Logistic_Model.joblib") # Logistic
vectorizer = joblib.load("vectorizer.joblib") # global vocabulary
# tokenizer = joblib.load(tokenizer)
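
# --- Hypothetical training sketch (assumption, not part of the original app) ---
# The two joblib artifacts loaded above are assumed to come from a scikit-learn
# TfidfVectorizer + LogisticRegression pipeline; the helper below only illustrates
# how such files could be produced and is never called by this app.
def _train_and_save_models_sketch(texts, labels):
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model import LogisticRegression

    # Fit the global vocabulary on the training corpus
    tfidf = TfidfVectorizer()
    X = tfidf.fit_transform(texts)

    # Train the category classifier on the vectorized articles
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X, labels)

    # Persist both artifacts with joblib, matching the filenames loaded above
    joblib.dump(clf, "Logistic_Model.joblib")
    joblib.dump(tfidf, "vectorizer.joblib")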

def crawURL(url):
    # Crawl a single CNN article page and extract its body text
    try:
        print(f"Crawling page: {url}")
        # Fetch page content
        page_response = requests.get(url)
        page_content = page_response.content

        # Parse page content with BeautifulSoup
        soup = BeautifulSoup(page_content, 'html.parser')

        # Extract data you need from the page
        author = soup.find("meta", {"name": "author"}).attrs['content'].strip()
        date_published = soup.find("meta", {"property": "article:published_time"}).attrs['content'].strip()
        article_section = soup.find("meta", {"name": "meta-section"}).attrs['content']
        url = soup.find("meta", {"property": "og:url"}).attrs['content']
        headline = soup.find("h1", {"data-editable": "headlineText"}).text.strip()
        description = soup.find("meta", {"name": "description"}).attrs['content'].strip()
        keywords = soup.find("meta", {"name": "keywords"}).attrs['content'].strip()
        text = soup.find(itemprop="articleBody")
        # Find all <p> tags with class "paragraph inline-placeholder"
        paragraphs = text.find_all('p', class_="paragraph inline-placeholder")

        # Initialize an empty list to store the text content of each paragraph
        paragraph_texts = []

        # Iterate over each <p> tag and extract its text content
        for paragraph in paragraphs:
            paragraph_texts.append(paragraph.text.strip())

        # Join the text content of all paragraphs into a single string,
        # separating paragraphs with a space so words do not run together
        full_text = ' '.join(paragraph_texts)
        return full_text
        
    except Exception as e:
        print(f"Failed to crawl page: {url}, Error: {str(e)}")
        return None

def process_api(text):
    # Vectorize the text data
    processed_text = vectorizer.transform([text])
    # sequence = tokenizer.texts_to_sequences([text])
    # padded_sequence = pad_sequences(sequence, maxlen=1000, padding='post')
    # Get the predicted result from models
    # Seq_Predicted = Seq_model.predict(padded_sequence)
    # SVM_Predicted = SVM_model.predict(processed_text).tolist()
    Logistic_Predicted = logistic_model.predict(processed_text).tolist()

    # predicted_label_index = np.argmax(Seq_Predicted)
    return {
            'Article_Content': text,
            # 'SVM_Predicted': int(SVM_Predicted[0]),
            'Logistic_Predicted': int(Logistic_Predicted[0])
        }
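
# Illustrative shape of the payload returned by process_api (placeholder values;
# assumes the logistic model outputs an integer class index):
#   {"Article_Content": "<full article text>", "Logistic_Predicted": 3}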

# Using Model to handle and return Category Route
def categorize(url):
    print("outside try", url)
    try:
        print("inside try", url)
        article_content = crawURL(url)
        print("inside try article_content", article_content)
        result = process_api(article_content)
        print("inside try result", result)
        return result
    except Exception as error:
        if hasattr(error, 'message'):
            return {"error_message": error.message}
        else:
            return {"error_message": error}
        

url = st.text_input("Enter a CNN article URL here")

if url:
    result = categorize(url)
    st.json(result)
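
# To launch the app locally (assuming this file is saved as app.py):
#   streamlit run app.py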