import joblib
import requests
import streamlit as st
from bs4 import BeautifulSoup

# from . import SVM_Linear_Model
# from . import vectorizer
# from . import tokenizer

# Load the serialized models and the vectorizer (global vocabulary)
# Seq_model = load_model('./LSTM.h5')                  # Sequential LSTM (disabled)
# SVM_Linear_model = joblib.load(SVM_Linear_Model)     # SVM (disabled)
logistic_model = joblib.load("Logistic_Model.joblib")  # Logistic Regression classifier
vectorizer = joblib.load("vectorizer.joblib")          # fitted vectorizer (global vocabulary)
# tokenizer = joblib.load("tokenizer.joblib")          # tokenizer for the LSTM (disabled)
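
# A minimal sketch (an assumption, not part of this app) of how the two
# artifacts loaded above could have been produced with scikit-learn;
# `train_texts` and `train_labels` are hypothetical placeholders for the
# labelled training data, and the vectorizer type is assumed to be TF-IDF.
def _train_and_save_artifacts(train_texts, train_labels):
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model import LogisticRegression

    vec = TfidfVectorizer()                   # builds the global vocabulary
    X_train = vec.fit_transform(train_texts)
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train, train_labels)

    # Same file names the app loads at startup
    joblib.dump(clf, "Logistic_Model.joblib")
    joblib.dump(vec, "vectorizer.joblib")
    return clf, vec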

def crawURL(url):
    """Fetch a single CNN article page and return its concatenated body text."""
    try:
        print(f"Crawling page: {url}")
        # Fetch the page content
        page_response = requests.get(url)
        page_content = page_response.content

        # Parse the page content with BeautifulSoup
        soup = BeautifulSoup(page_content, 'html.parser')

        # Extract article metadata (currently collected but not returned)
        author = soup.find("meta", {"name": "author"}).attrs['content'].strip()
        date_published = soup.find("meta", {"property": "article:published_time"}).attrs['content'].strip()
        article_section = soup.find("meta", {"name": "meta-section"}).attrs['content']
        url = soup.find("meta", {"property": "og:url"}).attrs['content']
        headline = soup.find("h1", {"data-editable": "headlineText"}).text.strip()
        description = soup.find("meta", {"name": "description"}).attrs['content'].strip()
        keywords = soup.find("meta", {"name": "keywords"}).attrs['content'].strip()

        # Locate the article body and collect its paragraphs
        body = soup.find(itemprop="articleBody")
        paragraphs = body.find_all('p', class_="paragraph inline-placeholder")

        # Extract the text content of each paragraph
        paragraph_texts = [paragraph.text.strip() for paragraph in paragraphs]

        # Join the paragraphs into a single string (space-separated so words don't run together)
        full_text = ' '.join(paragraph_texts)
        return full_text

    except Exception as e:
        print(f"Failed to crawl page: {url}, Error: {str(e)}")
        return None
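
# Note: the chained `.attrs['content']` lookups above raise AttributeError when a
# meta tag is missing, which the except block then reports as a crawl failure.
# A hedged sketch of a more forgiving lookup; the helper name `safe_meta` is
# hypothetical and is not used by the original code.
def safe_meta(soup, attrs, default=""):
    """Return the stripped content attribute of a matching <meta> tag, or a default."""
    tag = soup.find("meta", attrs)
    return tag.attrs.get('content', default).strip() if tag else default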

def process_api(text):
    # Vectorize the text data
    processed_text = vectorizer.transform([text])
    # sequence = tokenizer.texts_to_sequences([text])
    # padded_sequence = pad_sequences(sequence, maxlen=1000, padding='post')
    # Get the predicted result from models
    # Seq_Predicted = Seq_model.predict(padded_sequence)
    # SVM_Predicted = SVM_model.predict(processed_text).tolist()
    Logistic_Predicted = logistic_model.predict(processed_text).tolist()

    # predicted_label_index = np.argmax(Seq_Predicted)
    return {
            'Article_Content': text,
            # 'SVM_Predicted': int(SVM_Predicted[0]),
            'Logistic_Predicted': int(Logistic_Predicted[0])
        }
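
# Example of the returned shape (the integer is whatever label encoding the
# Logistic Regression model was trained with; the value 3 below is illustrative):
#   >>> process_api("Lawmakers passed the bill on Tuesday ...")
#   {'Article_Content': 'Lawmakers passed the bill on Tuesday ...', 'Logistic_Predicted': 3}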

# Use the model to categorize the article at the given URL
def categorize(url):
    try:
        article_content = crawURL(url)
        result = process_api(article_content)
        return result
    except Exception:
        return "No text found in the response body"

url = st.text_input("Enter a CNN article URL here")
# Test
x = st.slider('Select a value')
st.write(x, 'squared is', x * x)

if url:
    result = categorize(url)
    st.json(result)
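
# Launch the app locally with (assuming this file is saved as app.py; the file
# name is an assumption):
#   streamlit run app.py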