Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -15,6 +15,80 @@ logistic_model = joblib.load("Logistic_Model.joblib") # Logistic
|
|
# Serialized preprocessing artifacts produced at training time and loaded once at startup.
vectorizer = joblib.load('vectorizer.joblib')  # global vocabulary
# NOTE(review): presumably a Keras text tokenizer used by the (disabled) sequence model — confirm against training code.
tokenizer = joblib.load('tokenizer.joblib')
17 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Test widget: echoes the selected value and its square.
value = st.slider('Select a value')
st.write(value, 'squared is', value * value)
|
|
# Serialized preprocessing artifacts produced at training time and loaded once at startup.
vectorizer = joblib.load('vectorizer.joblib')  # global vocabulary
# NOTE(review): presumably a Keras text tokenizer used by the (disabled) sequence model — confirm against training code.
tokenizer = joblib.load('tokenizer.joblib')
def crawURL(url):
    """Fetch a CNN article page and return the text of its body paragraphs.

    Args:
        url: Absolute URL of the article page to crawl.

    Returns:
        The article body as a single string, or None when the page cannot
        be fetched or its body cannot be located.
    """
    try:
        print(f"Crawling page: {url}")
        # Fetch page content; a timeout keeps a dead host from hanging the app.
        # (The original also fetched an undefined `sitemap_url` before the
        # try block — a guaranteed NameError — and computed an unused `urls`
        # list; that dead code is removed.)
        page_response = requests.get(url, timeout=30)

        # Parse page content with BeautifulSoup.
        soup = BeautifulSoup(page_response.content, 'html.parser')

        # The article body is marked itemprop="articleBody"; its visible copy
        # lives in <p class="paragraph inline-placeholder"> children.
        body = soup.find(itemprop="articleBody")
        paragraphs = body.find_all('p', class_="paragraph inline-placeholder")

        # Join with a space so words at paragraph boundaries are not fused
        # (the original used '', which merged the last and first words of
        # adjacent paragraphs before classification).
        full_text = ' '.join(p.text.strip() for p in paragraphs)
        return full_text

    except Exception as e:
        print(f"Failed to crawl page: {url}, Error: {str(e)}")
        return None  # was `return null` — `null` is a NameError in Python
def process_api(text):
    """Run the trained classifiers over *text*.

    Args:
        text: Raw article text to classify.

    Returns:
        A dict with the original content and the SVM / logistic-regression
        predictions as ints.
    """
    # Bag-of-words features for the sklearn models.
    features = vectorizer.transform([text])
    # Sequence path kept for parity with the (currently disabled) Keras model.
    encoded = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(encoded, maxlen=1000, padding='post')

    # Get the predicted result from the models.
    # Seq_Predicted = Seq_model.predict(padded)          (disabled)
    # predicted_label_index = np.argmax(Seq_Predicted)   (disabled)
    svm_pred = SVM_model.predict(features).tolist()
    logistic_pred = logistic_model.predict(features).tolist()

    return {
        'Article_Content': text,
        'SVM_Predicted': int(svm_pred[0]),
        'Logistic_Predicted': int(logistic_pred[0])
    }
# Using Model to handle and return Category Route
def categorize(url):
    """Crawl *url* and classify the article content.

    Args:
        url: Article URL supplied by the user.

    Returns:
        The prediction dict from process_api on success, otherwise the
        error-message string callers already expect.
    """
    try:
        article_content = crawURL(url)
        # crawURL returns None on failure; report that instead of letting
        # process_api crash on a None input.
        if article_content is None:
            return "No text found in the response body"
        return process_api(article_content)
    except Exception as e:
        # Narrowed from a bare `except:` (which also swallowed
        # KeyboardInterrupt/SystemExit and hid the real error); log it.
        print(f"Failed to categorize {url}: {e}")
        return "No text found in the response body"
url = st.text_input("enter your CNN's URL here")
# Fix: the original read the URL but never used it — the classifier was
# unreachable from the UI. Run it once a non-empty URL is entered.
if url:
    st.write(categorize(url))

# Test widget: echoes the selected value and its square.
x = st.slider('Select a value')
st.write(x, 'squared is', x * x)
|