Update app.py
app.py CHANGED
@@ -8,12 +8,17 @@ from tensorflow.keras.models import load_model # load a pre-trained Keras model
 import numpy as np # scientific computing in Python
 import streamlit as st
 
+from . import SVM_Linear_Model
+from . import Logistic_Model
+from . import vectorizer
+from . import tokenizer
+
 # load all the models and vectorizer (global vocabulary)
 # Seq_model = load_model('./LSTM.h5') # Sequential
-SVM_Linear_model = joblib.load(
-logistic_model = joblib.load(
-vectorizer = joblib.load(
-tokenizer = joblib.load(
+SVM_Linear_model = joblib.load(SVM_Linear_Model) # SVM
+logistic_model = joblib.load(Logistic_Model) # Logistic
+vectorizer = joblib.load(vectorizer) # global vocabulary
+tokenizer = joblib.load(tokenizer)
 
 def crawURL(url):
     print(f"Crawling page: {url}")
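A note on the model-loading hunk above: joblib.load expects a filename or an open file object, and relative imports such as "from . import SVM_Linear_Model" only resolve when app.py runs as part of a package, which a Streamlit entry point usually does not. A minimal sketch of the more conventional pattern follows; the .pkl file names and BASE_DIR are assumptions for illustration, not taken from the commit.

# Sketch: load the serialized artifacts from explicit paths next to app.py.
# The file names below are assumed, not part of the committed code.
from pathlib import Path

import joblib

BASE_DIR = Path(__file__).parent

SVM_Linear_model = joblib.load(BASE_DIR / "SVM_Linear_Model.pkl")  # SVM
logistic_model = joblib.load(BASE_DIR / "Logistic_Model.pkl")      # Logistic Regression
vectorizer = joblib.load(BASE_DIR / "vectorizer.pkl")              # global vocabulary
tokenizer = joblib.load(BASE_DIR / "tokenizer.pkl")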
@@ -26,41 +31,41 @@ def crawURL(url):
     urls = [span.a['href'] for span in soup.find_all('span', class_='sitemap-link') if span.a]
 
     # Crawl pages and extract data
-
-
-
-
-
-
-    # Parse page content with BeautifulSoup
-    soup = BeautifulSoup(page_content, 'html.parser')
+    try:
+        print(f"Crawling page: {url}")
+        # Fetch page content
+        page_response = requests.get(url)
+        page_content = page_response.content
 
-
-
-    date_published = soup.find("meta", {"property": "article:published_time"}).attrs['content'].strip()
-    article_section = soup.find("meta", {"name": "meta-section"}).attrs['content']
-    url = soup.find("meta", {"property": "og:url"}).attrs['content']
-    headline = soup.find("h1", {"data-editable": "headlineText"}).text.strip()
-    description = soup.find("meta", {"name": "description"}).attrs['content'].strip()
-    keywords = soup.find("meta", {"name": "keywords"}).attrs['content'].strip()
-    text = soup.find(itemprop="articleBody")
-    # Find all <p> tags with class "paragraph inline-placeholder"
-    paragraphs = text.find_all('p', class_="paragraph inline-placeholder")
+        # Parse page content with BeautifulSoup
+        soup = BeautifulSoup(page_content, 'html.parser')
 
-
-
+        # Extract data you need from the page
+        author = soup.find("meta", {"name": "author"}).attrs['content'].strip()
+        date_published = soup.find("meta", {"property": "article:published_time"}).attrs['content'].strip()
+        article_section = soup.find("meta", {"name": "meta-section"}).attrs['content']
+        url = soup.find("meta", {"property": "og:url"}).attrs['content']
+        headline = soup.find("h1", {"data-editable": "headlineText"}).text.strip()
+        description = soup.find("meta", {"name": "description"}).attrs['content'].strip()
+        keywords = soup.find("meta", {"name": "keywords"}).attrs['content'].strip()
+        text = soup.find(itemprop="articleBody")
+        # Find all <p> tags with class "paragraph inline-placeholder"
+        paragraphs = text.find_all('p', class_="paragraph inline-placeholder")
 
-
-
-    paragraph_texts.append(paragraph.text.strip())
+        # Initialize an empty list to store the text content of each paragraph
+        paragraph_texts = []
 
-
-
-
+        # Iterate over each <p> tag and extract its text content
+        for paragraph in paragraphs:
+            paragraph_texts.append(paragraph.text.strip())
 
-
-
-
+        # Join the text content of all paragraphs into a single string
+        full_text = ''.join(paragraph_texts)
+        return full_text
+
+    except Exception as e:
+        print(f"Failed to crawl page: {url}, Error: {str(e)}")
+        return None
 
 def process_api(text):
     # Vectorize the text data
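For context, here is a minimal usage sketch of the updated crawURL: fetch one article and only hand the text to the classifier pipeline when crawling succeeded. The example URL is a placeholder, and the assumption that process_api returns the model's prediction is illustrative, not taken from the commit.

# Sketch only: placeholder URL; process_api is the function defined later in
# app.py, assumed here to return the classifier output for the given text.
article_url = "https://example.com/some-article"
article_text = crawURL(article_url)

if article_text:  # crawURL returns None when fetching or parsing fails
    result = process_api(article_text)
    print(result)
else:
    print(f"Could not retrieve article text from {article_url}")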