Update app.py
app.py CHANGED
@@ -8,12 +8,17 @@ from tensorflow.keras.models import load_model # load a pre-trained Keras model
 import numpy as np # scientific computing in Python
 import streamlit as st
 
+from . import SVM_Linear_Model
+from . import Logistic_Model
+from . import vectorizer
+from . import tokenizer
+
 # load all the models and vectorizer (global vocabulary)
 # Seq_model = load_model('./LSTM.h5') # Sequential
-SVM_Linear_model = joblib.load(
-logistic_model = joblib.load(
-vectorizer = joblib.load(
-tokenizer = joblib.load(
+SVM_Linear_model = joblib.load(SVM_Linear_Model) # SVM
+logistic_model = joblib.load(Logistic_Model) # Logistic
+vectorizer = joblib.load(vectorizer) # global vocabulary
+tokenizer = joblib.load(tokenizer)
 
 def crawURL(url):
     print(f"Crawling page: {url}")
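A note on the model-loading hunk above: joblib.load expects a filename or an open file object, and relative imports such as "from . import SVM_Linear_Model" only resolve when app.py runs as part of a package, which a Streamlit entry point usually does not. A minimal sketch of the more conventional pattern follows; the .pkl file names and BASE_DIR are assumptions for illustration, not taken from the commit.

# Sketch: load the serialized artifacts from explicit paths next to app.py.
# The file names below are assumed, not part of the committed code.
from pathlib import Path

import joblib

BASE_DIR = Path(__file__).parent

SVM_Linear_model = joblib.load(BASE_DIR / "SVM_Linear_Model.pkl")  # SVM
logistic_model = joblib.load(BASE_DIR / "Logistic_Model.pkl")      # Logistic Regression
vectorizer = joblib.load(BASE_DIR / "vectorizer.pkl")              # global vocabulary
tokenizer = joblib.load(BASE_DIR / "tokenizer.pkl")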
@@ -26,41 +31,41 @@ def crawURL(url):
     urls = [span.a['href'] for span in soup.find_all('span', class_='sitemap-link') if span.a]
 
     # Crawl pages and extract data
-
-
-
-
-
-
-    # Parse page content with BeautifulSoup
-    soup = BeautifulSoup(page_content, 'html.parser')
+    try:
+        print(f"Crawling page: {url}")
+        # Fetch page content
+        page_response = requests.get(url)
+        page_content = page_response.content
 
-
-
-    date_published = soup.find("meta", {"property": "article:published_time"}).attrs['content'].strip()
-    article_section = soup.find("meta", {"name": "meta-section"}).attrs['content']
-    url = soup.find("meta", {"property": "og:url"}).attrs['content']
-    headline = soup.find("h1", {"data-editable": "headlineText"}).text.strip()
-    description = soup.find("meta", {"name": "description"}).attrs['content'].strip()
-    keywords = soup.find("meta", {"name": "keywords"}).attrs['content'].strip()
-    text = soup.find(itemprop="articleBody")
-    # Find all <p> tags with class "paragraph inline-placeholder"
-    paragraphs = text.find_all('p', class_="paragraph inline-placeholder")
+        # Parse page content with BeautifulSoup
+        soup = BeautifulSoup(page_content, 'html.parser')
 
-
-
+        # Extract data you need from the page
+        author = soup.find("meta", {"name": "author"}).attrs['content'].strip()
+        date_published = soup.find("meta", {"property": "article:published_time"}).attrs['content'].strip()
+        article_section = soup.find("meta", {"name": "meta-section"}).attrs['content']
+        url = soup.find("meta", {"property": "og:url"}).attrs['content']
+        headline = soup.find("h1", {"data-editable": "headlineText"}).text.strip()
+        description = soup.find("meta", {"name": "description"}).attrs['content'].strip()
+        keywords = soup.find("meta", {"name": "keywords"}).attrs['content'].strip()
+        text = soup.find(itemprop="articleBody")
+        # Find all <p> tags with class "paragraph inline-placeholder"
+        paragraphs = text.find_all('p', class_="paragraph inline-placeholder")
 
-
-
-    paragraph_texts.append(paragraph.text.strip())
+        # Initialize an empty list to store the text content of each paragraph
+        paragraph_texts = []
 
-
-
-
+        # Iterate over each <p> tag and extract its text content
+        for paragraph in paragraphs:
+            paragraph_texts.append(paragraph.text.strip())
 
-
-
-
+        # Join the text content of all paragraphs into a single string
+        full_text = ''.join(paragraph_texts)
+        return full_text
+
+    except Exception as e:
+        print(f"Failed to crawl page: {url}, Error: {str(e)}")
+        return None
 
 def process_api(text):
     # Vectorize the text data
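For context, here is a minimal usage sketch of the updated crawURL: fetch one article and only hand the text to the classifier pipeline when crawling succeeded. The example URL is a placeholder, and the assumption that process_api returns the model's prediction is illustrative, not taken from the commit.

# Sketch only: placeholder URL; process_api is the function defined later in
# app.py, assumed here to return the classifier output for the given text.
article_url = "https://example.com/some-article"
article_text = crawURL(article_url)

if article_text:  # crawURL returns None when fetching or parsing fails
    result = process_api(article_text)
    print(result)
else:
    print(f"Could not retrieve article text from {article_url}")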