import requests # Fetch page content over HTTP
from bs4 import BeautifulSoup # Parse HTML documents
import joblib # Load joblib-serialized models
import json # Load JSON files
from sklearn.feature_extraction.text import CountVectorizer # Convert text to BOW format
from tensorflow.keras.preprocessing.text import Tokenizer # Tokenize text documents into sequences of tokens (Seq model)
from tensorflow.keras.preprocessing.sequence import pad_sequences # Pad all sequences in a dataset to the same length (Seq model)
from tensorflow.keras.models import load_model # Load a pre-trained Keras model saved in HDF5 format
import numpy as np # Scientific computing in Python
import streamlit as st
# from . import SVM_Linear_Model
from . import Logistic_Model
from . import vectorizer
# from . import tokenizer
# load all the models and vectorizer (global vocabulary)
# Seq_model = load_model('./LSTM.h5') # Sequential
# SVM_Linear_model = joblib.load(SVM_Linear_Model) # SVM
logistic_model = joblib.load(Logistic_Model) # Logistic
vectorizer = joblib.load(vectorizer) # global vocabulary
# tokenizer = joblib.load(tokenizer)
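# Note: Logistic_Model and vectorizer imported above are assumed to resolve to
# joblib-serialized artifacts; joblib.load() accepts a filename or an open file object.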
def crawURL(url):
    # Crawl a single CNN article page and extract its body text.
    # Sitemap discovery (disabled here: `sitemap_url` is not defined in this scope):
    # response = requests.get(sitemap_url)
    # soup = BeautifulSoup(response.content, 'html.parser')
    # urls = [span.a['href'] for span in soup.find_all('span', class_='sitemap-link') if span.a]
    # Crawl the page and extract data
    try:
        print(f"Crawling page: {url}")
        # Fetch page content
        page_response = requests.get(url)
        page_content = page_response.content
        # Parse page content with BeautifulSoup
        soup = BeautifulSoup(page_content, 'html.parser')
        # Extract the metadata and article body from the page
        author = soup.find("meta", {"name": "author"}).attrs['content'].strip()
        date_published = soup.find("meta", {"property": "article:published_time"}).attrs['content'].strip()
        article_section = soup.find("meta", {"name": "meta-section"}).attrs['content']
        url = soup.find("meta", {"property": "og:url"}).attrs['content']
        headline = soup.find("h1", {"data-editable": "headlineText"}).text.strip()
        description = soup.find("meta", {"name": "description"}).attrs['content'].strip()
        keywords = soup.find("meta", {"name": "keywords"}).attrs['content'].strip()
        text = soup.find(itemprop="articleBody")
        # Find all <p> tags with class "paragraph inline-placeholder"
        paragraphs = text.find_all('p', class_="paragraph inline-placeholder")
        # Initialize an empty list to store the text content of each paragraph
        paragraph_texts = []
        # Iterate over each <p> tag and extract its text content
        for paragraph in paragraphs:
            paragraph_texts.append(paragraph.text.strip())
        # Join the paragraphs into a single string, separated by spaces
        full_text = ' '.join(paragraph_texts)
        return full_text
    except Exception as e:
        print(f"Failed to crawl page: {url}, Error: {str(e)}")
        return None
def process_api(text):
    # Vectorize the text data with the shared CountVectorizer vocabulary
    processed_text = vectorizer.transform([text])
    # sequence = tokenizer.texts_to_sequences([text])
    # padded_sequence = pad_sequences(sequence, maxlen=1000, padding='post')
    # Get the predicted result from the models
    # Seq_Predicted = Seq_model.predict(padded_sequence)
    # SVM_Predicted = SVM_Linear_model.predict(processed_text).tolist()
    Logistic_Predicted = logistic_model.predict(processed_text).tolist()
    # predicted_label_index = np.argmax(Seq_Predicted)
    return {
        'Article_Content': text,
        # 'SVM_Predicted': int(SVM_Predicted[0]),
        'Logistic_Predicted': int(Logistic_Predicted[0])
    }
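# A sketch (left commented out) of how the disabled Sequential (LSTM) path could be
# wired up, assuming the './LSTM.h5' model and tokenizer joblib artifacts referenced
# in the commented lines above are available:
# Seq_model = load_model('./LSTM.h5')
# tokenizer = joblib.load(tokenizer)
# def process_api_seq(text):
#     sequence = tokenizer.texts_to_sequences([text])
#     padded_sequence = pad_sequences(sequence, maxlen=1000, padding='post')
#     Seq_Predicted = Seq_model.predict(padded_sequence)
#     predicted_label_index = int(np.argmax(Seq_Predicted))
#     return {'Article_Content': text, 'Seq_Predicted': predicted_label_index}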
# Use the model to classify the article and return the category result
def categorize(url):
    try:
        article_content = crawURL(url)
        result = process_api(article_content)
        return result
    except Exception:
        return "No text found in the response body"
url = st.text_input("Enter a CNN article URL here")
# Test widget
x = st.slider('Select a value')
st.write(x, 'squared is', x * x)
if url:
    result = categorize(url)
    st.json(result)
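# Shape of the JSON rendered by st.json for a successfully categorized URL
# (the label value 0 is illustrative, not a real prediction):
# {
#     "Article_Content": "...",
#     "Logistic_Predicted": 0
# }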