import joblib
import requests
import streamlit as st
from bs4 import BeautifulSoup
# from . import SVM_Linear_Model
# import Logistic_Model
# from . import vectorizer
# from . import tokenizer
# load all the models and vectorizer (global vocabulary)
# Seq_model = load_model('./LSTM.h5') # Sequential
# SVM_Linear_model = joblib.load(SVM_Linear_Model) # SVM
logistic_model = joblib.load("Logistic_Model.joblib") # Logistic
vectorizer = joblib.load("vectorizer.joblib") # global vocabulary
# tokenizer = joblib.load(tokenizer)
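
# Minimal sketch of how these two artifacts are presumably produced
# (assumption: a fitted scikit-learn vectorizer and classifier saved with
# joblib; train_texts/train_labels are hypothetical names):
#
#   from sklearn.feature_extraction.text import TfidfVectorizer
#   from sklearn.linear_model import LogisticRegression
#   vec = TfidfVectorizer()
#   X = vec.fit_transform(train_texts)
#   clf = LogisticRegression().fit(X, train_labels)
#   joblib.dump(clf, "Logistic_Model.joblib")
#   joblib.dump(vec, "vectorizer.joblib")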


def crawURL(url):
    """Fetch a single CNN article page and return its body text.

    The selectors below target CNN's article markup (meta tags, the
    itemprop="articleBody" container, and "paragraph inline-placeholder"
    <p> tags) and may break if the site layout changes.
    """
    # Crawl the page and extract its data
    try:
        print(f"Crawling page: {url}")
        # Fetch the page content
        page_response = requests.get(url)
        page_content = page_response.content
        # Parse the page content with BeautifulSoup
        soup = BeautifulSoup(page_content, 'html.parser')
        # Extract the metadata we need from the page (only full_text is used
        # downstream; the other fields are extracted but currently unused)
        author = soup.find("meta", {"name": "author"}).attrs['content'].strip()
        date_published = soup.find("meta", {"property": "article:published_time"}).attrs['content'].strip()
        article_section = soup.find("meta", {"name": "meta-section"}).attrs['content']
        url = soup.find("meta", {"property": "og:url"}).attrs['content']
        headline = soup.find("h1", {"data-editable": "headlineText"}).text.strip()
        description = soup.find("meta", {"name": "description"}).attrs['content'].strip()
        keywords = soup.find("meta", {"name": "keywords"}).attrs['content'].strip()
        text = soup.find(itemprop="articleBody")
        # Collect the text of every <p class="paragraph inline-placeholder">
        paragraphs = text.find_all('p', class_="paragraph inline-placeholder")
        paragraph_texts = [paragraph.text.strip() for paragraph in paragraphs]
        # Join the paragraphs with spaces so words at paragraph boundaries
        # do not run together
        full_text = ' '.join(paragraph_texts)
        return full_text
    except Exception as e:
        print(f"Failed to crawl page: {url}, Error: {str(e)}")
        return None
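
# Example call (hypothetical URL; returns the article body as one string,
# or None if the page cannot be fetched or parsed):
#   crawURL("https://www.cnn.com/2024/01/01/us/example-story/index.html")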


def process_api(text):
    # Vectorize the text data with the fitted global vocabulary
    processed_text = vectorizer.transform([text])
    # sequence = tokenizer.texts_to_sequences([text])
    # padded_sequence = pad_sequences(sequence, maxlen=1000, padding='post')
    # Get the predicted result from the models
    # Seq_Predicted = Seq_model.predict(padded_sequence)
    # SVM_Predicted = SVM_model.predict(processed_text).tolist()
    Logistic_Predicted = logistic_model.predict(processed_text).tolist()
    # predicted_label_index = np.argmax(Seq_Predicted)
    return {
        'Article_Content': text,
        # 'SVM_Predicted': int(SVM_Predicted[0]),
        'Logistic_Predicted': int(Logistic_Predicted[0])
    }
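
# Hypothetical example of the returned payload (the class index depends on
# how the logistic model was trained; 3 here is illustrative only):
#   {'Article_Content': '...article text...', 'Logistic_Predicted': 3}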


# Use the model to categorize the article at a URL (Category route)
def categorize(url):
    try:
        article_content = crawURL(url)
        result = process_api(article_content)
        return result
    except Exception as error:
        # Return the error as a string so the response stays JSON-serializable
        return {"error_message": str(error)}


url = st.text_input("Enter a CNN article URL here")
if url:
    result = categorize(url)
    st.json(result)
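
# To try the app locally (assuming this file is saved as app.py and the two
# .joblib artifacts sit in the working directory):
#   streamlit run app.py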