MINHCT committed on
Commit d6a49a1
1 Parent(s): 4a7c4bb

Update app.py

Files changed (1)
  1. app.py +74 -0
app.py CHANGED
@@ -15,6 +15,80 @@ logistic_model = joblib.load("Logistic_Model.joblib") # Logistic
  vectorizer = joblib.load('vectorizer.joblib') # global vocabulary
  tokenizer = joblib.load('tokenizer.joblib')
 
+ def crawURL(url):
+     # Fetch the article page and extract its body text
+     try:
+         print(f"Crawling page: {url}")
+         # Fetch page content
+         page_response = requests.get(url)
+         page_content = page_response.content
+ 
+         # Parse page content with BeautifulSoup
+         soup = BeautifulSoup(page_content, 'html.parser')
+ 
+         # Extract the metadata we need from the page
+         author = soup.find("meta", {"name": "author"}).attrs['content'].strip()
+         date_published = soup.find("meta", {"property": "article:published_time"}).attrs['content'].strip()
+         article_section = soup.find("meta", {"name": "meta-section"}).attrs['content']
+         url = soup.find("meta", {"property": "og:url"}).attrs['content']
+         headline = soup.find("h1", {"data-editable": "headlineText"}).text.strip()
+         description = soup.find("meta", {"name": "description"}).attrs['content'].strip()
+         keywords = soup.find("meta", {"name": "keywords"}).attrs['content'].strip()
+ 
+         # Find all <p> tags with class "paragraph inline-placeholder" inside the article body
+         text = soup.find(itemprop="articleBody")
+         paragraphs = text.find_all('p', class_="paragraph inline-placeholder")
+ 
+         # Collect the text content of each paragraph
+         paragraph_texts = []
+         for paragraph in paragraphs:
+             paragraph_texts.append(paragraph.text.strip())
+ 
+         # Join the text content of all paragraphs into a single string
+         full_text = ' '.join(paragraph_texts)
+         return full_text
+ 
+     except Exception as e:
+         print(f"Failed to crawl page: {url}, Error: {str(e)}")
+         return None
+ 
+ def process_api(text):
+     # Vectorize the text data
+     processed_text = vectorizer.transform([text])
+     sequence = tokenizer.texts_to_sequences([text])
+     padded_sequence = pad_sequences(sequence, maxlen=1000, padding='post')
+     # Get the predicted result from the models
+     # Seq_Predicted = Seq_model.predict(padded_sequence)
+     SVM_Predicted = SVM_model.predict(processed_text).tolist()
+     Logistic_Predicted = logistic_model.predict(processed_text).tolist()
+ 
+     # predicted_label_index = np.argmax(Seq_Predicted)
+     return {
+         'Article_Content': text,
+         'SVM_Predicted': int(SVM_Predicted[0]),
+         'Logistic_Predicted': int(Logistic_Predicted[0])
+     }
+ 
+ # Use the models to categorize the article at the given URL
+ def categorize(url):
+     try:
+         article_content = crawURL(url)
+         result = process_api(article_content)
+         return result
+     except Exception:
+         return "No text found in the response body"
+ 
+ url = st.text_input("enter your CNN's URL here")
  # Test
  x = st.slider('Select a value')
  st.write(x, 'squared is', x * x)
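
The commit adds a text input for the URL but does not yet call the new categorize helper. Below is a minimal sketch of how the pieces could be connected in the Streamlit app; it assumes the functions and loaded models from app.py are in scope, and the "Categorize" button and the result display are illustrative assumptions, not part of this commit.

# Minimal sketch (not part of this commit): hooking the text input up to categorize().
# Assumes crawURL/process_api/categorize and the loaded models from app.py are in scope.
import streamlit as st

url = st.text_input("enter your CNN's URL here")

if st.button("Categorize") and url:      # "Categorize" button is a hypothetical trigger
    result = categorize(url)
    if isinstance(result, dict):
        st.write("SVM prediction:", result['SVM_Predicted'])
        st.write("Logistic prediction:", result['Logistic_Predicted'])
    else:
        st.error(result)                 # categorize() returns an error string on failure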