m. polinsky committed
Commit
5cab1a3
1 Parent(s): 80b5ef0

Create scrape_sources.py

Files changed (1)
  1. scrape_sources.py +140 -0
scrape_sources.py ADDED
@@ -0,0 +1,140 @@
+ # scrape_sources.py implements the abstract Source object for the NPR and CNN text-only sites.
+ from dataclasses import dataclass
+ from collections import namedtuple
+ from typing import List, Tuple, Any
+ from gazpacho import Soup, get
+ from source import Source, Summary
+ import streamlit as st
+
+ stub = namedtuple('npr_stub', ['link', 'hed', 'entities', 'source'])
+ stub.__doc__ = """
+ A namedtuple to represent an unscraped news article.
+
+ • link is the extension of the article. Added to the source's source_url,
+   it is used to retrieve the full article and data.
+ • hed is the headline ('hed' is journalism jargon, as is 'dek' for 'subheader').
+ • entities is the list of entity names discovered in this headline,
+   each entity representing one cluster the article is in.
+ • source is a reference to the Source object that created the stub.
+ """
+
+
+ @dataclass
+ class NPRLite(Source):
+     """Implementation of abstract Source class that retrieves via webscraping at text.npr.org/1001"""
+
+     # Creates the initial namedtuple that holds the hed, link,
+     # and identified entities for each article.
+     # Chosen articles will have their data stored in a Summary object.
+     def retrieve_cluster_data(self, limit=None) -> Tuple[List[stub], int]:
+         """Creates article stubs for articles listed on text.npr.org"""
+         print("retrieving NPR article stubs")
+         # Scrape NPR for headlines and links
+         soup = Soup(get(self.source_url))
+         # extract each headline
+         npr_hed = [i.text for i in soup.find('div', {'class': 'topic-container'}).find('ul').find('a')]
+         #npr_hed = [i for i in npr_hed if 'Opinion:' not in i]
+         # links scraped are just the extension to the site's base link.
+         npr_links = [i.attrs['href'] for i in soup.find('div', {'class': 'topic-container'}).find('ul').find('a')]
+         # limit amount of data being returned for clustering
+         if limit is not None:
+             npr_hed = npr_hed[:limit]
+             npr_links = npr_links[:limit]
+         # Create stubs with heds and links
+         # Test: do the headlines and links zipped together line up correctly?
+         article_tuples = [stub(i[0], i[1], [], self) for i in zip(npr_links, npr_hed)]
+         print(f"Number of npr articles: {len(npr_hed)}")
+         return article_tuples, len(npr_hed)
+
+     # Returns None if article is only 1 line.
+     def retrieve_article(self, indata: stub) -> Tuple[str, List[Tuple[str, Any]]]:
+         """Retrieves article data from text.npr.org: subhead if it exists, date, author(s), and whole text"""
+         st.write(f"""Retrieving article from:\n\t{self.source_url[:-5] + indata.link}\n""")
+         container = Soup(get(self.source_url[:-5] + indata.link))
+         text_container = container.find('div', {'class': "paragraphs-container"}).find('p')
+         # A single paragraph comes back as a lone Soup object rather than a list.
+         if isinstance(text_container, Soup):
+             return None, None
+         whole_text = ''.join([art.text.strip() for art in text_container])
+         story_head = container.find('div', {'class': 'story-head'})
+         auth_and_date = [i.text for i in story_head.find('p')]
+         author = auth_and_date[0]
+         story_date = auth_and_date[1]
+         author = author[3:]
+
+         # return whole text and data for summary
+         return whole_text, [
+             self,
+             indata.entities,
+             indata.link,
+             indata.hed,
+             None,
+             story_date,
+             [author],
+             len(whole_text.split(' ')),
+         ]
+
+
+ @dataclass
+ class CNNText(Source):
+     """Implementation of abstract Source class that retrieves via webscraping at lite.cnn.com"""
+
+     # Creates the initial namedtuple that holds the hed, link,
+     # and identified entities for each article.
+     # Chosen articles will have their data stored in a Summary object.
+     def retrieve_cluster_data(self, limit=None) -> Tuple[List[stub], int]:
+         """Creates a stub for each article listed on lite.cnn.com"""
+         print("retrieving CNN article stubs")
+         soup = Soup(get(self.source_url))
+         # Scrape CNN for headlines and links
+         cnn_heds = [i.text for i in soup.find('div', {'class': 'afe4286c'}).find('a')]
+         cnn_links = [i.attrs['href'] for i in soup.find('div', {'class': 'afe4286c'}).find('a')]
+         # limit amount of data returned for clustering
+         if limit is not None:
+             cnn_heds = cnn_heds[:limit]
+             cnn_links = cnn_links[:limit]
+         #cnn = [i for i in cnn_heds if 'Analysis:' not in i and 'Opinion:' not in i]
+         # Take this next line out of this function and place it where this data is used.
+         article_tuples = [stub(i[0], i[1], [], self) for i in zip(cnn_links, cnn_heds) if 'Opinion' not in i[1] and 'Analysis' not in i[1]]
+         print(f"Number of cnn articles: {len(cnn_heds)}")
+         st.write("CNN articles done...")
+         return article_tuples, len(cnn_heds)
+
+     # Returns None if article is only 1 line.
+     def retrieve_article(self, indata: stub) -> Tuple[str, List[Tuple[str, Any]]]:
+         """Retrieves article data from lite.cnn.com: subhead if it exists, date, author(s), and whole text"""
+         print(f"""Retrieving article from:\n\t{self.source_url + indata.link}\n""")
+         st.write(f"""Retrieving article from:\n\t{self.source_url + indata.link}\n""")
+         repeat = 0
+         good = False
+         # retry the request once before giving up
+         while repeat < 2 and not good:
+             try:
+                 container = Soup(get(self.source_url + indata.link))
+                 good = True
+             except Exception as e:
+                 print(f"Error:\n{e}")
+                 print(f"Problem url: \n\t{self.source_url + indata.link}")
+                 repeat += 1
+         if good:
+             story_container = container.find('div', {'class': 'afe4286c'})
+             print(story_container)
+             author = story_container.find('p', {'id': 'byline'}).text
+             story_date = story_container.find('p', {'id': 'published datetime'}).text[9:]
+             #if isinstance(story_container, Soup):
+             #    return None, None
+             scp = story_container.find('p')[4:]
+             print(f"story_container.find('p')...\n\t{scp}")
+             whole_text = ''.join([i.text for i in scp if i.text is not None])
+             article_data = [
+                 self,
+                 indata.entities,
+                 indata.link,
+                 indata.hed,
+                 None,
+                 story_date,
+                 [author],
+                 len(whole_text.split(' ')),
+             ]
+         else:
+             whole_text = None
+             article_data = None
+         # return whole text and data for summary
+         return whole_text, article_data
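For reference, a minimal driver sketch for the two new classes follows. It assumes Source is a dataclass exposing a source_url field (Source and Summary live in source.py, which is not part of this commit, so the constructor call and any other required fields are assumptions); the two URLs are taken from the class docstrings, and the limit value is arbitrary.

    from scrape_sources import NPRLite, CNNText

    # Hypothetical constructor calls; Source's real fields are defined in source.py.
    npr = NPRLite(source_url='https://text.npr.org/1001')
    cnn = CNNText(source_url='https://lite.cnn.com')

    for source in (npr, cnn):
        # Each implementation returns (list of stubs, number of headlines found).
        stubs, count = source.retrieve_cluster_data(limit=3)
        print(f"{count} stubs from {type(source).__name__}")
        for s in stubs:
            # retrieve_article returns (None, None) when the article can't be parsed.
            text, data = source.retrieve_article(s)
            if text is not None:
                print(s.hed, '-', len(text.split()), 'words')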