green committed on
Commit 1f8826e
1 Parent(s): 3e11f3e

Removed print statements and comments

Files changed (1)
  1. scrape_sources.py +6 -9
scrape_sources.py CHANGED
@@ -26,13 +26,12 @@ class NPRLite(Source):
     # and identified entities for each article.
     # Chosen articles will have their data stored in a Summary object.
     def retrieve_cluster_data(self, limit=None) -> List[namedtuple]:
-        print("retrieving NPR article stub")
+        #print("retrieving NPR article stub")
         """Creates article stubs for articles listed on text.npr.org"""
         # Scrape NPR for headlines and links
         soup = Soup(get(self.source_url))
         # extract each headline
         npr_hed = [i.text for i in soup.find('div', {'class': 'topic-container'}).find('ul').find('a')]
-        #npr_hed = [i for i in npr_hed if 'Opinion:' not in i]
         # links scraped are just the extension to the site's base link.
         npr_links = [i.attrs['href'] for i in soup.find('div', {'class': 'topic-container'}).find('ul').find('a')]
         # limit amount of data being returned for clustering
@@ -42,13 +41,13 @@ class NPRLite(Source):
         # Create stubs with heds and links
         # Test: do the headlines and links zipped together lineup correctly?
         article_tuples = [stub(i[0], i[1], [], self) for i in zip(npr_links, npr_hed)]
-        print(f"Number of npr articles: {len(npr_hed)}")
+        #print(f"Number of npr articles: {len(npr_hed)}")
         return article_tuples, len(npr_hed)

     # Returns None if article is only 1 line.
     def retrieve_article(self, indata: stub) -> Tuple[str, List[Tuple[str, Any]]]:
         """Retrieves article data from text.npr.org subhead if exists, date, author(s), and whole text"""
-        st.write(f"""Retrieving article from:\n\t{self.source_url[:-5] + indata.link}\n""")
+        #st.write(f"""Retrieving article from:\n\t{self.source_url[:-5] + indata.link}\n""")
         container = Soup(get(self.source_url[:-5] + indata.link))
         text_container = container.find('div', {'class': "paragraphs-container"}).find('p')
         if isinstance(text_container, Soup):
@@ -82,7 +81,6 @@ class CNNText(Source):
     # Chosen articles will have their data stored in a Summary object.
     def retrieve_cluster_data(self, limit=None) -> List[namedtuple]:
         """Creates a stub for each article listed on lite.cnn.com"""
-        print("retrieving CNN article stub")
         soup = Soup(get(self.source_url))
         # Scrape NPR for headlines and links
         cnn_heds = [i.text for i in soup.find('div', {'class': 'afe4286c'}).find('a')]
@@ -91,16 +89,15 @@ class CNNText(Source):
         if limit is not None:
             cnn_heds = cnn_heds[:limit]
             cnn_links = cnn_links[:limit]
-        #cnn = [i for i in cnn_heds if 'Analysis:' not in i and 'Opinion:' not in i]
         # Take this next line out of this function and place it where this data is used.
         article_tuples = [stub(i[0], i[1], [], self) for i in zip(cnn_links, cnn_heds) if 'Opinion' not in i[1] and 'Analysis' not in i[1]]
-        print(f"Number of cnn articles: {len(cnn_heds)}")
+
         return article_tuples, len(cnn_heds)

     # Returns None if article is only 1 line.
     def retrieve_article(self, indata: stub) -> Tuple[str, List[Tuple[str, Any]]]:
         """Retrieves article data from lite.cnn.com: subhead if exists, date, author(s), and whole text"""
-        print(f"""Retrieving article from:\n\t{self.source_url + indata.link}\n""")
+        #print(f"""Retrieving article from:\n\t{self.source_url + indata.link}\n""")
         st.write(f"""Retrieving article from:\n\t{self.source_url + indata.link}\n""")
         repeat = 0
         good = False
@@ -114,7 +111,7 @@ class CNNText(Source):
             repeat += 1
         if good:
             story_container = container.find('div', {'class': 'afe4286c'})
-            print(story_container)
+            #print(story_container)
             author = story_container.find('p',{'id':'byline'}).text
             story_date = story_container.find('p',{'id':'published datetime'}).text[9:]
             #if isinstance(story_container, Soup):
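
One detail worth noting about the code above: gazpacho's Soup.find returns a list when several elements match, a single Soup when exactly one matches, and None when nothing matches, which is why retrieve_article checks isinstance(text_container, Soup) before using the result. A minimal sketch of that behavior, with made-up HTML purely for illustration:

    from gazpacho import Soup

    # Hypothetical markup, only to show gazpacho's three possible return types.
    html = "<ul><li><a href='/a'>One</a></li><li><a href='/b'>Two</a></li></ul>"
    soup = Soup(html)

    many = soup.find('a')                  # several matches -> list of Soup objects
    assert isinstance(many, list)
    assert many[0].attrs['href'] == '/a' and many[0].text == 'One'

    one = Soup("<p>single</p>").find('p')  # exactly one match -> a single Soup
    assert isinstance(one, Soup)

    none = soup.find('table')              # no match -> None
    assert none is None

This is also why the headline comprehensions work: soup.find(...).find('a') yields a list of anchors whose .text and .attrs['href'] are read per element.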
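
For context on how these methods are meant to be driven: both sources follow the same two-step flow. retrieve_cluster_data returns (article_tuples, count), where each element is a stub namedtuple built as stub(link, headline, [], source), and retrieve_article then fetches one stub's full text, returning None when the article is only one line. A hedged sketch of the calling pattern follows; the constructor arguments and base URLs are assumptions inferred from the scraping logic above, not taken from the repo:

    # Assumed constructors: the real NPRLite/CNNText signatures live
    # elsewhere in the repo and may take different arguments.
    npr = NPRLite('https://text.npr.org/1001')  # [:-5] above strips '/1001' to get the base URL
    cnn = CNNText('https://lite.cnn.com')       # CNN links are appended to source_url directly

    stubs, total = npr.retrieve_cluster_data(limit=5)
    for s in stubs:
        result = npr.retrieve_article(s)
        if result is None:        # per the comment: article was only 1 line
            continue
        text, entities = result   # Tuple[str, List[Tuple[str, Any]]]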