# nprSource.py is an implementation of the abstract Source object.
from dataclasses import dataclass
from collections import namedtuple
from typing import List, Tuple, Any, Optional
from gazpacho import Soup, get
from source import Source, Summary
import streamlit as st

stub = namedtuple('npr_stub', ['link', 'hed', 'entities', 'source'])
stub.__doc__ = """
        A namedtuple representing an as-yet-unscraped news article.

        • link is the article's path extension. Appended to the source's source_url,
                it is used to retrieve the full article and its data.
        • hed is the headline ('hed' is journalism jargon, as is 'dek' for 'subheader').
        • entities is the list of entity names discovered in this headline,
                each entity representing one cluster the article belongs to.
        • source is a reference to the Source object that created the stub.
        """


@dataclass
class NPRLite(Source):
    """Implementation of abstract Source class that retrieves via webscraping at text.npr.org/1001"""
    # Creates the initial namedtuple that holds the hed, link,
    # and identified entities for each article.
    # Chosen articles will have their data stored in a Summary object.
    def retrieve_cluster_data(self, limit=None) -> Tuple[List[stub], int]:
        """Creates article stubs for articles listed on text.npr.org"""
        # Scrape NPR for headlines and links
        soup = Soup(get(self.source_url))
        # Extract each headline
        npr_hed = [i.text for i in soup.find('div', {'class': 'topic-container'}).find('ul').find('a')]
        # Scraped links are just the extension to the site's base link.
        npr_links = [i.attrs['href'] for i in soup.find('div', {'class': 'topic-container'}).find('ul').find('a')]
        # Limit the amount of data returned for clustering
        if limit is not None:
            npr_hed = npr_hed[:limit]
            npr_links = npr_links[:limit]
        # Create stubs from the zipped links and headlines.
        # TODO: verify that the zipped headlines and links line up correctly.
        article_tuples = [stub(link, hed, [], self) for link, hed in zip(npr_links, npr_hed)]
        return article_tuples, len(npr_hed)

    # Returns None, None if the article is only one paragraph long.
    def retrieve_article(self, indata: stub) -> Tuple[Optional[str], Optional[List[Any]]]:
        """Retrieves article data from text.npr.org: subhead if it exists, date, author(s), and whole text"""
        # source_url ends in '/1001' (the topic listing page); strip it before appending the article link.
        st.write(f"""Retrieving article from:\n\t{self.source_url[:-5] + indata.link}\n""")
        container = Soup(get(self.source_url[:-5] + indata.link))
        text_container = container.find('div', {'class': 'paragraphs-container'}).find('p')
        # find() returns a single Soup (not a list) when there is only one paragraph; skip those articles.
        if isinstance(text_container, Soup):
            return None, None
        # Concatenate the stripped paragraph texts into a single string.
        whole_text = ''.join([art.strip() for art in text_container])
        story_head = container.find('div', {'class': 'story-head'})
        auth_and_date = [i.text for i in story_head.find('p')]
        author = auth_and_date[0][3:]  # drop the leading 'By ' prefix from the byline
        story_date = auth_and_date[1]
        
        # return whole text and data for summary
        return whole_text, [
            self,
            indata.entities,
            indata.link,
            indata.hed,
            None,
            story_date,
            [author],
            len(whole_text.split(' ')),
        ]
     

@dataclass
class CNNText(Source):
    """Implementation of abstract Source class that retrieves via webscraping at lite.cnn.com"""

    # Creates the initial namedtuple that holds the hed, link,
    # and identified entities for each article.
    # Chosen articles will have their data stored in a Summary object.
    def retrieve_cluster_data(self, limit=None) -> Tuple[List[stub], int]:
        """Creates a stub for each article listed on lite.cnn.com"""
        # Scrape CNN for headlines and links
        soup = Soup(get(self.source_url))
        cnn_heds = [i.text for i in soup.find('div', {'class': 'afe4286c'}).find('a')]
        cnn_links = [i.attrs['href'] for i in soup.find('div', {'class': 'afe4286c'}).find('a')]
        # Limit the amount of data returned for clustering
        if limit is not None:
            cnn_heds = cnn_heds[:limit]
            cnn_links = cnn_links[:limit]
        # TODO: move this next statement out of this function and place it where this data is used.
        # Opinion and analysis pieces are excluded when building stubs.
        article_tuples = [
            stub(link, hed, [], self)
            for link, hed in zip(cnn_links, cnn_heds)
            if 'Opinion' not in hed and 'Analysis' not in hed
        ]
        return article_tuples, len(cnn_heds)

    # Returns None, None if the article could not be fetched.
    def retrieve_article(self, indata: stub) -> Tuple[Optional[str], Optional[List[Any]]]:
        """Retrieves article data from lite.cnn.com: subhead if it exists, date, author(s), and whole text"""
        st.write(f"""Retrieving article from:\n\t{self.source_url + indata.link}\n""")
        # Try the request up to twice before giving up.
        repeat = 0
        good = False
        while repeat < 2 and not good:
            try:
                container = Soup(get(self.source_url + indata.link))
                good = True
            except Exception as e:
                print(f"Error:\n{e}")
                print(f"Problem url:\n\t{self.source_url + indata.link}")
                repeat += 1
        if good:
            story_container = container.find('div', {'class': 'afe4286c'})
            author = story_container.find('p', {'id': 'byline'}).text
            # Drop the leading label from the published-datetime string.
            story_date = story_container.find('p', {'id': 'published datetime'}).text[9:]
            # Skip the first few paragraphs (byline, date, and other non-story text).
            scp = story_container.find('p')[4:]
            whole_text = ''.join([i.text for i in scp if i.text is not None])
            article_data = [
                self,
                indata.entities,
                indata.link,
                indata.hed,
                None,
                story_date,
                [author],
                len(whole_text.split(' ')),
            ]
        else:
            whole_text = None
            article_data = None
        # return whole text and data for summary
        return whole_text, article_data
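

# --- Usage sketch (not part of the library) --------------------------------------
# A minimal, hedged example of how these sources appear to be used, based only on the
# methods above. It assumes the abstract Source dataclass can be constructed with just
# a source_url field; if Source declares other required fields, supply them here too.
# The URLs mirror the class docstrings.
if __name__ == '__main__':
    # NPRLite scrapes the text-only NPR listing at text.npr.org/1001.
    npr = NPRLite(source_url='https://text.npr.org/1001')
    stubs, total = npr.retrieve_cluster_data(limit=3)
    print(f"Found {total} NPR headlines; built {len(stubs)} stubs.")
    for s in stubs:
        text, data = npr.retrieve_article(s)
        if text is None:
            print(f"Skipped one-paragraph article: {s.hed}")
        else:
            print(f"{s.hed} -> {len(text.split(' '))} words")
    # CNNText is used the same way with a lite.cnn.com source_url, e.g.:
    #     cnn = CNNText(source_url='https://lite.cnn.com')
    #     cnn_stubs, cnn_total = cnn.retrieve_cluster_data(limit=3)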