import requests
from bs4 import BeautifulSoup

# NewsAPI Key
NEWS_API_KEY = "04a9ea0fe9874092a57d547f4d0e43c6"
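
# A minimal sketch of a safer alternative: read the key from the environment
# (assuming a NEWS_API_KEY variable is exported there) so it never lands in
# version control, falling back to the literal above when it is not set.
import os
NEWS_API_KEY = os.environ.get("NEWS_API_KEY", NEWS_API_KEY)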

def extract_news(company, num_articles=2):
    """Fetch news articles about `company` from NewsAPI and return their titles and contents."""
    # Let requests build and URL-encode the query string, so multi-word
    # company names (e.g. "General Motors") are handled correctly.
    url = "https://newsapi.org/v2/everything"
    params = {
        "q": company,
        "apiKey": NEWS_API_KEY,
        "language": "en",
        "pageSize": num_articles,
    }
    response = requests.get(url, params=params, timeout=10)

    if response.status_code != 200:
        print("Error:", response.status_code, response.text)
        return []

    data = response.json()
    articles = data.get("articles", [])
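    # Each item in `articles` is a NewsAPI result object, roughly of the form
    # {"source": {...}, "title": ..., "description": ..., "url": ..., "publishedAt": ...};
    # only the "url" field is used below.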

    if not articles:
        print("No articles found.")
        return []

    extracted_articles = []

    for article in articles[:num_articles]:  # Limit to the requested number of articles
        article_url = article.get("url")
        if not article_url:
            continue  # Skip results that do not include a usable URL

        # Scrape the article page for its title and content
        try:
            article_response = requests.get(article_url, timeout=10)
        except requests.exceptions.RequestException as e:
            print(f"Error fetching {article_url}: {e}")
            continue

        if article_response.status_code == 200:
            soup = BeautifulSoup(article_response.content, 'html.parser')
            title = soup.title.string.strip() if soup.title and soup.title.string else "No Title Found"
            
            # Extract paragraphs and clean the content
            paragraphs = soup.find_all('p')
            content = ' '.join(p.get_text().strip() for p in paragraphs if p.get_text().strip())

            # Optionally, filter out unwanted text patterns
            unwanted_patterns = ["Want to read", "Nickname:", "Password:", "The Fine Print:"]
            for pattern in unwanted_patterns:
                content = content.replace(pattern, "")
            
            # Clean up extra spaces
            content = ' '.join(content.split())

            extracted_articles.append({"title": title, "content": content})

    return extracted_articles
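
# Usage sketch: gather articles for several companies in one call. The helper
# name and the batching idea are illustrative additions, not part of the
# function above.
def extract_news_for_companies(companies, per_company=2):
    """Map each company name to the list of articles extracted for it."""
    return {name: extract_news(name, num_articles=per_company) for name in companies}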


if __name__ == "__main__":
    company = input("Enter the company name for analysis: ").strip()
    articles = extract_news(company, num_articles=2)
    print(articles)