| | |
# Notebook-only dependency install; kept as a comment so the file is valid
# plain Python (run once in a notebook if the packages are missing):
# !pip install feedparser requests beautifulsoup4

import os
import re
import sys
import time
from datetime import datetime, timedelta

import feedparser
import pandas as pd
import requests
from bs4 import BeautifulSoup
| |
|
| | |
| | rss_url = 'https://vecherka.su/rss/' |
| | csv_file_path = 'bd.csv' |
| |
|
def _parse_entry_date(published_date_str):
    """Parse an RSS pubDate string into a datetime.

    Tries the RFC-822 style formats seen in the feed (numeric ``%z`` and
    named ``%Z`` timezones), then falls back to extracting just the
    "day month year" tokens. Returns None when nothing matches.
    """
    for fmt in ('%a, %d %b %Y %H:%M:%S %z', '%a, %d %b %Y %H:%M:%S %Z'):
        try:
            return datetime.strptime(published_date_str, fmt)
        except ValueError:
            pass
    # Fallback: "Wed, 01 Jan 2025 12:00:00 XYZ" -> ["01", "Jan", "2025"]
    date_parts = published_date_str.split(' ')[1:4]
    if len(date_parts) == 3:
        try:
            return datetime.strptime(' '.join(date_parts), '%d %b %Y')
        except ValueError:
            pass
    return None


def _extract_image_urls(entry, max_images=3):
    """Collect up to *max_images* unique image URLs from a feed entry.

    Sources, in priority order: media_content attachments, enclosure
    links, then <img> tags inside the summary/content HTML.
    """
    image_urls = []

    def _add(url):
        # Deduplicate and cap the number of collected URLs.
        if url and url not in image_urls and len(image_urls) < max_images:
            image_urls.append(url)

    if 'media_content' in entry:
        for media in entry.media_content:
            if media.get('type', '').startswith('image/'):
                _add(media.get('url'))

    if 'links' in entry and len(image_urls) < max_images:
        for link_entry in entry.links:
            if (link_entry.get('rel') == 'enclosure'
                    and link_entry.get('type', '').startswith('image/')):
                _add(link_entry.get('href'))

    # Entry HTML: prefer 'summary', fall back to the first 'content' item.
    html_content = ''
    if 'summary' in entry:
        html_content = entry.summary
    elif 'content' in entry and entry.content:
        html_content = entry.content[0].value

    if html_content and len(image_urls) < max_images:
        soup = BeautifulSoup(html_content, 'html.parser')
        for img in soup.find_all('img'):
            _add(img.get('src'))

    return image_urls


def _fetch_full_text(news_link, title):
    """Download an article page and return its cleaned body text.

    Returns '' when the page cannot be fetched or the expected
    'detail-text' container is missing.
    """
    try:
        response = requests.get(news_link, timeout=10)
        response.raise_for_status()
        article_soup = BeautifulSoup(response.text, 'html.parser')
        detail_text_div = article_soup.find('div', class_='detail-text')
        if detail_text_div is None:
            print(f"Could not find 'detail-text' div for article: '{title}'")
            return ''
        full_text = detail_text_div.get_text(separator=' ', strip=True)
        # Drop the boilerplate "подписывайтесь ..." sentence, then collapse whitespace.
        full_text = re.sub(r'[^.!?]*\bподписывайтесь\b[^.!?]*[?.!]', '', full_text, flags=re.IGNORECASE)
        return re.sub(r'\s+', ' ', full_text).strip()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching content for {news_link}: {e}")
    except Exception as e:
        print(f"Error parsing content for {news_link}: {e}")
    return ''


def check_for_new_articles():
    """Poll the RSS feed and append new articles to the CSV database.

    Only articles published today or yesterday are kept; links already
    present in the CSV and articles whose body contains 'Реклама' (ads)
    are skipped.

    Returns:
        int: number of articles appended to ``csv_file_path``.
    """
    print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Checking for new articles...")

    # Date window: today and yesterday, in the feed's dd-mm-YYYY form.
    today_date = datetime.now()
    yesterday_date = today_date - timedelta(days=1)
    today_str = today_date.strftime('%d-%m-%Y')
    yesterday_str = yesterday_date.strftime('%d-%m-%Y')

    # Load already-processed links so an article is never stored twice.
    processed_links = set()
    existing_df = None
    if os.path.exists(csv_file_path):
        try:
            existing_df = pd.read_csv(csv_file_path, encoding='utf-8-sig', sep=';')
            # dropna() guards against NaN produced by malformed/empty rows.
            processed_links = set(existing_df['link'].dropna().tolist())
            print(f"Loaded {len(processed_links)} existing articles from {csv_file_path}.")
        except Exception as e:
            print(f"Error loading existing CSV: {e}. Starting with an empty processed_links set.")

    feed = feedparser.parse(rss_url)
    if not feed.entries:
        print("No entries found in the RSS feed.")
        return 0

    new_articles_data = []
    articles_added_count = 0

    for entry in feed.entries:
        title = getattr(entry, 'title', 'No Title')
        news_link = getattr(entry, 'link', None)

        if not news_link or news_link in processed_links:
            continue

        published_date_str = getattr(entry, 'published', None)
        if not published_date_str:
            print(f"Skipping entry '{title}' due to missing publication date.")
            continue

        parsed_date = _parse_entry_date(published_date_str)
        if parsed_date is None:
            print(f"Could not parse date for entry: '{title}' - '{published_date_str}'")
            continue

        article_date_str = parsed_date.strftime('%d-%m-%Y')
        if article_date_str not in (today_str, yesterday_str):
            continue

        image_urls = _extract_image_urls(entry)
        full_text = _fetch_full_text(news_link, title)

        # Advertisements are not stored.
        if full_text and re.search(r'\bРеклама\b', full_text, re.IGNORECASE):
            print(f"Skipping article '{title}' due to 'Реклама' in full text.")
            continue

        # Articles whose body could not be retrieved are skipped entirely.
        if not full_text:
            continue

        new_articles_data.append({
            'title': title,
            'published': article_date_str,
            'image_urls': image_urls,
            'link': news_link,
            'full_text': full_text,
            'Status': 'Off',
            'short_text': '',
            'Constant': '',
        })
        processed_links.add(news_link)
        articles_added_count += 1

    if new_articles_data:
        new_df = pd.DataFrame(new_articles_data)
        new_df['image_urls'] = new_df['image_urls'].apply(lambda x: ', '.join(x))
        # Append without a header when the CSV already has rows; otherwise
        # create the file with a header.
        if existing_df is not None and not existing_df.empty:
            new_df.to_csv(csv_file_path, mode='a', header=False, index=False, encoding='utf-8-sig', sep=';')
        else:
            new_df.to_csv(csv_file_path, mode='w', header=True, index=False, encoding='utf-8-sig', sep=';')
        print(f"Added {articles_added_count} new articles to {csv_file_path}.")
    else:
        print("No new articles found to add.")

    return articles_added_count
| |
|
| | |
def _run_monitor(poll_interval=1800, error_retry_delay=60):
    """Run the RSS monitoring loop until interrupted.

    Args:
        poll_interval: seconds to sleep between successful checks.
        error_retry_delay: seconds to sleep after an unexpected error
            before retrying.
    """
    print("Starting continuous RSS feed monitoring. Press Ctrl+C to stop.")
    while True:
        try:
            new_count = check_for_new_articles()
            print(f"Found and added {new_count} new articles.")
            time.sleep(poll_interval)
        except KeyboardInterrupt:
            print("Monitoring stopped by user.")
            break
        except Exception as e:
            # Best-effort monitor: log and retry instead of crashing.
            print(f"An unexpected error occurred in the main loop: {e}")
            time.sleep(error_retry_delay)


if __name__ == "__main__":
    _run_monitor()
| |
|