from selenium import webdriver
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.common.exceptions import ElementClickInterceptedException
from bs4 import BeautifulSoup
import time
import os
import re
import requests
import json
import csv
from urllib.parse import urljoin
# Set up Chrome WebDriver with options
options = ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument('--log-level=3')
# Initialize the Chrome WebDriver
wd = webdriver.Chrome(options=options)
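# Note: scrape_category() below creates and quits its own driver; this module-level instance is separate.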
def download_image(img_url):
return img_url
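
# download_image() above is a passthrough that just returns the URL. Below is a
# minimal sketch of a helper that actually saves the image to disk, assuming
# requests is available and an "images/" directory is acceptable; this helper is
# hypothetical and not wired into the scraper.
def save_image_to_disk(img_url, dest_dir="images"):
    os.makedirs(dest_dir, exist_ok=True)
    # Derive a filename from the URL path, falling back to a generic name
    filename = os.path.basename(img_url.split("?")[0]) or "image.jpg"
    path = os.path.join(dest_dir, filename)
    try:
        response = requests.get(img_url, timeout=30)
        response.raise_for_status()
        with open(path, "wb") as f:
            f.write(response.content)
        return path
    except requests.RequestException as e:
        print(f"Failed to download {img_url}: {e}")
        return None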
def sanitize_filename(filename):
return re.sub(r'[^\w\s-]', '', filename).strip().lower().replace(' ', '_')
def scroll_page(wd, max_scrolls=7, articles_per_load=6, max_attempts=5):
scroll_pause_time = 5
attempts = 0
for _ in range(max_scrolls):
current_articles = len(wd.find_elements(By.CSS_SELECTOR, "article.l-post"))
wd.execute_script("window.scrollBy(0, document.body.scrollHeight);")
time.sleep(scroll_pause_time)
try:
load_more_button = WebDriverWait(wd, 10).until(
EC.presence_of_element_located((By.XPATH, '//a[@class="ts-button load-button load-button-a ts-button-alt" and @href="#"]'))
)
wd.execute_script("arguments[0].scrollIntoView();", load_more_button)
wd.execute_script("arguments[0].click();", load_more_button)
attempts = 0 # Reset attempts after successful button click
except TimeoutException:
attempts += 1
if attempts >= max_attempts:
print("Maximum attempts reached without new articles. Exiting.")
return False # Exit the function
new_article_count = len(wd.find_elements(By.CSS_SELECTOR, "article.l-post"))
if new_article_count > current_articles:
attempts = 0 # Reset attempts after successfully loading new articles
else:
attempts += 1
if attempts >= max_attempts:
print("No new articles found after several attempts. Exiting.")
return False # Exit the function
return True
def scrape_article_details(article_url, wd):
try:
# Validate the URL
if not article_url.startswith("http"):
article_url = "https://" + article_url
print("Navigating to:", article_url)
wd.get(article_url)
WebDriverWait(wd, 20).until(EC.presence_of_element_located((By.CLASS_NAME, 'the-post-tags'))) # Wait for a specific element to ensure the page has loaded
soup = BeautifulSoup(wd.page_source, 'html.parser')
content_tag = soup.find('div', class_='post-content cf entry-content content-spacious')
content = content_tag.get_text().strip() if content_tag else ""
category_tag = soup.find('span', class_='meta-item cat-labels')
category_from_article = category_tag.get_text().strip() if category_tag else "Uncategorized"
title_tag = soup.find('h1', class_='is-title post-title')
art_title = title_tag.get_text().strip() if title_tag else ""
date_tag = soup.find('span', class_='meta-item has-next-icon date')
date = date_tag.get_text().strip() if date_tag else ""
        image_tag = soup.find('a', class_='image-link')
        image_url = image_tag['href'] if image_tag else None
        # Resolve the image URL against the article URL only when an image link exists
        img_url = urljoin(article_url, image_url) if image_url else None
        image_path = download_image(img_url) if img_url else None
return content, date, image_path, art_title, category_from_article
except TimeoutException:
print("Timed out waiting for page elements to load for URL:", article_url)
return "", "", None, "", ""
except Exception as e:
print(f"An error occurred while scraping article details at {article_url}: {str(e)}")
return "", "", None, "", ""
def scrape_category(category_url, num_articles):
# Set up Chrome WebDriver with options
options = ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--log-level=3')
# Initialize the Chrome WebDriver
wd = webdriver.Chrome(options=options)
print("Attempting to scrape:", category_url)
articles_data = []
articles_count = 0
wd.get(category_url)
# Adjusted to use num_articles for scrolling and loading articles
scroll_page(wd, max_scrolls=int(num_articles/6), articles_per_load=6)
soup = BeautifulSoup(wd.page_source, 'html.parser')
articles = soup.find_all('article', class_='l-post grid-base-post grid-post')
for article in articles[:num_articles]: # Limit to num_articles
link_tag = article.find('a', class_='image-link media-ratio ratio-16-9')
link = link_tag['href'] if link_tag else ""
        if link:
            # scrape_article_details() navigates to the article page itself,
            # so an extra wd.get(link) here would load the page twice.
            article_data = scrape_article_details(link, wd)
if article_data[0]: # Check if content is non-empty
articles_data.append({
"art_id": articles_count,
"Title": article_data[3],
"Date": article_data[1],
"Category": article_data[4],
"Content": article_data[0],
"Link": link,
"Image": article_data[2],
})
articles_count += 1
print(f"Article #{articles_count} scraped: {article_data[3]}")
    # Strip any trailing slash so the last URL segment is not empty
    category_name = sanitize_filename(category_url.rstrip("/").split("/")[-1])
csv_file_path = os.path.join(os.getcwd(), f'{category_name}_data.csv')
try:
with open(csv_file_path, 'w', newline='', encoding='utf-8') as file:
fieldnames = ["art_id", "Title", "Date", "Category", "Content", "Link", "Image"]
writer = csv.DictWriter(file, fieldnames=fieldnames)
writer.writeheader()
for article in articles_data:
writer.writerow(article)
print(f"Data written to {csv_file_path} successfully.")
except Exception as e:
print(f"Error writing data to file: {e}")
wd.quit() # Close the WebDriver
print(f"Total articles scraped: {len(articles_data)}")
return csv_file_path
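
# Example usage (illustrative sketch; the category URL below is a placeholder,
# not a URL taken from this script):
#
# if __name__ == "__main__":
#     csv_path = scrape_category("https://example.com/category/news/", num_articles=12)
#     print("Saved CSV to:", csv_path)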