import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
from urllib.parse import urljoin
from google_drive_handle import authenticate_google_drive

# Authenticate once at import time so the Drive client is available for later uploads.
drive = authenticate_google_drive()

# Browser-like User-Agent so requests are less likely to be rejected as a bot.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

def scrape_article(article_url):
    """Fetch a single article page and return its title and body text."""
    response = requests.get(article_url, headers=headers)
    response.raise_for_status()  # Fail fast on HTTP errors instead of parsing an error page
    soup = BeautifulSoup(response.content, 'html.parser')

    # The headline is the first <h1> on the page.
    title_tag = soup.find('h1')
    title = title_tag.get_text(strip=True) if title_tag else 'No Title'

    # The body text lives in <div id="article_holder">; join all of its paragraphs.
    content_div = soup.find('div', id='article_holder')
    if content_div:
        content = ' '.join(p.get_text(strip=True) for p in content_div.find_all('p'))
    else:
        content = 'Content not found'

    return {
        'Title': title,
        'Content': content
    }

def scrape_category(category_url, num_articles):
    """Walk a category's paginated index and scrape up to num_articles articles into a CSV."""
    articles_scraped = 0
    all_articles = []
    page_num = 1

    # Derive the site and category names from the URL, e.g.
    # 'https://www.akhbarona.com/<category>' -> 'akhbarona.com' and '<category>'.
    site_name = category_url.split('/')[2].replace('www.', '')
    category_name = category_url.split('/')[-1]

    while articles_scraped < num_articles:
        paginated_url = f"{category_url}/index.{page_num}.html"

        response = requests.get(paginated_url, headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')

        # Each listing entry is an <h2 class="article_title"> wrapping the article link.
        article_links = soup.find_all('h2', class_='article_title')
        if not article_links:
            # No more listing pages: stop instead of looping forever on empty pages.
            print(f"No articles found on {paginated_url}, stopping.")
            break

        for article_link in article_links:
            a_tag = article_link.find('a')
            if a_tag and 'href' in a_tag.attrs:
                # Resolve relative links against the current listing page.
                full_article_url = urljoin(paginated_url, a_tag['href'])
                article_data = scrape_article(full_article_url)

                all_articles.append(article_data)
                articles_scraped += 1

                if articles_scraped >= num_articles:
                    break

        if articles_scraped >= num_articles:
            break

        page_num += 1  # Move on to the next listing page
        print(f"Going to next page: {category_url}/index.{page_num}.html")

    # Save the scraped articles to a CSV named after the site and category.
    df = pd.DataFrame(all_articles)
    csv_file_name = f"{site_name}_{category_name}_articles.csv"
    csv_file_path = os.path.join(os.getcwd(), csv_file_name)
    df.to_csv(csv_file_path, index=False)
    print(f"Articles saved to {csv_file_path}")

    return csv_file_path
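

if __name__ == '__main__':
    # Minimal usage sketch. The category URL below is an assumed example, not taken
    # from the original script: scrape the first 20 articles of one Akhbarona
    # category listing and report where the CSV was written.
    example_category_url = 'https://www.akhbarona.com/economy'  # hypothetical category URL
    csv_path = scrape_category(example_category_url, num_articles=20)
    print(f"Done. CSV written to {csv_path}")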