import csv
import re
import time
from typing import List
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm


def scrape_question_with_answers(question_url: str) -> List[str]:
    """Scrape a single question page and return [title, question, answer, url]."""
    # question_url is a root-relative href such as '/questions/123/...',
    # so urljoin avoids producing a doubled slash in the final URL.
    url = urljoin('https://stackoverflow.com/', question_url)
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    title = soup.find('title').text.replace(' - Stack Overflow', '')
    question_div = soup.find('div', {'class': 'postcell post-layout--right'})
    question = question_div.find('p').text
    # Only the first (top-voted) answer cell on the page is scraped.
    answers_div = soup.find('div', {'class': 'answercell post-layout--right'})
    answer = answers_div.find('div', {'class': 's-prose js-post-body'}).text
    return [title, question, answer, url]


def scrape_questions_page(url: str, min_votes: int, min_answers: int) -> List[List[str]]:
    """Scrape one question-listing page, keeping posts that meet the vote/answer thresholds."""
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    posts_summaries = soup.find_all('div', {'class': 's-post-summary js-post-summary'})

    qa_data = []
    for summary in posts_summaries:
        stats_div = summary.find('div', {'class': 's-post-summary--stats'})

        # The vote count lives in a stats item titled e.g. "Score of 42".
        vote_div = stats_div.find('div', {
            'class': 's-post-summary--stats-item s-post-summary--stats-item__emphasized',
            'title': re.compile(r'^Score of \d+$')})
        if vote_div:
            vote_number = int(vote_div.find('span', {'class': 's-post-summary--stats-item-number'}).text)
        else:
            vote_number = 0

        # The answer count is titled "1 answer" or "N answers"; the optional
        # "s" keeps single-answer questions from being silently skipped.
        answer_div = stats_div.find('div', {
            'class': 's-post-summary--stats-item',
            'title': re.compile(r'^\d+ answers?$')})
        if answer_div:
            answer_number = int(answer_div.find('span', {'class': 's-post-summary--stats-item-number'}).text)
        else:
            answer_number = 0

        question_href = summary.find('a', {'class': 's-link'})['href']
        if vote_number >= min_votes and answer_number >= min_answers:
            try:
                qa_data.append(scrape_question_with_answers(question_href))
            except Exception as error:
                print(error)
            time.sleep(1.5)  # polite delay between question-page requests
    return qa_data


def crawl_and_save_qa(
    filename: str,
    base_url: str,
    start_page: int,
    n_pages: int = 10,
    min_votes: int = 1,
    min_answers: int = 1,
):
    """Crawl n_pages of listing pages from start_page onwards, appending rows to a CSV."""
    with open(filename, 'a', newline='') as f:
        writer = csv.writer(f)
        # Write the header only on a fresh crawl; later runs resume by
        # passing a higher start_page and appending to the same file.
        if start_page == 1:
            writer.writerow(['title', 'question', 'answer', 'url'])
        for page_num in tqdm(range(start_page, start_page + n_pages)):
            page_data = scrape_questions_page(
                base_url.format(page_num), min_votes, min_answers
            )
            for qa_row in page_data:
                writer.writerow(qa_row)


if __name__ == '__main__':
    filename = '../datasets/stackoverflow_linux.csv'
    url = 'https://stackoverflow.com/questions/tagged/linux?tab=votes&page={}&pagesize=15'
    crawl_and_save_qa(
        filename=filename,
        base_url=url,
        start_page=21,
        n_pages=10,
        min_votes=1,
        min_answers=1,
    )
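
# A minimal sanity-check sketch (not part of the crawl itself): load the CSV
# that crawl_and_save_qa just appended to and print the first rows, using the
# pandas import at the top of the file. Assumes the file already contains the
# header row written on a start_page == 1 run.
if __name__ == '__main__':
    df = pd.read_csv(filename)
    print(df.head())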