File size: 1,709 Bytes
c69cba4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
from datetime import datetime
from datasets import load_dataset
from bs4 import BeautifulSoup


def preprocess_dataset(
    *,
    question_score_above=100,
    answer_score_above=5,
    answer_year_after=2010,
):
    """
    Preprocesses the 'koutch/stackoverflow_python' dataset.

    Loads the 'train' split, keeps only the rows that pass all three
    thresholds below, converts the HTML in 'question_body' and
    'answer_body' to plain text stored in new 'question' and 'answer'
    columns, and finally drops the score, date, id, tag and raw HTML
    columns.

    Args:
        question_score_above (int): A row is kept only if its
            'question_score' is strictly greater than this value.
        answer_score_above (int): A row is kept only if its
            'answer_score' is strictly greater than this value.
        answer_year_after (int): A row is kept only if the year of its
            'answer_date' is strictly greater than this value.

    Returns:
        datasets.arrow_dataset.Dataset: The preprocessed dataset.
    """
    dataset = load_dataset('koutch/stackoverflow_python', split='train')

    def is_selected(example):
        # The score checks come first so that, thanks to short-circuiting,
        # the date string is only parsed for rows that already passed them.
        return (
            example['question_score'] > question_score_above and
            example['answer_score'] > answer_score_above and
            datetime.strptime(
                example['answer_date'], '%Y-%m-%dT%H:%M:%SZ'
            ).year > answer_year_after
        )

    def html2text(html):
        # Concatenate every string node of the parsed document, which
        # discards the markup. `find_all` is the supported spelling of the
        # deprecated `findAll` alias.
        soup = BeautifulSoup(html, 'html.parser')
        return ''.join(soup.find_all(string=True))

    def transforms(example):
        example['answer'] = html2text(example['answer_body'])
        example['question'] = html2text(example['question_body'])
        return example

    dataset = dataset.filter(is_selected)
    dataset = dataset.map(transforms)
    # Only the plain-text 'question' and 'answer' columns are needed
    # downstream; the metadata and the raw HTML are removed.
    dataset = dataset.remove_columns([
        'question_score', 'question_date', 'question_id',
        'answer_date', 'answer_id', 'answer_score', 'tags',
        'question_body', 'answer_body'
    ])
    return dataset


def show_info(dataset):
    """
    Display a short summary of the dataset on standard output.

    Prints the dataset's ``info`` attribute followed by a blank line, then
    the number of rows, then the 'question' and 'answer' fields of the
    first row.

    Args:
        dataset (datasets.arrow_dataset.Dataset): The dataset to summarize.
            It must hold at least one row with 'question' and 'answer'
            fields.
    """
    summary = dataset.info
    print(summary, '\n')

    row_count = len(dataset)
    print(f'dataset len: {row_count}')

    # Fetch the first row once and reuse it for both example lines.
    first_row = dataset[0]
    question_text = first_row['question']
    print(f"example question: {question_text}")
    answer_text = first_row['answer']
    print(f"example answer: {answer_text}")


if __name__ == '__main__':
    # Script entry point: build the preprocessed dataset, upload it to the
    # Hub as a public (non-private) repository, then print a summary of it.
    preprocessed = preprocess_dataset()
    preprocessed.push_to_hub(
        'KonradSzafer/stackoverflow_python_preprocessed',
        private=False,
    )
    show_info(preprocessed)