File size: 929 Bytes
1f30dbc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
from functools import partial
from itertools import islice

import pandas as pd
from datasets import load_dataset
from tqdm import tqdm

from perplexity import KenlmModel


def hub_dataset_to_dataframe(path: str, name: str, split: str, sample: int, text_column: str, model: KenlmModel, seed: int = 0) -> pd.DataFrame:
    """Stream a dataset, score each example's perplexity, and return a sample.

    The dataset is opened with ``load_dataset(..., streaming=True)``, shuffled
    with a 10000-element buffer, and each example is augmented with a
    ``"perplexity"`` value computed by ``model.get_perplexity`` on the text
    column. The mapping is applied while iterating, so only the examples that
    are actually drawn get scored.

    Args:
        path: Dataset path forwarded to ``load_dataset``.
        name: Dataset configuration name; only forwarded when truthy.
        split: Split to load; only forwarded when truthy.
            NOTE(review): when no split is passed, ``load_dataset`` may return
            a mapping of splits rather than a single dataset -- confirm that
            callers always provide one.
        sample: Maximum number of examples to collect. Values of zero or less
            yield an empty DataFrame.
        text_column: Name of the column holding the text to score.
        model: Object whose ``get_perplexity(text)`` scores a single text.
        seed: Seed passed to the streaming shuffle.

    Returns:
        A DataFrame with one row per collected example, including the
        ``"perplexity"`` column. Fewer than ``sample`` rows are returned if
        the stream is exhausted first.
    """
    load_dataset_fn = partial(load_dataset, path=path)
    if name:
        load_dataset_fn = partial(load_dataset_fn, name=name)
    if split:
        load_dataset_fn = partial(load_dataset_fn, split=split)

    def add_perplexity(example):
        text = example[text_column]
        return {text_column: text, "perplexity": model.get_perplexity(text)}

    dataset = (
        load_dataset_fn(streaming=True)
        .shuffle(buffer_size=10000, seed=seed)
        .map(add_perplexity)
    )

    # Cap the iteration up front: an "append, then compare count == sample"
    # loop never terminates early for sample <= 0 and would score the entire
    # stream. islice pulls exactly `limit` items and no more.
    limit = max(sample, 0)
    instances = list(tqdm(islice(dataset, limit), total=limit))
    return pd.DataFrame(instances)