lm-swedish / build_n_gram.py
birgermoell's picture
WIP updated lm
c498527
from datasets import load_dataset
# Dump the "text" column of the processed parliament corpus into a single
# plain-text file (presumably the training input for an n-gram language
# model, going by the script name -- confirm against the rest of the pipeline).
target_lang = "sv"  # change to your target lang
username = "hf-test"  # change to your username

# Load the "train" split of the pre-processed corpus for the chosen user/language.
dataset = load_dataset(
    f"{username}/{target_lang}_corpora_parliament_processed", split="train"
)

# Write every entry, separated by single spaces, to text.txt.
# The encoding is pinned to UTF-8 so that non-ASCII characters are written
# identically on every platform instead of depending on the locale default.
with open("text.txt", "w", encoding="utf-8") as file:
    file.write(" ".join(dataset["text"]))