Shankhdhar committed
Commit
2ec76c9
1 Parent(s): 5944e36

Some changes

.gitattributes CHANGED
@@ -15,6 +15,8 @@
 *.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+"*.tsv" filter=lfs diff=lfs merge=lfs -text
+"*.txt" filter=lfs diff=lfs merge=lfs -text
 Train.tsv filter=lfs diff=lfs merge=lfs -text
 train/train2.tsv filter=lfs diff=lfs merge=lfs -text
 train/train2.txt filter=lfs diff=lfs merge=lfs -text
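
The two added patterns route every .tsv and .txt file in the repo through Git LFS. A quick way to confirm that a path actually picks up the lfs filter is git check-attr; below is a minimal Python sketch, not part of this commit, assuming git and git-lfs are installed and the script runs from the repository root:

    import subprocess

    # Ask git which clean/smudge filter applies to a tracked path;
    # LFS-managed files report "filter: lfs".
    result = subprocess.run(
        ["git", "check-attr", "filter", "--", "train/train2.tsv"],
        capture_output=True, text=True, check=True,
    )
    print(result.stdout.strip())  # expected: train/train2.tsv: filter: lfs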
text_collection/text_collection.py DELETED
@@ -1,104 +0,0 @@
- # coding=utf-8
- # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- #     http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- """Large-scale Indonesian Summarization Dataset"""
- import glob
- import json
- import os
- import re
- from pathlib import Path
- import datasets
- logger = datasets.logging.get_logger(__name__)
- _CITATION = """\
- """
- _DESCRIPTION = """\
- This module loads a text dataset from a local directory. The dataset should follow the OSCAR format,
- where each new entry is separated by empty lines.
- """
- _HOMEPAGE = ""
- _LICENSE = ""
- class TextCollectionConfig(datasets.BuilderConfig):
-     """BuilderConfig for TextCollection"""
-     def __init__(self, **kwargs):
-         """BuilderConfig for TextCollection.
-         Args:
-             **kwargs: keyword arguments forwarded to super.
-         """
-         super(TextCollectionConfig, self).__init__(**kwargs)
- class TextCollection(datasets.GeneratorBasedBuilder):
-     VERSION = datasets.Version("1.0.0")
-     BUILDER_CONFIGS = [
-         TextCollectionConfig(
-             name="text_collection",
-             version=VERSION,
-             description="Id Collection dataset",
-         ),
-     ]
-     @property
-     def manual_download_instructions(self):
-         return """\
-     You need to manually collect text datasets in a directory. The text dataset can then be loaded
-     using the following command:
-     `datasets.load_dataset("text_collection", data_dir="<path/to/dataset>")`.
-     """
-     def _info(self):
-         return datasets.DatasetInfo(
-             description=_DESCRIPTION,
-             features=datasets.Features({"id": datasets.Value("int64"), "text": datasets.Value("string")}),
-             supervised_keys=None,
-             homepage=_HOMEPAGE,
-             license=_LICENSE,
-             citation=_CITATION,
-         )
-     def _split_generators(self, dl_manager):
-         data_dir = os.path.abspath(os.path.expanduser(dl_manager.manual_dir))
-         print("# Data directory", data_dir)
-         if not os.path.exists(data_dir):
-             raise FileNotFoundError(
-                 "{} does not exist. Make sure you insert a manual dir via `datasets.load_dataset('text_collection', "
-                 "data_dir=...)`. Manual download instructions:\n{}".format(
-                     data_dir, self.manual_download_instructions
-                 )
-             )
-         split_generators = [
-             datasets.SplitGenerator(
-                 name=datasets.Split.TRAIN,
-                 gen_kwargs={
-                     "article_dir": os.path.join(data_dir, ""),
-                     "split": "train",
-                 },
-             )
-         ]
-         return split_generators
-     def _generate_examples(self, article_dir, split):
-         logger.info("⏳ Generating %s examples from = %s", split, article_dir)
-         id_ = 0
-         current_lines = []
-         for path in sorted(glob.glob(os.path.join(article_dir, "**/*.txt"), recursive=True)):
-             with open(path, "r") as f:
-                 print("# Reading", path)
-                 for line in f:
-                     if len(line.strip()) > 0:
-                         current_lines.append(line)
-                     elif current_lines:
-                         feature = id_, {"id": id_, "text": "".join(current_lines).rstrip()}
-                         yield feature
-                         id_ += 1
-                         current_lines = []
-         # last paragraph: flush if the file did not end with a blank line
-         if current_lines:
-             feature = id_, {"id": id_, "text": "".join(current_lines).rstrip()}
-             yield feature
-             id_ += 1
-             current_lines = []
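
The heart of the deleted loader is the blank-line splitting in _generate_examples. For reference, here is a minimal standalone sketch of that logic, assuming a hypothetical corpus.txt in the OSCAR-style format the script expected:

    def iter_entries(path):
        """Yield blank-line-separated entries, as the deleted script did."""
        current = []
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                if line.strip():
                    current.append(line)
                elif current:
                    yield "".join(current).rstrip()
                    current = []
        if current:  # flush the final entry if the file lacks a trailing blank line
            yield "".join(current).rstrip()

    for id_, text in enumerate(iter_entries("corpus.txt")):
        print(id_, text[:60])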
text_collection/text_collection.py.lock DELETED
File without changes
train/train2.txt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:5874cf342a7153d6949d6ecdbe72cabfde2bede960a4409bd7f82e88d6d4ed0f
- size 19715139
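
What was deleted here is only the Git LFS pointer stub, not the ~19.7 MB text itself; the actual blob lives in LFS storage, keyed by the sha256 oid. A minimal sketch of reading a pointer's fields, illustrative only and not part of this commit:

    # A Git LFS pointer is a few "key value" lines; parse them into a dict.
    pointer_lines = [
        "version https://git-lfs.github.com/spec/v1",
        "oid sha256:5874cf342a7153d6949d6ecdbe72cabfde2bede960a4409bd7f82e88d6d4ed0f",
        "size 19715139",
    ]
    fields = dict(line.split(" ", 1) for line in pointer_lines)
    print(fields["oid"])        # content hash that keys the blob in LFS storage
    print(int(fields["size"]))  # 19715139 bytes (~19.7 MB)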
train/val2.txt DELETED
The diff for this file is too large to render. See raw diff
val/val2.txt DELETED
The diff for this file is too large to render. See raw diff