Younes Belkada committed on
Commit
f2449c8
1 Parent(s): f342985

add python file

Browse files
Files changed (1) hide show
  1. tokenizer.py +1 -7
tokenizer.py CHANGED
@@ -1,8 +1,5 @@
1
  from datasets import load_dataset
2
  from transformers import AutoTokenizer
3
- from huggingface_hub import Repository
4
-
5
- repo = Repository(".", clone_from="ybelkada/japanese-dummy-tokenizer")
6
 
7
  def get_training_corpus(dataset):
8
  """
@@ -22,7 +19,4 @@ print("Old Tokenizer:", old_tokenizer.tokenize("誰が一番に着くか私に
22
  new_tokenizer = old_tokenizer.train_new_from_iterator(get_training_corpus(train_dataset), 52000)
23
 
24
  print("New Tokenizer:",new_tokenizer.tokenize("誰が一番に着くか私には分かりません。"))
25
- new_tokenizer.save_pretrained("japanese-dummy-tokenizer")
26
- repo.git_add()
27
- repo.git_commit("Add tokenizer file")
28
- repo.git_push()
 
1
  from datasets import load_dataset
2
  from transformers import AutoTokenizer
 
 
 
3
 
4
  def get_training_corpus(dataset):
5
  """
 
19
  new_tokenizer = old_tokenizer.train_new_from_iterator(get_training_corpus(train_dataset), 52000)
20
 
21
  print("New Tokenizer:",new_tokenizer.tokenize("誰が一番に着くか私には分かりません。"))
22
+ new_tokenizer.save_pretrained("japanese-dummy-tokenizer")