alvin committed on
Commit
34c19b7
1 Parent(s): 08d39dc

update <|endoftext|> tokenizer id from 50257 to 50256

Browse files
.idea/.gitignore ADDED
File without changes
.idea/gpt2-medium-indonesian.iml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <module type="PYTHON_MODULE" version="4">
3
+ <component name="NewModuleRootManager">
4
+ <content url="file://$MODULE_DIR$" />
5
+ <orderEntry type="inheritedJdk" />
6
+ <orderEntry type="sourceFolder" forTests="false" />
7
+ </component>
8
+ <component name="PyDocumentationSettings">
9
+ <option name="format" value="PLAIN" />
10
+ <option name="myDocStringFormat" value="Plain" />
11
+ </component>
12
+ </module>
.idea/inspectionProfiles/Project_Default.xml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <component name="InspectionProjectProfileManager">
2
+ <profile version="1.0">
3
+ <option name="myName" value="Project Default" />
4
+ <inspection_tool class="PyInterpreterInspection" enabled="false" level="WARNING" enabled_by_default="false" />
5
+ </profile>
6
+ </component>
.idea/inspectionProfiles/profiles_settings.xml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <component name="InspectionProjectProfileManager">
2
+ <settings>
3
+ <option name="USE_PROJECT_PROFILE" value="false" />
4
+ <version value="1.0" />
5
+ </settings>
6
+ </component>
.idea/misc.xml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.8" project-jdk-type="Python SDK" />
4
+ </project>
.idea/modules.xml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="ProjectModuleManager">
4
+ <modules>
5
+ <module fileurl="file://$PROJECT_DIR$/.idea/gpt2-medium-indonesian.iml" filepath="$PROJECT_DIR$/.idea/gpt2-medium-indonesian.iml" />
6
+ </modules>
7
+ </component>
8
+ </project>
.idea/vcs.xml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="VcsDirectoryMappings">
4
+ <mapping directory="$PROJECT_DIR$" vcs="Git" />
5
+ </component>
6
+ </project>
.idea/workspace.xml ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="ChangeListManager">
4
+ <list default="true" id="9048d30b-80ad-481e-a9f3-3c0d058dcd65" name="Changes" comment="">
5
+ <change beforePath="$PROJECT_DIR$/replace_token_script.py" beforeDir="false" afterPath="$PROJECT_DIR$/replace_token_script.py" afterDir="false" />
6
+ <change beforePath="$PROJECT_DIR$/tokenizer.json" beforeDir="false" afterPath="$PROJECT_DIR$/tokenizer.json" afterDir="false" />
7
+ </list>
8
+ <option name="SHOW_DIALOG" value="false" />
9
+ <option name="HIGHLIGHT_CONFLICTS" value="true" />
10
+ <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
11
+ <option name="LAST_RESOLUTION" value="IGNORE" />
12
+ </component>
13
+ <component name="Git.Settings">
14
+ <option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
15
+ </component>
16
+ <component name="ProjectId" id="1wVKJnvEaHY26dnjKns17akLxD6" />
17
+ <component name="ProjectViewState">
18
+ <option name="hideEmptyMiddlePackages" value="true" />
19
+ <option name="showLibraryContents" value="true" />
20
+ </component>
21
+ <component name="PropertiesComponent">
22
+ <property name="RunOnceActivity.OpenProjectViewOnStart" value="true" />
23
+ <property name="RunOnceActivity.ShowReadmeOnStart" value="true" />
24
+ <property name="last_opened_file_path" value="$PROJECT_DIR$" />
25
+ </component>
26
+ <component name="RunManager">
27
+ <configuration name="replace_token_script" type="PythonConfigurationType" factoryName="Python" nameIsGenerated="true">
28
+ <module name="gpt2-medium-indonesian" />
29
+ <option name="INTERPRETER_OPTIONS" value="" />
30
+ <option name="PARENT_ENVS" value="true" />
31
+ <envs>
32
+ <env name="PYTHONUNBUFFERED" value="1" />
33
+ </envs>
34
+ <option name="SDK_HOME" value="$USER_HOME$/alvin_research/task/Hardy/transformers/venv/bin/python" />
35
+ <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
36
+ <option name="IS_MODULE_SDK" value="false" />
37
+ <option name="ADD_CONTENT_ROOTS" value="true" />
38
+ <option name="ADD_SOURCE_ROOTS" value="true" />
39
+ <option name="SCRIPT_NAME" value="$PROJECT_DIR$/replace_token_script.py" />
40
+ <option name="PARAMETERS" value="" />
41
+ <option name="SHOW_COMMAND_LINE" value="false" />
42
+ <option name="EMULATE_TERMINAL" value="false" />
43
+ <option name="MODULE_MODE" value="false" />
44
+ <option name="REDIRECT_INPUT" value="false" />
45
+ <option name="INPUT_FILE" value="" />
46
+ <method v="2" />
47
+ </configuration>
48
+ </component>
49
+ <component name="SpellCheckerSettings" RuntimeDictionaries="0" Folders="0" CustomDictionaries="0" DefaultDictionary="application-level" UseSingleDictionary="true" transferred="true" />
50
+ <component name="TaskManager">
51
+ <task active="true" id="Default" summary="Default task">
52
+ <changelist id="9048d30b-80ad-481e-a9f3-3c0d058dcd65" name="Changes" comment="" />
53
+ <created>1628538247361</created>
54
+ <option name="number" value="Default" />
55
+ <option name="presentableId" value="Default" />
56
+ <updated>1628538247361</updated>
57
+ </task>
58
+ <servers />
59
+ </component>
60
+ </project>
replace_token_script.py CHANGED
@@ -18,10 +18,11 @@ with open(model_config_path, "r") as f:
18
 
19
  model_vocab_size = model_config['vocab_size']
20
  tokenizer_vocab = tokenizer_data['model']['vocab']
 
21
  mergeslength = len(tokenizer_data['model']['merges'])
22
 
23
- #readjust added_tokens 'id' to model_vocab_size
24
- tokenizer_data['added_tokens'][-1]['id'] = model_vocab_size = model_config['vocab_size']
25
 
26
  final_index = model_vocab_size - 1
27
  eos = '<|endoftext|>'
 
18
 
19
  model_vocab_size = model_config['vocab_size']
20
  tokenizer_vocab = tokenizer_data['model']['vocab']
21
+
22
  mergeslength = len(tokenizer_data['model']['merges'])
23
 
24
+ #readjust added_tokens 'id' to model_vocab_size - 1
25
+ tokenizer_data['added_tokens'][-1]['id'] = model_vocab_size - 1
26
 
27
  final_index = model_vocab_size - 1
28
  eos = '<|endoftext|>'
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff