metricv committed on
Commit
64eaa7b
1 Parent(s): 5c71086

Update model

Browse files
Files changed (3) hide show
  1. data +1 -1
  2. segmenter.ckpt +1 -1
  3. utils.py +61 -16
data CHANGED
@@ -1 +1 @@
1
- Subproject commit 733ac504b6f80dd11244534aa2820333cd0e0176
 
1
+ Subproject commit 83ccdae5afe7eaf7f88b0ceb4933544e445b7841
segmenter.ckpt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4bc9ddcfd7a654b4a86d7bb711a9d3e9c126269186033f5bc63695f88d4aaa77
3
  size 2665888
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e616108c71de535ed24386f1aaf2e38ab9cdf0dd123517aa5cef72c9ec019ed9
3
  size 2665888
utils.py CHANGED
@@ -76,22 +76,67 @@ def tag_training_data(filename: str):
76
  return reconstructed_tags
77
 
78
  def get_upenn_tags_dict():
79
- tagger = PerceptronTagger()
80
-
81
- tags = list(tagger.tagdict.values())
82
-
83
- # https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
84
- tags.extend(["CC", "CD", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS", "MD", "NN", "NNS", "NNP", "NNPS", "PDT", "POS", "PRP", "PRP$", "RB", "RBR", "RBS", "RP", "SYM", "TO", "UH", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ", "WDT", "WP", "WP$", "WRB"])
85
- tags = list(set(tags))
86
- tags.sort()
87
- tags.append("BREAK")
88
-
89
- tags_dict = dict()
90
-
91
- for index, tag in enumerate(tags):
92
- tags_dict[tag] = index
93
-
94
- return tags_dict
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
 
96
 
97
  def parse_tags(reconstructed_tags):
 
76
  return reconstructed_tags
77
 
78
  def get_upenn_tags_dict():
79
+ # tagger = PerceptronTagger()
80
+
81
+ # tags = list(tagger.tagdict.values())
82
+
83
+ # # https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
84
+ # tags.extend(["CC", "CD", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS", "MD", "NN", "NNS", "NNP", "NNPS", "PDT", "POS", "PRP", "PRP$", "RB", "RBR", "RBS", "RP", "SYM", "TO", "UH", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ", "WDT", "WP", "WP$", "WRB"])
85
+ # tags = list(set(tags))
86
+ # tags.sort()
87
+ # tags.append("BREAK")
88
+
89
+ # tags_dict = dict()
90
+
91
+ # for index, tag in enumerate(tags):
92
+ # tags_dict[tag] = index
93
+
94
+ return {'#': 0,
95
+ '$': 1,
96
+ "''": 2,
97
+ '(': 3,
98
+ ')': 4,
99
+ ',': 5,
100
+ '.': 6,
101
+ ':': 7,
102
+ 'CC': 8,
103
+ 'CD': 9,
104
+ 'DT': 10,
105
+ 'EX': 11,
106
+ 'FW': 12,
107
+ 'IN': 13,
108
+ 'JJ': 14,
109
+ 'JJR': 15,
110
+ 'JJS': 16,
111
+ 'LS': 17,
112
+ 'MD': 18,
113
+ 'NN': 19,
114
+ 'NNP': 20,
115
+ 'NNPS': 21,
116
+ 'NNS': 22,
117
+ 'PDT': 23,
118
+ 'POS': 24,
119
+ 'PRP': 25,
120
+ 'PRP$': 26,
121
+ 'RB': 27,
122
+ 'RBR': 28,
123
+ 'RBS': 29,
124
+ 'RP': 30,
125
+ 'SYM': 31,
126
+ 'TO': 32,
127
+ 'UH': 33,
128
+ 'VB': 34,
129
+ 'VBD': 35,
130
+ 'VBG': 36,
131
+ 'VBN': 37,
132
+ 'VBP': 38,
133
+ 'VBZ': 39,
134
+ 'WDT': 40,
135
+ 'WP': 41,
136
+ 'WP$': 42,
137
+ 'WRB': 43,
138
+ '``': 44,
139
+ 'BREAK': 45}
140
 
141
 
142
  def parse_tags(reconstructed_tags):