KoichiYasuoka committed
Commit dd1a82e · Parent(s): 3b0aa2e

cached_file support

Files changed:
- setup.py (+3 -3)
- suparkanbun/download.py (+13 -7)
- suparkanbun/suparkanbun.py (+22 -12)
setup.py CHANGED
@@ -5,7 +5,7 @@ URL="https://github.com/KoichiYasuoka/SuPar-Kanbun"
 
 setuptools.setup(
   name="suparkanbun",
-  version="1.4.…
+  version="1.4.6",
   description="Tokenizer POS-tagger and Dependency-parser for Classical Chinese",
   long_description=long_description,
   long_description_content_type="text/markdown",
@@ -16,10 +16,10 @@ setuptools.setup(
   keywords="NLP Chinese",
   packages=setuptools.find_packages(),
   install_requires=[
-    "supar>=1.1.…
+    "supar>=1.1.4",
     "transformers>=4.0.1",
     "spacy>=2.2.2",
-    "deplacy>=2.0.…
+    "deplacy>=2.0.3"
   ],
   python_requires=">=3.7",
   package_data={"suparkanbun":["models/*.txt","models/*/*.txt","models/*/*.json"]},
suparkanbun/download.py CHANGED
@@ -3,24 +3,30 @@
 
 import os
 
-def download(url,dir="."):
+try:
+  from transformers.utils import cached_file
+except:
+  from transformers.file_utils import cached_path,hf_bucket_url
+  cached_file=lambda x,y:cached_path(hf_bucket_url(x,y))
+
+def download(rootdir,file,dir="."):
   import shutil
-  from transformers.file_utils import cached_path
   t=os.path.join(dir,"filesize.txt")
-  shutil.copy(…
+  shutil.copy(cached_file(rootdir,os.path.dirname(file)+"/filesize.txt"),t)
   with open(t,"r") as f:
     r=f.read()
+  f=os.path.basename(file)
   ft=0
   for t in r.split("\n"):
     s=t.split()
     if len(s)==2:
-      if s[0]==…
+      if s[0]==f:
         ft=int(s[1])
   if ft==0:
     return
-  shutil.copy(…
+  shutil.copy(cached_file(rootdir,file),os.path.join(dir,f))
 
-def checkdownload(url,dir="."):
+def checkdownload(rootdir,model,dir="."):
   while True:
     t=os.path.join(dir,"filesize.txt")
     with open(t,"r") as f:
@@ -35,7 +41,7 @@ def checkdownload(url,dir="."):
         except:
           j=-1
         if i!=j:
-          download(…
+          download(rootdir,model+s[0],dir)
           break
     else:
       return
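The try/except block at the top of the new download.py is a compatibility shim: newer transformers releases expose cached_file(repo_id,filename) in transformers.utils, while older ones only provide cached_path together with hf_bucket_url in transformers.file_utils. A minimal sketch of the shim in isolation (the model path in the final comment is a placeholder, not a file guaranteed to exist in the repo):

# Compatibility shim, as added at the top of download.py: prefer the modern
# cached_file(repo_id,filename) API, fall back to the legacy pair otherwise.
try:
  from transformers.utils import cached_file          # newer transformers
except:
  from transformers.file_utils import cached_path,hf_bucket_url
  # hf_bucket_url builds the download URL for a file inside a Hub repo;
  # cached_path fetches that URL into the local cache and returns the path.
  cached_file=lambda x,y:cached_path(hf_bucket_url(x,y))

# Either branch returns a local filesystem path to the cached copy, e.g.:
# p=cached_file("KoichiYasuoka/SuPar-Kanbun","suparkanbun/models/"+model+"/filesize.txt")

On top of the shim, download() copies the per-directory filesize.txt out of the cache, looks up the expected size of the requested file, and returns early if the file is not listed; checkdownload() keeps re-downloading until every file's on-disk size matches the listing.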
suparkanbun/suparkanbun.py CHANGED
@@ -5,9 +5,6 @@ import os
 PACKAGE_DIR=os.path.abspath(os.path.dirname(__file__))
 DOWNLOAD_DIR=os.path.join(PACKAGE_DIR,"models")
 
-from transformers.file_utils import hf_bucket_url
-MODEL_URL=hf_bucket_url("KoichiYasuoka/SuPar-Kanbun","suparkanbun/models/")
-
 import numpy
 from spacy.language import Language
 from spacy.symbols import LANG,NORM,LEMMA,POS,TAG,DEP,HEAD
@@ -36,7 +33,7 @@ class SuParKanbunLanguage(Language):
       "name":"SuParKanbun_lzh",
       "parent_package":"suparkanbun",
       "pipeline":"Tokenizer, POS-Tagger, Parser",
-      "spacy_version":">=2.…
+      "spacy_version":">=2.2.2"
     }
     self._path=None
 
@@ -59,7 +56,7 @@ class SuParKanbunTokenizer(object):
     self.supar=Parser.load(f)
     if danku:
       d=os.path.join(DOWNLOAD_DIR,bert+".danku")
-      self.danku=AutoModelTagger(d,["B","E","E2","E3","M","S"])
+      self.danku=AutoModelTagger(d,["B","E","E2","E3","M","S"],[("B","E"),("B","E2"),("B","E3"),("B","M"),("E","B"),("E","S"),("E2","E"),("E3","E2"),("M","E3"),("M","M"),("S","B"),("S","S")])
     else:
       self.danku=None
     self.gloss=MakeGloss()
@@ -167,19 +164,32 @@ class SuParKanbunTokenizer(object):
     return doc
 
 class AutoModelTagger(object):
-  def __init__(self,dir,label=None):
+  def __init__(self,dir,label=None,links=None):
     from suparkanbun.download import checkdownload
     from transformers import AutoModelForTokenClassification,AutoTokenizer
-    checkdownload(MODEL_URL+os.path.basename(dir)+"/",dir)
+    import numpy
+    checkdownload("KoichiYasuoka/SuPar-Kanbun","suparkanbun/models/"+os.path.basename(dir)+"/",dir)
     self.model=AutoModelForTokenClassification.from_pretrained(dir)
     self.tokenizer=AutoTokenizer.from_pretrained(dir)
     self.label=label if label else self.model.config.id2label
+    if links:
+      self.transition=numpy.full((len(self.label),len(self.label)),numpy.nan)
+      x=self.model.config.label2id
+      for f,t in links:
+        self.transition[x[f],x[t]]=0
+    else:
+      self.transition=numpy.zeros((len(self.label),len(self.label)))
   def __call__(self,text):
-    import torch
-    …
+    import torch,numpy
+    v=self.tokenizer(text,return_offsets_mapping=True)
+    with torch.no_grad():
+      m=self.model(torch.tensor([v["input_ids"]])).logits[0].numpy()
+    for i in range(m.shape[0]-1,0,-1):
+      m[i-1]+=numpy.nanmax(m[i]+self.transition,axis=1)
+    p=[numpy.nanargmax(m[0])]
+    for i in range(1,m.shape[0]):
+      p.append(numpy.nanargmax(m[i]+self.transition[p[-1]]))
+    return [(text[t[0]:t[1]],self.label[q]) for t,q in zip(v["offset_mapping"],p) if t[0]<t[1]]
 
 class MakeGloss(object):
   def __init__(self,file=None):
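The rewritten AutoModelTagger is the substantive change here: when a links list of allowed tag bigrams is given, __init__ builds a transition matrix holding 0 for every allowed pair and NaN for every forbidden one, and __call__ decodes against it. The backward loop adds to each token's logits the best legally reachable score of the remaining suffix (a max-sum Viterbi pass, with nanmax silently discarding forbidden transitions), and the forward loop then reads the optimal path off with nanargmax. For the danku (sentence-segmentation) tagger this guarantees a well-formed B/E/E2/E3/M/S sequence. A self-contained sketch of the same decoding on fabricated logits (labels and allowed bigrams copied from the diff; the scores are made up for illustration):

import numpy

# Labels and allowed bigrams exactly as passed to AutoModelTagger in the diff.
labels=["B","E","E2","E3","M","S"]
links=[("B","E"),("B","E2"),("B","E3"),("B","M"),("E","B"),("E","S"),
       ("E2","E"),("E3","E2"),("M","E3"),("M","M"),("S","B"),("S","S")]
x={t:i for i,t in enumerate(labels)}

# transition[f,t]=0 if tag f may be followed by tag t, NaN otherwise.
transition=numpy.full((len(labels),len(labels)),numpy.nan)
for f,t in links:
  transition[x[f],x[t]]=0

# Fabricated logits for a 4-token input (one row per token, one column per label).
m=numpy.random.default_rng(0).normal(size=(4,len(labels)))

# Backward max-sum pass: m[i-1][f] gains the best score of any legal suffix
# that can follow tag f; NaN entries knock out forbidden successors.
for i in range(m.shape[0]-1,0,-1):
  m[i-1]+=numpy.nanmax(m[i]+transition,axis=1)

# Forward pass: follow the accumulated scores, restricted at each step to
# successors allowed after the previously chosen tag.
p=[int(numpy.nanargmax(m[0]))]
for i in range(1,m.shape[0]):
  p.append(int(numpy.nanargmax(m[i]+transition[p[-1]])))
print([labels[q] for q in p])  # a tag sequence respecting every allowed bigram

With links=None the matrix is all zeros, every row of the backward pass adds the same constant, and the decode reduces to plain per-token argmax, so callers that pass no links keep the previous behaviour.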