KoichiYasuoka committed on
Commit dd1a82e
1 Parent(s): 3b0aa2e

cached_file support

Files changed (3)
  1. setup.py +3 -3
  2. suparkanbun/download.py +13 -7
  3. suparkanbun/suparkanbun.py +22 -12
setup.py CHANGED
@@ -5,7 +5,7 @@ URL="https://github.com/KoichiYasuoka/SuPar-Kanbun"
 
 setuptools.setup(
   name="suparkanbun",
-  version="1.4.2",
+  version="1.4.6",
   description="Tokenizer POS-tagger and Dependency-parser for Classical Chinese",
   long_description=long_description,
   long_description_content_type="text/markdown",
@@ -16,10 +16,10 @@ setuptools.setup(
   keywords="NLP Chinese",
   packages=setuptools.find_packages(),
   install_requires=[
-    "supar>=1.1.3",
+    "supar>=1.1.4",
     "transformers>=4.0.1",
     "spacy>=2.2.2",
-    "deplacy>=2.0.2"
+    "deplacy>=2.0.3"
   ],
   python_requires=">=3.7",
   package_data={"suparkanbun":["models/*.txt","models/*/*.txt","models/*/*.json"]},
suparkanbun/download.py CHANGED
@@ -3,24 +3,30 @@
 
 import os
 
-def download(url,file,dir="."):
+try:
+  from transformers.utils import cached_file
+except:
+  from transformers.file_utils import cached_path,hf_bucket_url
+  cached_file=lambda x,y:cached_path(hf_bucket_url(x,y))
+
+def download(rootdir,file,dir="."):
   import shutil
-  from transformers.file_utils import cached_path
   t=os.path.join(dir,"filesize.txt")
-  shutil.copy(cached_path(url+"filesize.txt"),t)
+  shutil.copy(cached_file(rootdir,os.path.dirname(file)+"/filesize.txt"),t)
   with open(t,"r") as f:
     r=f.read()
+  f=os.path.basename(file)
   ft=0
   for t in r.split("\n"):
     s=t.split()
     if len(s)==2:
-      if s[0]==file:
+      if s[0]==f:
         ft=int(s[1])
   if ft==0:
     return
-  shutil.copy(cached_path(url+file),os.path.join(dir,file))
+  shutil.copy(cached_file(rootdir,file),os.path.join(dir,f))
 
-def checkdownload(url,dir="."):
+def checkdownload(rootdir,model,dir="."):
   while True:
     t=os.path.join(dir,"filesize.txt")
     with open(t,"r") as f:
@@ -35,7 +41,7 @@ def checkdownload(url,dir="."):
     except:
       j=-1
     if i!=j:
-      download(url,s[0],dir)
+      download(rootdir,model+s[0],dir)
       break
   else:
     return
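
The point of the commit is visible in download.py: the legacy cached_path()/hf_bucket_url() helpers, which newer transformers releases no longer provide, are replaced by cached_file(), with a try/except shim keeping older transformers working. Below is a minimal sketch of how the new code path resolves a file from the Hub, assuming only that transformers is installed; fetch() is a hypothetical helper for illustration, and the commented file path is an example, not a guaranteed file in the repo.

import os
import shutil

# Same fallback as in download.py: prefer the current transformers API,
# emulate it with the legacy helpers on older releases.
try:
  from transformers.utils import cached_file
except ImportError:
  from transformers.file_utils import cached_path,hf_bucket_url
  cached_file=lambda repo_id,filename:cached_path(hf_bucket_url(repo_id,filename))

def fetch(repo_id,filename,dir="."):
  # cached_file() downloads (or reuses) the file in the local Hugging Face cache
  # and returns its path; the copy places it under dir, mirroring what download()
  # does with filesize.txt and the model files.
  local=cached_file(repo_id,filename)
  target=os.path.join(dir,os.path.basename(filename))
  shutil.copy(local,target)
  return target

# Usage (the repo id is real; the subdirectory name is only an illustration):
# fetch("KoichiYasuoka/SuPar-Kanbun","suparkanbun/models/<bert>.danku/filesize.txt")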
suparkanbun/suparkanbun.py CHANGED
@@ -5,9 +5,6 @@ import os
 PACKAGE_DIR=os.path.abspath(os.path.dirname(__file__))
 DOWNLOAD_DIR=os.path.join(PACKAGE_DIR,"models")
 
-from transformers.file_utils import hf_bucket_url
-MODEL_URL=hf_bucket_url("KoichiYasuoka/SuPar-Kanbun","suparkanbun/models/")
-
 import numpy
 from spacy.language import Language
 from spacy.symbols import LANG,NORM,LEMMA,POS,TAG,DEP,HEAD
@@ -36,7 +33,7 @@ class SuParKanbunLanguage(Language):
       "name":"SuParKanbun_lzh",
       "parent_package":"suparkanbun",
       "pipeline":"Tokenizer, POS-Tagger, Parser",
-      "spacy_version":">=2.1.0"
+      "spacy_version":">=2.2.2"
     }
     self._path=None
 
@@ -59,7 +56,7 @@ class SuParKanbunTokenizer(object):
     self.supar=Parser.load(f)
     if danku:
       d=os.path.join(DOWNLOAD_DIR,bert+".danku")
-      self.danku=AutoModelTagger(d,["B","E","E2","E3","M","S"])
+      self.danku=AutoModelTagger(d,["B","E","E2","E3","M","S"],[("B","E"),("B","E2"),("B","E3"),("B","M"),("E","B"),("E","S"),("E2","E"),("E3","E2"),("M","E3"),("M","M"),("S","B"),("S","S")])
     else:
       self.danku=None
     self.gloss=MakeGloss()
@@ -167,19 +164,32 @@ class SuParKanbunTokenizer(object):
     return doc
 
 class AutoModelTagger(object):
-  def __init__(self,dir,label=None):
+  def __init__(self,dir,label=None,links=None):
     from suparkanbun.download import checkdownload
     from transformers import AutoModelForTokenClassification,AutoTokenizer
-    checkdownload(MODEL_URL+os.path.basename(dir)+"/",dir)
+    import numpy
+    checkdownload("KoichiYasuoka/SuPar-Kanbun","suparkanbun/models/"+os.path.basename(dir)+"/",dir)
     self.model=AutoModelForTokenClassification.from_pretrained(dir)
     self.tokenizer=AutoTokenizer.from_pretrained(dir)
     self.label=label if label else self.model.config.id2label
+    if links:
+      self.transition=numpy.full((len(self.label),len(self.label)),numpy.nan)
+      x=self.model.config.label2id
+      for f,t in links:
+        self.transition[x[f],x[t]]=0
+    else:
+      self.transition=numpy.zeros((len(self.label),len(self.label)))
   def __call__(self,text):
-    import torch
-    input=self.tokenizer.encode(text,return_tensors="pt")
-    output=self.model(input)
-    predict=torch.argmax(output[0],dim=2)
-    return [(t,self.label[p]) for t,p in zip(text,predict[0].tolist()[1:])]
+    import torch,numpy
+    v=self.tokenizer(text,return_offsets_mapping=True)
+    with torch.no_grad():
+      m=self.model(torch.tensor([v["input_ids"]])).logits[0].numpy()
+    for i in range(m.shape[0]-1,0,-1):
+      m[i-1]+=numpy.nanmax(m[i]+self.transition,axis=1)
+    p=[numpy.nanargmax(m[0])]
+    for i in range(1,m.shape[0]):
+      p.append(numpy.nanargmax(m[i]+self.transition[p[-1]]))
+    return [(text[t[0]:t[1]],self.label[q]) for t,q in zip(v["offset_mapping"],p) if t[0]<t[1]]
 
 class MakeGloss(object):
   def __init__(self,file=None):
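
In suparkanbun.py the rewritten AutoModelTagger no longer takes a per-token argmax: __init__ builds a transition matrix (0 for label pairs listed in links, NaN for forbidden transitions), and __call__ decodes the best label path under those constraints with a backward score pass followed by a greedy forward read-out, using the tokenizer's offset_mapping to drop special tokens. Below is a self-contained sketch of just the decoding step on made-up logits and a toy label set; the algorithm mirrors the diff, but the labels, links, and numbers are invented for illustration.

import numpy

labels=["B","E","M","S"]    # toy tag set, not the real danku labels
links=[("B","E"),("B","M"),("E","B"),("E","S"),("M","E"),("M","M"),("S","B"),("S","S")]

# Transition matrix as in __init__: 0 = allowed transition, NaN = forbidden.
x={t:i for i,t in enumerate(labels)}
transition=numpy.full((len(labels),len(labels)),numpy.nan)
for f,t in links:
  transition[x[f],x[t]]=0

# Made-up logits for a 4-character input (rows = positions, columns = labels).
m=numpy.array([[2.0,0.1,0.3,1.9],
               [0.2,2.1,1.0,0.1],
               [1.8,0.2,0.4,2.0],
               [0.1,2.2,0.3,1.7]])

# Backward pass: fold the best allowed continuation score into each position.
for i in range(m.shape[0]-1,0,-1):
  m[i-1]+=numpy.nanmax(m[i]+transition,axis=1)
# Forward pass: greedily follow the best transition allowed from the previous label.
p=[numpy.nanargmax(m[0])]
for i in range(1,m.shape[0]):
  p.append(numpy.nanargmax(m[i]+transition[p[-1]]))
print([labels[q] for q in p])    # ['B', 'E', 'B', 'E'] -- only allowed transitions appear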