Spaces:
Runtime error
Runtime error
fix(predictor): fix some error
Browse files
- docker-compose.yml +1 -0
- predictor/__init__.py +45 -23
- server.py +16 -3
docker-compose.yml
CHANGED
@@ -8,4 +8,5 @@ services:
|
|
8 |
- "50050:50051"
|
9 |
environment:
|
10 |
- HF_Token=${HF_Token}
|
|
|
11 |
|
|
|
8 |
- "50050:50051"
|
9 |
environment:
|
10 |
- HF_Token=${HF_Token}
|
11 |
+
- DEVICE=cpu
|
12 |
|
predictor/__init__.py
CHANGED
@@ -46,7 +46,7 @@ class Predictor():
|
|
46 |
+ r"空,曾,毋,沙,乜,养,鞠,须,丰,巢,关,蒯,相,查,后,荆,红,游,竺,权,逑,盖,益,桓,公,万俟,司马,上官,欧阳,夏侯,诸葛,闻人,东方,赫连,皇甫,尉迟," \
|
47 |
+ r"公羊,澹台,公冶,宗政,濮阳,淳于,单于,太叔,申屠,公孙,仲孙,轩辕,令狐,锺离,宇文,长孙,慕容,鲜于,闾丘,司徒,司空,丌官,司寇,仉,督,子车," \
|
48 |
+ r"颛孙,端木,巫马,公西,漆雕,乐正,壤驷,公良,拓拔,夹谷,宰父,谷梁,晋,楚,阎,法,汝,鄢,涂,钦,段干,百里,东郭,南门,呼延,归,海,羊舌,微生,岳," \
|
49 |
-
+ r"
|
50 |
first_name = r' {0,3}[\u4e00-\u9fa5]( {0,3}[\u4e00-\u9fa5]){0,3}'
|
51 |
self.name_pattern = re.compile(last_name + first_name)
|
52 |
self.phone_pattern = re.compile(r'1 {0,4}(3 {0,4}\d|4 {0,4}[5-9]|5 {0,4}[0-35-9]|6 {0,4}[2567]|7 {0,4}[0-8]|8 {0,4}\d|9 {0,4}[0-35-9]) {0,4}(\d {0,4}){8}')
|
@@ -106,14 +106,14 @@ class Predictor():
|
|
106 |
def to_date(self, datestr:str):
|
107 |
if re.match("^\d{4}$",datestr):
|
108 |
return date(int(datestr),1,1)
|
109 |
-
match = re.match("
|
110 |
if match is not None:
|
111 |
try:
|
112 |
-
|
113 |
-
|
|
|
114 |
except ValueError:
|
115 |
-
print(
|
116 |
-
raise
|
117 |
if datestr=="至今":
|
118 |
return self.today
|
119 |
return None
|
@@ -206,25 +206,44 @@ class Predictor():
|
|
206 |
# 获取名字,先过滤所有空白字符,防止名字中间有空格
|
207 |
remove_blanks_text, index_mapper = self.remove_blanks(text, re.compile(r' '))
|
208 |
start_time = time.perf_counter()
|
|
|
209 |
for block in self.split_to_blocks(remove_blanks_text):
|
210 |
block_text,block_l = block['text'],block['start']
|
211 |
entities = self.pipelines['name'](block_text)
|
212 |
for entity in entities:
|
213 |
-
if entity['entity']=='NAME'
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
228 |
end_time = time.perf_counter()
|
229 |
self.logger.info(f"process name time: {end_time-start_time}")
|
230 |
# 获取年龄
|
@@ -301,6 +320,8 @@ class Predictor():
|
|
301 |
break
|
302 |
if not repeat:
|
303 |
obj['origin'] = text[obj['start']:obj['end']]
|
|
|
|
|
304 |
return_obj['schools'].append(obj)
|
305 |
# 正则找学校
|
306 |
for school_match in re.finditer(r"|".join(self.schools.keys()), remove_blanks_text):
|
@@ -309,7 +330,7 @@ class Predictor():
|
|
309 |
'start': index_mapper[start],
|
310 |
'end': index_mapper[end-1]+1,
|
311 |
'entity': 'SCHOOL',
|
312 |
-
'text': school_match.group(),
|
313 |
}
|
314 |
repeat = False
|
315 |
for o in return_obj['schools']:
|
@@ -320,6 +341,7 @@ class Predictor():
|
|
320 |
obj['origin'] = text[obj['start']:obj['end']]
|
321 |
obj['level'] = self.schools[obj['text']]
|
322 |
return_obj['schools'].append(obj)
|
|
|
323 |
end_time = time.perf_counter()
|
324 |
self.logger.info(f"process school time: {end_time-start_time}")
|
325 |
start_time = time.perf_counter()
|
@@ -410,7 +432,7 @@ class Predictor():
|
|
410 |
diff_m = end.month-start.month
|
411 |
work_month += diff_y * 12 + diff_m
|
412 |
last_end = end
|
413 |
-
return_obj['work_time'] = math.ceil(work_month/12)
|
414 |
end_time = time.perf_counter()
|
415 |
self.logger.info(f"process work time: {end_time-start_time}")
|
416 |
start_time = time.perf_counter()
|
|
|
46 |
+ r"空,曾,毋,沙,乜,养,鞠,须,丰,巢,关,蒯,相,查,后,荆,红,游,竺,权,逑,盖,益,桓,公,万俟,司马,上官,欧阳,夏侯,诸葛,闻人,东方,赫连,皇甫,尉迟," \
|
47 |
+ r"公羊,澹台,公冶,宗政,濮阳,淳于,单于,太叔,申屠,公孙,仲孙,轩辕,令狐,锺离,宇文,长孙,慕容,鲜于,闾丘,司徒,司空,丌官,司寇,仉,督,子车," \
|
48 |
+ r"颛孙,端木,巫马,公西,漆雕,乐正,壤驷,公良,拓拔,夹谷,宰父,谷梁,晋,楚,阎,法,汝,鄢,涂,钦,段干,百里,东郭,南门,呼延,归,海,羊舌,微生,岳," \
|
49 |
+
+ r"帅,缑,亢,况,后,有,琴,梁丘,左丘,东门,西门,商,牟,佘,佴,伯,赏,南宫,墨,哈,谯,笪,年,爱,阳,佟,第五,言,福,邱,钟]"
|
50 |
first_name = r' {0,3}[\u4e00-\u9fa5]( {0,3}[\u4e00-\u9fa5]){0,3}'
|
51 |
self.name_pattern = re.compile(last_name + first_name)
|
52 |
self.phone_pattern = re.compile(r'1 {0,4}(3 {0,4}\d|4 {0,4}[5-9]|5 {0,4}[0-35-9]|6 {0,4}[2567]|7 {0,4}[0-8]|8 {0,4}\d|9 {0,4}[0-35-9]) {0,4}(\d {0,4}){8}')
|
|
|
106 |
def to_date(self, datestr: str):
    """Convert a date string into a ``datetime.date``.

    Recognised forms, checked in this order:

    * exactly four digits (``"2020"``) -> 1 January of that year;
    * four digits, one non-digit separator, then a one- or two-digit
      month (``"2020.07"``, ``"2020-7"``) -> the first day of that month,
      with the month clamped into the range 1..12;
    * the literal ``"至今"`` -> ``self.today``.

    Args:
        datestr: The text to interpret.

    Returns:
        A ``date`` for the recognised forms, otherwise ``None``. ``None`` is
        also returned when the digits parse but ``date`` rejects them
        (year ``0000``); that case is logged through ``self.logger``.
    """
    if re.match(r"^\d{4}$", datestr):
        year, month = int(datestr), 1
    else:
        match = re.match(r"^(\d{4})\D(\d{1,2})", datestr)
        if match is None:
            # Neither numeric form matched; only the "until now" marker is left.
            return self.today if datestr == "至今" else None
        year = int(match.group(1))
        # Clamp instead of rejecting so "2020.00" / "2020.13" still give a date.
        month = min(max(int(match.group(2)), 1), 12)
    try:
        return date(year, month, 1)
    except ValueError:
        # Only reachable for year 0, which is below date's minimum year.
        # Both numeric forms now degrade to None instead of the year-only
        # form raising while the year-month form was swallowed.
        self.logger.warning("invalid date string: %r", datestr)
        return None
|
|
|
206 |
# 获取名字,先过滤所有空白字符,防止名字中间有空格
|
207 |
remove_blanks_text, index_mapper = self.remove_blanks(text, re.compile(r' '))
|
208 |
start_time = time.perf_counter()
|
209 |
+
backup_name = []
|
210 |
for block in self.split_to_blocks(remove_blanks_text):
|
211 |
block_text,block_l = block['text'],block['start']
|
212 |
entities = self.pipelines['name'](block_text)
|
213 |
for entity in entities:
|
214 |
+
if entity['entity']=='NAME':
|
215 |
+
if self.name_pattern.match(entity['word']) is not None:
|
216 |
+
obj = {
|
217 |
+
'start': index_mapper[block_l+entity['start']],
|
218 |
+
'end': index_mapper[block_l+entity['end']-1]+1,
|
219 |
+
'entity': 'NAME',
|
220 |
+
'text': entity['word']
|
221 |
+
}
|
222 |
+
repeat = False
|
223 |
+
for o in return_obj['name']:
|
224 |
+
if obj['start']==o['start'] and obj['end']==o['end']:
|
225 |
+
repeat = True
|
226 |
+
break
|
227 |
+
if not repeat:
|
228 |
+
obj['origin'] = text[obj['start']:obj['end']]
|
229 |
+
return_obj['name'].append(obj)
|
230 |
+
else:
|
231 |
+
obj = {
|
232 |
+
'start': index_mapper[block_l+entity['start']],
|
233 |
+
'end': index_mapper[block_l+entity['end']-1]+1,
|
234 |
+
'entity': 'NAME',
|
235 |
+
'text': entity['word']
|
236 |
+
}
|
237 |
+
repeat = False
|
238 |
+
for o in return_obj['name']:
|
239 |
+
if obj['start']==o['start'] and obj['end']==o['end']:
|
240 |
+
repeat = True
|
241 |
+
break
|
242 |
+
if not repeat:
|
243 |
+
obj['origin'] = text[obj['start']:obj['end']]
|
244 |
+
backup_name.append(obj)
|
245 |
+
if len(return_obj['name'])==0:
|
246 |
+
return_obj['name'] = backup_name
|
247 |
end_time = time.perf_counter()
|
248 |
self.logger.info(f"process name time: {end_time-start_time}")
|
249 |
# 获取年龄
|
|
|
320 |
break
|
321 |
if not repeat:
|
322 |
obj['origin'] = text[obj['start']:obj['end']]
|
323 |
+
if "text" not in obj:
|
324 |
+
obj['text'] = obj['origin'].split("\n")[-1]
|
325 |
return_obj['schools'].append(obj)
|
326 |
# 正则找学校
|
327 |
for school_match in re.finditer(r"|".join(self.schools.keys()), remove_blanks_text):
|
|
|
330 |
'start': index_mapper[start],
|
331 |
'end': index_mapper[end-1]+1,
|
332 |
'entity': 'SCHOOL',
|
333 |
+
'text': school_match.group().split('\n')[-1],
|
334 |
}
|
335 |
repeat = False
|
336 |
for o in return_obj['schools']:
|
|
|
341 |
obj['origin'] = text[obj['start']:obj['end']]
|
342 |
obj['level'] = self.schools[obj['text']]
|
343 |
return_obj['schools'].append(obj)
|
344 |
+
return_obj['schools'] = sorted(return_obj['schools'], key=lambda x: x['start'])
|
345 |
end_time = time.perf_counter()
|
346 |
self.logger.info(f"process school time: {end_time-start_time}")
|
347 |
start_time = time.perf_counter()
|
|
|
432 |
diff_m = end.month-start.month
|
433 |
work_month += diff_y * 12 + diff_m
|
434 |
last_end = end
|
435 |
+
return_obj['work_time'] = max(math.ceil(work_month/12),0)
|
436 |
end_time = time.perf_counter()
|
437 |
self.logger.info(f"process work time: {end_time-start_time}")
|
438 |
start_time = time.perf_counter()
|
server.py
CHANGED
@@ -11,6 +11,7 @@ from datetime import date
|
|
11 |
|
12 |
HF_TOKEN = os.environ["HF_Token"]
|
13 |
PORT = os.environ.get("PORT", "50051")
|
|
|
14 |
login(HF_TOKEN)
|
15 |
|
16 |
class Resume(protos.resume_pb2_grpc.ResumeServicer):
|
@@ -22,15 +23,27 @@ class Resume(protos.resume_pb2_grpc.ResumeServicer):
|
|
22 |
pipeline=pipeline(
|
23 |
"textencode",
|
24 |
model="minskiter/cossim-bert-chinese-wwm-ext",
|
25 |
-
device=
|
26 |
trust_remote_code=True,
|
27 |
use_auth_token=True
|
28 |
)
|
29 |
)
|
30 |
self.predictor = Predictor(
|
31 |
pipelines={
|
32 |
-
"name": pipeline(
|
33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
},
|
35 |
paths=[
|
36 |
"data/W020230619818476939351.xls",
|
|
|
11 |
|
12 |
HF_TOKEN = os.environ["HF_Token"]
|
13 |
PORT = os.environ.get("PORT", "50051")
|
14 |
+
DEVICE = os.environ.get("DEVICE", "cpu")
|
15 |
login(HF_TOKEN)
|
16 |
|
17 |
class Resume(protos.resume_pb2_grpc.ResumeServicer):
|
|
|
23 |
pipeline=pipeline(
|
24 |
"textencode",
|
25 |
model="minskiter/cossim-bert-chinese-wwm-ext",
|
26 |
+
device=DEVICE,
|
27 |
trust_remote_code=True,
|
28 |
use_auth_token=True
|
29 |
)
|
30 |
)
|
31 |
self.predictor = Predictor(
|
32 |
pipelines={
|
33 |
+
"name": pipeline(
|
34 |
+
"nerpipe",
|
35 |
+
device=DEVICE,
|
36 |
+
model="minskiter/resume-token-classification-name-0708",
|
37 |
+
trust_remote_code=True,
|
38 |
+
use_auth_token=True
|
39 |
+
),
|
40 |
+
"common": pipeline(
|
41 |
+
"nerpipe",
|
42 |
+
model="minskiter/resume-token-classification",
|
43 |
+
device=DEVICE,
|
44 |
+
trust_remote_code=True,
|
45 |
+
use_auth_token=True
|
46 |
+
)
|
47 |
},
|
48 |
paths=[
|
49 |
"data/W020230619818476939351.xls",
|