xusong28
committed on
Commit
•
c10350f
1
Parent(s):
6e6846f
update
Browse files- app.py +66 -14
- app2.py +22 -0
- demo_chatbot.py +38 -0
- demo_corrector.py +67 -29
- demo_mlm.py +1 -2
- demo_ner.py +47 -0
- demo_sum.py +2 -18
- draft/bert_corrector_test.py +10 -0
- draft/demo_mlm2.py +19 -0
- draft/register.py +32 -0
- kplug/__init__.py +3 -0
- kplug/configuration_kplug.py +9 -0
- modeling_kplug.py → kplug/modeling_kplug.py +0 -0
- modeling_kplug_s2s_patch.py → kplug/modeling_kplug_s2s_patch.py +0 -0
- kplug_lm_test.py +0 -14
- requirements.txt +2 -1
app.py
CHANGED
@@ -1,22 +1,74 @@
|
|
1 |
# coding=utf-8
|
2 |
# author: xusong <xusong28@jd.com>
|
3 |
-
# time: 2022/8/
|
4 |
|
5 |
-
"""
|
6 |
-
https://gradio.app/docs/#tabbedinterface-header
|
7 |
-
|
8 |
-
## 更多任务
|
9 |
-
- 抽取式摘要
|
10 |
-
- 检索式对话 、 抽取式问答
|
11 |
-
-
|
12 |
-
"""
|
13 |
|
14 |
import gradio as gr
|
15 |
-
|
16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
|
|
|
|
|
18 |
|
19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
|
21 |
-
|
22 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
# coding=utf-8
|
2 |
# author: xusong <xusong28@jd.com>
|
3 |
+
# time: 2022/8/26 13:14
|
4 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
|
6 |
import gradio as gr
|
7 |
+
import operator
|
8 |
+
import torch
|
9 |
+
from transformers import BertTokenizer, BertForMaskedLM
|
10 |
+
|
11 |
+
tokenizer = BertTokenizer.from_pretrained("shibing624/macbert4csc-base-chinese")
|
12 |
+
model = BertForMaskedLM.from_pretrained("shibing624/macbert4csc-base-chinese")
|
13 |
+
|
14 |
+
|
15 |
+
def ai_text(text):
    """Correct spelling errors in ``text`` with the module-level model.

    The sentence is run through ``model`` (tokenized by ``tokenizer``), the
    highest-scoring token at every position is decoded, and the decoded
    string is aligned character by character against the input.

    Args:
        text: The sentence to correct.

    Returns:
        A 2-tuple ``(highlight, details)``:

        * ``highlight`` is ``{"text": corrected_text, "entities": [...]}``
          with one entity dict per replaced character.
        * ``details`` is a list of
          ``(original_char, corrected_char, start, end)`` tuples.
    """
    with torch.no_grad():
        outputs = model(**tokenizer([text], padding=True, return_tensors='pt'))

    # Characters that are copied back verbatim from the original text.
    # NOTE(review): the original comment calls these "unk" words, presumably
    # because the tokenizer cannot round-trip them -- confirm.
    # Built once per call instead of once per loop iteration.
    passthrough_chars = frozenset(
        [' ', '“', '”', '‘', '’', '琊', '\n', '…', '—', '擤'])

    def to_ner(corrected_sent, errs):
        """Convert the error tuples into the ``text``/``entities`` dict."""
        entities = [
            {"entity": "纠错", "word": err[1], "start": err[2], "end": err[3]}
            for err in errs
        ]
        return {"text": corrected_sent, "entities": entities}

    def get_errors(corrected_text, origin_text):
        """Align ``corrected_text`` with ``origin_text`` and list the edits."""
        sub_details = []
        for i, ori_char in enumerate(origin_text):
            if ori_char in passthrough_chars:
                # Re-insert the original character so later offsets line up.
                corrected_text = corrected_text[:i] + ori_char + corrected_text[i:]
                continue
            if i >= len(corrected_text):
                # The decoded text is shorter than the input; nothing to compare.
                continue
            if ori_char != corrected_text[i]:
                if ori_char.lower() == corrected_text[i]:
                    # Only the letter case differs: keep the original character.
                    corrected_text = corrected_text[:i] + ori_char + corrected_text[i + 1:]
                    continue
                sub_details.append((ori_char, corrected_text[i], i, i + 1))
        # Entries are appended in increasing offset order, so no extra sort
        # by start offset is required.
        return corrected_text, sub_details

    _text = tokenizer.decode(
        torch.argmax(outputs.logits[0], dim=-1),
        skip_special_tokens=True,
    ).replace(' ', '')
    corrected_text = _text[:len(text)]
    corrected_text, details = get_errors(corrected_text, text)
    print(text, ' => ', corrected_text, details)
    return to_ner(corrected_text, details), details
|
47 |
+
|
48 |
|
49 |
+
if __name__ == '__main__':
|
50 |
+
print(ai_text('少先队员因该为老人让坐'))
|
51 |
|
52 |
+
examples = [
|
53 |
+
['真麻烦你了。希望你们好好的跳无'],
|
54 |
+
['少先队员因该为老人让坐'],
|
55 |
+
['机七学习是人工智能领遇最能体现智能的一个分知'],
|
56 |
+
['今天心情很好'],
|
57 |
+
['他法语说的很好,的语也不错'],
|
58 |
+
['他们的吵翻很不错,再说他们做的咖喱鸡也好吃'],
|
59 |
+
]
|
60 |
|
61 |
+
gr.Interface(
|
62 |
+
ai_text,
|
63 |
+
inputs="textbox",
|
64 |
+
outputs=[
|
65 |
+
gr.outputs.HighlightedText(
|
66 |
+
label="Output",
|
67 |
+
show_legend=True,
|
68 |
+
),
|
69 |
+
gr.outputs.JSON()
|
70 |
+
],
|
71 |
+
title="Chinese Spelling Correction Model shibing624/macbert4csc-base-chinese",
|
72 |
+
description="Copy or input error Chinese text. Submit and the machine will correct text.",
|
73 |
+
article="Link to <a href='https://github.com/shibing624/pycorrector' style='color:blue;' target='_blank\'>Github REPO</a>",
|
74 |
+
examples=examples).launch()
|
app2.py
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# coding=utf-8
|
2 |
+
# author: xusong <xusong28@jd.com>
|
3 |
+
# time: 2022/8/23 16:06
|
4 |
+
|
5 |
+
"""
|
6 |
+
https://gradio.app/docs/#tabbedinterface-header
|
7 |
+
|
8 |
+
## 更多任务
|
9 |
+
- 抽取式摘要
|
10 |
+
- 检索式对话 、 抽取式问答
|
11 |
+
-
|
12 |
+
"""
|
13 |
+
|
14 |
+
import gradio as gr
|
15 |
+
from demo_sum import sum_iface
|
16 |
+
from demo_mlm import mlm_iface
|
17 |
+
|
18 |
+
|
19 |
+
# Tab labels are matched to interfaces by position, so both lists must have the
# same length. The former third label "句子纠错" had no interface behind it.
# TODO: add the sentence-correction interface together with its label once it
# is imported into this module.
demo = gr.TabbedInterface([sum_iface, mlm_iface], ["生成式摘要", "文本填词"])
|
20 |
+
|
21 |
+
if __name__ == "__main__":
|
22 |
+
demo.launch()
|
demo_chatbot.py
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# coding=utf-8
|
2 |
+
# author: xusong <xusong28@jd.com>
|
3 |
+
# time: 2022/8/25 16:57
|
4 |
+
|
5 |
+
"""
|
6 |
+
|
7 |
+
https://gradio.app/creating_a_chatbot/
|
8 |
+
"""
|
9 |
+
|
10 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
11 |
+
import torch
|
12 |
+
import gradio as gr
|
13 |
+
|
14 |
+
tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
|
15 |
+
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-medium")
|
16 |
+
|
17 |
+
|
18 |
+
def predict(input, history=None):
    """Generate the next chatbot reply and return the updated conversation.

    Args:
        input: The new user utterance. The name shadows the builtin but is
            kept because it is part of the existing signature.
        history: Token ids of the conversation so far, as returned by a
            previous call. ``None`` or an empty list starts a new conversation.

    Returns:
        A 2-tuple ``(response, history)``: ``response`` is a list of
        ``(first, second)`` text pairs obtained by splitting the decoded
        conversation on ``"<|endoftext|>"``; ``history`` is the generated
        token ids as nested lists.
    """
    # Avoid a shared mutable default argument.
    if history is None:
        history = []

    # Tokenize the new input sentence, terminated by the EOS token.
    new_user_input_ids = tokenizer.encode(input + tokenizer.eos_token, return_tensors='pt')

    # Append the new user input tokens to the chat history.
    bot_input_ids = torch.cat([torch.LongTensor(history), new_user_input_ids], dim=-1)

    # Generate a response; the output contains the whole conversation.
    history = model.generate(bot_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id).tolist()

    # Decode, split into turns, and pair consecutive turns; a trailing
    # unpaired chunk is dropped.
    turns = tokenizer.decode(history[0]).split("<|endoftext|>")
    response = list(zip(turns[0::2], turns[1::2]))
    return response, history
|
32 |
+
|
33 |
+
|
34 |
+
|
35 |
+
|
36 |
+
gr.Interface(fn=predict,
|
37 |
+
inputs=["text", "state"],
|
38 |
+
outputs=["chatbot", "state"]).launch()
|
demo_corrector.py
CHANGED
@@ -2,48 +2,86 @@
|
|
2 |
# author: xusong <xusong28@jd.com>
|
3 |
# time: 2022/8/23 17:08
|
4 |
|
5 |
-
|
|
|
6 |
import gradio as gr
|
7 |
from transformers import FillMaskPipeline
|
8 |
from transformers import BertTokenizer
|
9 |
-
from modeling_kplug import KplugForMaskedLM
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
|
11 |
-
|
12 |
-
|
13 |
-
|
|
|
14 |
|
|
|
|
|
|
|
15 |
|
16 |
-
|
17 |
-
|
|
|
|
|
18 |
|
19 |
-
# fill mask
|
20 |
-
def fill_mask(text):
|
21 |
-
fill_masker = FillMaskPipeline(model=model, tokenizer=tokenizer)
|
22 |
-
outputs = fill_masker(text)
|
23 |
-
return {i["token_str"]: i["score"] for i in outputs}
|
24 |
|
|
|
25 |
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
]
|
31 |
|
32 |
-
|
33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
inputs=gr.inputs.Textbox(
|
35 |
label="输入文本",
|
36 |
-
default="
|
37 |
-
outputs=
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
|
|
|
|
|
|
45 |
)
|
46 |
|
47 |
if __name__ == "__main__":
|
48 |
-
#
|
49 |
-
|
|
|
|
2 |
# author: xusong <xusong28@jd.com>
|
3 |
# time: 2022/8/23 17:08
|
4 |
|
5 |
+
import time
|
6 |
+
import torch
|
7 |
import gradio as gr
|
8 |
from transformers import FillMaskPipeline
|
9 |
from transformers import BertTokenizer
|
10 |
+
from kplug.modeling_kplug import KplugForMaskedLM
|
11 |
+
from pycorrector.bert.bert_corrector import BertCorrector
|
12 |
+
from pycorrector import config
|
13 |
+
from loguru import logger
|
14 |
+
|
15 |
+
device_id = 0 if torch.cuda.is_available() else -1
|
16 |
+
|
17 |
+
|
18 |
+
class KplugCorrector(BertCorrector):
    """Corrector that wraps a KPLUG masked-LM in a fill-mask pipeline.

    After construction, ``self.model`` holds the ``FillMaskPipeline`` and
    ``self.mask`` the tokenizer's mask token.
    """

    def __init__(self, bert_model_dir=config.bert_model_dir, device=device_id):
        """Load the tokenizer and model and build the pipeline.

        Args:
            bert_model_dir: Accepted for signature compatibility.
                NOTE(review): it is currently not used; the checkpoint is
                always read from ``"models/pretrain/"`` -- confirm intent.
            device: Device index passed to ``FillMaskPipeline``.
        """
        # super(BertCorrector, self) starts the lookup *after* BertCorrector,
        # so BertCorrector.__init__ itself is not executed.
        # NOTE(review): presumably to avoid loading its default model -- confirm.
        super(BertCorrector, self).__init__()
        self.name = 'kplug_corrector'
        t1 = time.time()

        model_dir = "models/pretrain/"
        tokenizer = BertTokenizer.from_pretrained(model_dir)
        model = KplugForMaskedLM.from_pretrained(model_dir)

        self.model = FillMaskPipeline(model=model, tokenizer=tokenizer, device=device)
        if self.model:
            self.mask = self.model.tokenizer.mask_token
            # Report the directory that was actually loaded, not the unused
            # bert_model_dir argument.
            logger.debug('Loaded bert model: %s, spend: %.3f s.' % (model_dir, time.time() - t1))
|
33 |
|
|
|
|
|
|
|
|
|
|
|
34 |
|
35 |
+
# corrector = KplugCorrector()
|
36 |
|
37 |
+
error_sentences = [
|
38 |
+
'少先队员因该为老人让坐',
|
39 |
+
'机七学习是人工智能领遇最能体现智能的一个分知',
|
40 |
+
'今天心情很好',
|
41 |
]
|
42 |
|
43 |
+
|
44 |
+
def mock_data():
    """Return a canned ``(corrected_sentence, errors)`` pair.

    Each error is an ``(original_char, corrected_char, start, end)`` tuple.
    A new list is built on every call.
    """
    errs = [
        ('七', '器', 1, 2),
        ('遇', '域', 10, 11),
    ]
    corrected_sent = '机器学习是人工智能领域最能体现智能的一个分知'
    return corrected_sent, errs
|
48 |
+
|
49 |
+
|
50 |
+
def correct(sent):
    """Return the correction result for ``sent``.

    The real model call is currently disabled, so the canned result of
    ``mock_data()`` is returned whatever ``sent`` is; ``sent`` only appears
    in the printed message.

    Returns:
        A 2-tuple ``(highlight, errs)``: ``highlight`` is
        ``{"text": corrected_sentence, "entities": [...]}`` with one entity
        per error (fixed score 0.5); ``errs`` is the raw error list.
    """
    # corrected_sent, errs = corrector.bert_correct(sent)
    fixed_sent, found = mock_data()
    print("original sentence:{} => {}, err:{}".format(sent, fixed_sent, found))

    entities = []
    for _wrong, right, begin, stop in found:
        entities.append(
            {"entity": "纠错", "score": 0.5, "word": right, "start": begin, "end": stop}
        )
    return {"text": fixed_sent, "entities": entities}, found
|
58 |
+
|
59 |
+
|
60 |
+
def test():
    """Run the KPLUG corrector over ``error_sentences`` and print each result."""
    # The module-level `corrector = KplugCorrector()` is commented out, so the
    # name would be undefined here. Build the corrector locally instead; the
    # model is then only loaded when this function is actually called.
    corrector = KplugCorrector()
    for sent in error_sentences:
        corrected_sent, err = corrector.bert_correct(sent)
        print("original sentence:{} => {}, err:{}".format(sent, corrected_sent, err))
|
64 |
+
|
65 |
+
|
66 |
+
corr_iface = gr.Interface(
|
67 |
+
fn=correct,
|
68 |
inputs=gr.inputs.Textbox(
|
69 |
label="输入文本",
|
70 |
+
default="少先队员因该为老人让坐"),
|
71 |
+
outputs=[
|
72 |
+
gr.HighlightedText(
|
73 |
+
label="纠错",
|
74 |
+
show_legend=True,
|
75 |
+
# visible=False
|
76 |
+
),
|
77 |
+
gr.JSON()
|
78 |
+
],
|
79 |
+
examples=error_sentences,
|
80 |
+
title="文本纠错(Corrector)",
|
81 |
+
description='自动对汉语文本中的拼写、语法、标点等多种问题进行纠错校对,提示错误位置并返回修改建议'
|
82 |
)
|
83 |
|
84 |
if __name__ == "__main__":
|
85 |
+
# test()
|
86 |
+
# correct("少先队员因该为老人让坐")
|
87 |
+
corr_iface.launch()
|
demo_mlm.py
CHANGED
@@ -20,13 +20,12 @@ interface = gr.Interface.load(
|
|
20 |
import gradio as gr
|
21 |
from transformers import FillMaskPipeline
|
22 |
from transformers import BertTokenizer
|
23 |
-
from modeling_kplug import KplugForMaskedLM
|
24 |
|
25 |
model_dir = "models/pretrain/"
|
26 |
tokenizer = BertTokenizer.from_pretrained(model_dir)
|
27 |
model = KplugForMaskedLM.from_pretrained(model_dir)
|
28 |
|
29 |
-
|
30 |
# fill mask
|
31 |
def fill_mask(text):
|
32 |
fill_masker = FillMaskPipeline(model=model, tokenizer=tokenizer)
|
|
|
20 |
import gradio as gr
|
21 |
from transformers import FillMaskPipeline
|
22 |
from transformers import BertTokenizer
|
23 |
+
from kplug.modeling_kplug import KplugForMaskedLM
|
24 |
|
25 |
model_dir = "models/pretrain/"
|
26 |
tokenizer = BertTokenizer.from_pretrained(model_dir)
|
27 |
model = KplugForMaskedLM.from_pretrained(model_dir)
|
28 |
|
|
|
29 |
# fill mask
|
30 |
def fill_mask(text):
|
31 |
fill_masker = FillMaskPipeline(model=model, tokenizer=tokenizer)
|
demo_ner.py
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# coding=utf-8
|
2 |
+
# author: xusong <xusong28@jd.com>
|
3 |
+
# time: 2022/8/25 16:57
|
4 |
+
|
5 |
+
|
6 |
+
"""
|
7 |
+
|
8 |
+
## ner demo
|
9 |
+
- https://gradio.app/named_entity_recognition/
|
10 |
+
- https://huggingface.co/dslim/bert-base-NER?text=My+name+is+Wolfgang+and+I+live+in+Berlin
|
11 |
+
|
12 |
+
"""
|
13 |
+
|
14 |
+
from transformers import pipeline
|
15 |
+
|
16 |
+
import gradio as gr
|
17 |
+
|
18 |
+
ner_pipeline = pipeline("ner")
|
19 |
+
|
20 |
+
examples = [
|
21 |
+
"Does Chicago have any stores and does Joe live here?",
|
22 |
+
]
|
23 |
+
|
24 |
+
import json
|
25 |
+
|
26 |
+
def ner(text):
    """Run the module-level NER pipeline on ``text``.

    Returns:
        A 2-tuple ``(highlight, entities)``: ``highlight`` is
        ``{"text": text, "entities": entities}`` and ``entities`` is the
        pipeline output with every ``"score"`` converted to ``float``.
    """
    entities = ner_pipeline(text)
    # Cast scores to builtin float in place.
    # NOTE(review): presumably so the result is JSON-serializable -- confirm.
    for entity in entities:
        entity["score"] = float(entity["score"])
    highlight = {"text": text, "entities": entities}
    return highlight, entities
|
32 |
+
|
33 |
+
|
34 |
+
demo = gr.Interface(
|
35 |
+
ner,
|
36 |
+
inputs=gr.Textbox(placeholder="Enter sentence here..."),
|
37 |
+
outputs=
|
38 |
+
[
|
39 |
+
gr.HighlightedText(
|
40 |
+
label="NER",
|
41 |
+
show_legend=True,
|
42 |
+
),
|
43 |
+
gr.JSON(),
|
44 |
+
],
|
45 |
+
examples=examples)
|
46 |
+
|
47 |
+
demo.launch()
|
demo_sum.py
CHANGED
@@ -12,36 +12,20 @@ promp参数
|
|
12 |
|
13 |
import torch
|
14 |
import gradio as gr
|
15 |
-
import modeling_kplug_s2s_patch
|
16 |
-
from transformers import BertTokenizer,
|
17 |
|
18 |
# 改成 huggingface-model自动模型
|
19 |
model_dir = "models/ft_cepsum_jiadian/"
|
20 |
model = BartForConditionalGeneration.from_pretrained(model_dir) # cnn指的是cnn daily mail
|
21 |
tokenizer = BertTokenizer.from_pretrained(model_dir)
|
22 |
|
23 |
-
|
24 |
def summarize(text):
|
25 |
inputs = tokenizer([text], max_length=512, return_tensors="pt")
|
26 |
summary_ids = model.generate(inputs["input_ids"][:, 1:], num_beams=4, min_length=20, max_length=100)
|
27 |
summary = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
|
28 |
return summary[0]
|
29 |
|
30 |
-
|
31 |
-
# TODO:
|
32 |
-
# 1. 下拉框,选择类目。 gr.Radio(['服饰','箱包', '鞋靴']
|
33 |
-
# 2. 支持NER、LM、Corrector
|
34 |
-
# beam seach参数
|
35 |
-
# promp参数
|
36 |
-
|
37 |
-
"""
|
38 |
-
九阳电水壶热水壶家用电热水壶烧水壶304不锈钢快速加热容量开水壶无缝内胆九阳家用电水壶,贴心细节倾情陪你,沸腾仅需一首歌的时间,严选,以茶会友,早起,一杯柠檬水润肠道,食品级304不锈钢材质,赏心悦目厨房中的艺术品,无缝一体内胆,下班小聚,英国进口STRIX温控器,1.7L大容量,产品全家福,防烫保温杯体,材质,产品信息,可收纳底盘,加热开关,304不锈钢,大口径75°,广角开盖,给你一杯水的关怀,3-6秒确保,英国进口温控器,将多余电源线收纳在,更能确保每一杯水完全沸腾延时沸腾,不锈钢内胆,隔热层减缓热量散失速度,双层防烫,不锈钢滤网,自动断电,起到物理保温效果,不锈钢内盖,内部进行清洁,1800w超大加热底盘,能够让水快速沸腾,经久耐用,优质温控器12000次长使用寿命,碰撞简约优雅气息,满足一家人多种饮水需求,底盘,不锈钢壶
|
39 |
-
长虹2匹变频度精准控温立柜式冷暖空调白色节排,科技美,美,不止一点,健康美,8是,静音美,品质美,全自动,内外双静音安诸宁静,专利降噪,全铜管,0.1度精微感,层层过滤清新健康,美观大方,黑色加密滤网,美ei,宽电压启动,长虹空调所用钢材为攀钢与长虹共同研发多适用高密度钢板,由整张双面镀锌,一次冲压成型,没有焊接口,经过多次喷涂处理,使用寿命长。加密滤网,少风道内涡流现象,在不降低空调运行风量的前提下,还原室内外安湄。减少温度过调,轴流风扇,自动开机,带来的能源浪费,长虹空调的智能全自动,不仅能自动运作,同时空调上所具备的生态功能将,调节室温,省电恒温状态,选择模式,控制风速风向,低噪品牌压缩机,控温水平大大提升,新型风道技术,全部启动,创造理想家居环境。长虹空调
|
40 |
-
马兰士音响音箱家庭影院网络音频播放机双频段蓝牙黑色“乐”享新生活,Hi-Fi新时代,搭载高配置耳放,HEOs无线多房间音乐模块,精制优选元器件,广泛连接数字音乐,全面无线连接,远程控制,套装组合好搭档,氰听无损音乐,前后面板指示图,连接,输出,为原音而生与优雅相伴,全分离式电路,耳机放大器。精妙纯净,声场宽广。技术,尺寸,MarantzNA6006网络音频播放机,解码,声道,拓展,能连接计算机、外部硬盘、智能手机和平板电脑。镀金端口设计,电脑、外部硬盘、NAS、智能移动设备,高品质信号传输,前置USB-A端口可连接USB储存移动设备。音乐推送,模拟音频电路配备马兰士HDAM-SA2模块,低-中-高增益选择,匹配各种不同阻抗的耳机,灵活获得非凡体验。搭载HEOS无线多房间音乐模块可与所有支持该项技术的马兰士功放,宽440mm高106mm(
|
41 |
-
空气道壁挂式新风机系统家用除雾霾除甲醛白色净之本源,产品详情,噪音低至28分贝,键操控,智能辅热无惧严寒,让家中空气尽在掌握,EC电机2天仅耗1度电,0.5度,多种模式选择,五重纯物理过滤,大风量以一抵三,创新时代畅享新风30,空气道在民用新风领域率先采用H14级过滤材料对细颗粒物,二氧化碳浓度,去除有害微生物及过敏源>99%,PM2.5日均浓度,五重纯物理过滤系统将室外的新鲜空气经过物理过滤,后引入室内,无二次污染不产生对人体造成危害的臭,空气道起源于德国,为将先进的新风设计理念注入到空气道产品中,创,的单次通过滤除效率>99%,除去雾钼>99%,小时开机耗电仅0.5度*,绿色节能,氧,在增加室内含氧量的同时长效保护家人健康,在冬日也无惧室外严寒,在拥有室内清新空气,甲醛浓度
|
42 |
-
海尔8公斤节能静音高温消毒烫烫净全自动滚筒洗衣机靠实力��话,一掌控时间掌控自由,i-time智能时间洗,8公斤容量全家衣物一次清洗,细节绝不含糊,真正实力派,自动添加洗衣盒,洗羽绒服,就要专属程序,羊毛,牛仔,习绒,海尔洗衣机蓝晶系列滚筒,个性范儿,按照程序需求自动冲入洗衣机内,灵活旋钮,创新下排水洁净不残留,强力筋内筒,AMT防霉窗垫,LED大屏显示,洗衣液,消毒剂分别置放在洗衣盒中,从根本上解决污水残留问题避免,全新LD面板显示,更宽阔更大气操作信息一目了然,宽阔大气操作信息一目了然,右槽:消毒剂,简化洗衣程序,弹力筋中间的凹槽内分布,无残留排水模块,海尔洗衣机具有专业级羽绒洗护程序,为羽绒服营造洗护,一体化环境彻底告别手洗或者机洗,左槽:洗涤剂,我的智慧生活,中槽:柔顺剂,满足各种洗涤需求,告别昂贵洗衣店,自家
|
43 |
-
"""
|
44 |
-
|
45 |
sum_examples = [
|
46 |
"美的对开门风冷无霜家用智能电冰箱波光金纤薄机身高颜值助力保鲜,美的家居风,尺寸说明:M以上的距离尤其是左右两侧距离必须保证。关于尺寸的更多问题可,LED冷光源,纤薄机身,风冷无霜,智能操控,远程调温,节能静音,照亮你的视野,535L大容量,系统散热和使用的便利性,建议左右两侧、顶部和背部需要预留10C,电源线和调平脚等。冰箱放置时为保证,菜谱推荐,半开门俯视图,全开门俯视图,预留参考图",
|
47 |
"爱家乐新加坡电风扇静音无叶风扇健康空气循环扇儿童球形风扇落地扇外观,宁静节能,产品结构,现代科技的结晶,品质,气家,未来风新时代,动里,空让,健康,低至13分贝/DC直流马达/低耗24,亲密玩伴,24W功率,/低耗,别加坡国民品牌,气流通道,增强室内空气运动,过尘栅网,1-12档风力调速,涡轮风扇,吸气口,大于6米随心掌控,电源适配暑,装箱明细,摆头角度,手动摇摆轨道,操作方式,与空调同时使用不仅可以让室温快速均衡作,电源插口,适用环境,还可以在短时间内,导引出风口,产品类型,快件重量,电机,暖空气向上冷空气向下,线长,使房间温度均衡,省电环保,定时,功率,将凉风或热风送给到附近的房间,轻松享受生活,左右自动(上下手动)摇摆9度,进风口,能够很快中和空气温度差",
|
|
|
12 |
|
13 |
import torch
|
14 |
import gradio as gr
|
15 |
+
from kplug import modeling_kplug_s2s_patch
|
16 |
+
from transformers import BertTokenizer, BartForConditionalGeneration
|
17 |
|
18 |
# 改成 huggingface-model自动模型
|
19 |
model_dir = "models/ft_cepsum_jiadian/"
|
20 |
model = BartForConditionalGeneration.from_pretrained(model_dir) # cnn指的是cnn daily mail
|
21 |
tokenizer = BertTokenizer.from_pretrained(model_dir)
|
22 |
|
|
|
23 |
def summarize(text):
    """Generate a summary of ``text`` with the module-level seq2seq model.

    Args:
        text: The input text; it is truncated to at most 512 tokens.

    Returns:
        The decoded summary string for the single input.
    """
    # Request truncation explicitly: `max_length` on its own relies on an
    # implicit fallback and triggers a tokenizer warning.
    inputs = tokenizer([text], max_length=512, truncation=True, return_tensors="pt")
    # The first token id of every sequence is sliced off before generation.
    # NOTE(review): presumably this drops the leading [CLS] token -- confirm.
    summary_ids = model.generate(inputs["input_ids"][:, 1:], num_beams=4, min_length=20, max_length=100)
    summary = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
    return summary[0]
|
28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
sum_examples = [
|
30 |
"美的对开门风冷无霜家用智能电冰箱波光金纤薄机身高颜值助力保鲜,美的家居风,尺寸说明:M以上的距离尤其是左右两侧距离必须保证。关于尺寸的更多问题可,LED冷光源,纤薄机身,风冷无霜,智能操控,远程调温,节能静音,照亮你的视野,535L大容量,系统散热和使用的便利性,建议左右两侧、顶部和背部需要预留10C,电源线和调平脚等。冰箱放置时为保证,菜谱推荐,半开门俯视图,全开门俯视图,预留参考图",
|
31 |
"爱家乐新加坡电风扇静音无叶风扇健康空气循环扇儿童球形风扇落地扇外观,宁静节能,产品结构,现代科技的结晶,品质,气家,未来风新时代,动里,空让,健康,低至13分贝/DC直流马达/低耗24,亲密玩伴,24W功率,/低耗,别加坡国民品牌,气流通道,增强室内空气运动,过尘栅网,1-12档风力调速,涡轮风扇,吸气口,大于6米随心掌控,电源适配暑,装箱明细,摆头角度,手动摇摆轨道,操作方式,与空调同时使用不仅可以让室温快速均衡作,电源插口,适用环境,还可以在短时间内,导引出风口,产品类型,快件重量,电机,暖空气向上冷空气向下,线长,使房间温度均衡,省电环保,定时,功率,将凉风或热风送给到附近的房间,轻松享受生活,左右自动(上下手动)摇摆9度,进风口,能够很快中和空气温度差",
|
draft/bert_corrector_test.py
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# coding=utf-8
|
2 |
+
# author: xusong <xusong28@jd.com>
|
3 |
+
# time: 2022/8/25 15:49
|
4 |
+
|
5 |
+
|
6 |
+
from transformers import pipeline
|
7 |
+
|
8 |
+
if __name__ == "__main__":
|
9 |
+
classifier = pipeline("fill-mask")
|
10 |
+
classifier("Paris is the <mask> of France.")
|
draft/demo_mlm2.py
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# coding=utf-8
|
2 |
+
# author: xusong <xusong28@jd.com>
|
3 |
+
# time: 2022/8/25 10:19
|
4 |
+
|
5 |
+
"""
|
6 |
+
未完成
|
7 |
+
|
8 |
+
https://github.com/shibing624/pycorrector/blob/6f1b6ea35c60700f08463950e11ae0963ed512b2/pycorrector/bert/bert_corrector.py#L30
|
9 |
+
"""
|
10 |
+
import register
|
11 |
+
from transformers import pipeline
|
12 |
+
|
13 |
+
bert_model_dir = "models/pretrain/"
|
14 |
+
model = pipeline('fill-mask',
|
15 |
+
model=bert_model_dir,
|
16 |
+
tokenizer=bert_model_dir)
|
17 |
+
|
18 |
+
predicts = model("这款连[MASK]裙真漂亮")
|
19 |
+
print(predicts)
|
draft/register.py
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# coding=utf-8
|
2 |
+
# author: xusong <xusong28@jd.com>
|
3 |
+
# time: 2022/8/25 10:49
|
4 |
+
|
5 |
+
"""
|
6 |
+
|
7 |
+
注册后,才能被AutoConfig识别
|
8 |
+
https://github.com/huggingface/transformers/blob/main/src/transformers/models/auto/configuration_auto.py
|
9 |
+
https://github.com/huggingface/transformers/blob/main/src/transformers/models/auto/modeling_auto.py
|
10 |
+
"""
|
11 |
+
|
12 |
+
import transformers
|
13 |
+
# import kplug
|
14 |
+
#
|
15 |
+
# transformers.models.kplug = kplug
|
16 |
+
|
17 |
+
from transformers.models.auto.modeling_auto import MODEL_MAPPING_NAMES, MODEL_WITH_LM_HEAD_MAPPING_NAMES, MODEL_FOR_MASKED_LM_MAPPING_NAMES, CONFIG_MAPPING_NAMES
|
18 |
+
|
19 |
+
|
20 |
+
CONFIG_MAPPING_NAMES["kplug"] = "BertConfig"
|
21 |
+
MODEL_MAPPING_NAMES["kplug"] = "KplugModel"
|
22 |
+
MODEL_WITH_LM_HEAD_MAPPING_NAMES["kplug"] = "KplugForMaskedLM"
|
23 |
+
MODEL_FOR_MASKED_LM_MAPPING_NAMES["kplug"] = "KplugForMaskedLM"
|
24 |
+
|
25 |
+
# CONFIG_MAPPING = _LazyConfigMapping(CONFIG_MAPPING_NAMES)
|
26 |
+
|
27 |
+
from transformers.models.auto.configuration_auto import CONFIG_MAPPING
|
28 |
+
|
29 |
+
|
30 |
+
CONFIG_MAPPING["kplug"] = "BertConfig"
|
31 |
+
|
32 |
+
|
kplug/__init__.py
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
# coding=utf-8
|
2 |
+
# author: xusong <xusong28@jd.com>
|
3 |
+
# time: 2022/8/25 15:31
|
kplug/configuration_kplug.py
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# coding=utf-8
|
2 |
+
# author: xusong <xusong28@jd.com>
|
3 |
+
# time: 2022/8/25 15:33
|
4 |
+
|
5 |
+
|
6 |
+
from transformers.models.bert.configuration_bert import BertConfig
|
7 |
+
|
8 |
+
class KplugConfig(BertConfig):
    """Configuration for KPLUG models; adds nothing on top of ``BertConfig``."""
|
modeling_kplug.py → kplug/modeling_kplug.py
RENAMED
File without changes
|
modeling_kplug_s2s_patch.py → kplug/modeling_kplug_s2s_patch.py
RENAMED
File without changes
|
kplug_lm_test.py
DELETED
@@ -1,14 +0,0 @@
|
|
1 |
-
# coding=utf-8
|
2 |
-
# author: xusong <xusong28@jd.com>
|
3 |
-
# time: 2021/9/17 17:43
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
#
|
8 |
-
# from transformers import pipeline
|
9 |
-
# nlp_fill = pipeline('fill-mask', topk=10)
|
10 |
-
#
|
11 |
-
#
|
12 |
-
# def from_url():
|
13 |
-
# MODEL_PATH = "http://storage.jd.com/language-models/kplug/huggingface/pytorch_model.bin"
|
14 |
-
# model = KplugForMaskedLM.from_pretrained(MODEL_PATH)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
requirements.txt
CHANGED
@@ -1,2 +1,3 @@
|
|
1 |
transformers
|
2 |
-
torch
|
|
|
|
1 |
transformers
|
2 |
+
torch
|
3 |
+
pycorrector
|