xusong28
committed on
Commit
•
dcf7e4b
1
Parent(s):
01920f9
add decoding strategy
Browse files
- demo_chatbot_jddc.py +4 -0
- demo_sum.py +136 -21
- demo_sum_diverse.py +0 -67
- info.py +2 -0
demo_chatbot_jddc.py
CHANGED
@@ -2,6 +2,10 @@
|
|
2 |
# author: xusong <xusong28@jd.com>
|
3 |
# time: 2022/9/05 14:12
|
4 |
|
|
|
|
|
|
|
|
|
5 |
import torch
|
6 |
import gradio as gr
|
7 |
from info import article
|
2 |
# author: xusong <xusong28@jd.com>
|
3 |
# time: 2022/9/05 14:12
|
4 |
|
5 |
+
"""
|
6 |
+
TODO: 还要能判断是否需要回复。
|
7 |
+
"""
|
8 |
+
|
9 |
import torch
|
10 |
import gradio as gr
|
11 |
from info import article
|
demo_sum.py
CHANGED
@@ -3,11 +3,27 @@
|
|
3 |
# time: 2022/8/23 12:58
|
4 |
|
5 |
"""
|
6 |
-
TODO:
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
"""
|
12 |
|
13 |
import torch
|
@@ -19,31 +35,130 @@ from transformers import BertTokenizer, BartForConditionalGeneration
|
|
19 |
model = BartForConditionalGeneration.from_pretrained("eson/kplug-base-cepsum-jiadian") # cnn指的是cnn daily mail
|
20 |
tokenizer = BertTokenizer.from_pretrained("eson/kplug-base-cepsum-jiadian")
|
21 |
|
22 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
inputs = tokenizer([text], max_length=512, return_tensors="pt")
|
24 |
-
|
25 |
-
|
26 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
|
28 |
sum_examples = [
|
29 |
-
|
30 |
-
|
31 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
]
|
33 |
|
34 |
sum_iface = gr.Interface(
|
35 |
fn=summarize,
|
36 |
-
inputs=
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
44 |
label="文本摘要(Summarization)",
|
45 |
-
lines=4,
|
46 |
),
|
|
|
47 |
examples=sum_examples,
|
48 |
title="生成式摘要(Abstractive Summarization)",
|
49 |
description='生成式摘要,用于电商领域的商品营销文案写作。输入商品信息,输出商品的营销文案。',
|
3 |
# time: 2022/8/23 12:58
|
4 |
|
5 |
"""
|
6 |
+
## TODO:
|
7 |
+
|
8 |
+
1. 下拉框,选择类目。 gr.Radio(['服饰','箱包', '鞋靴']
|
9 |
+
2. 支持输入特效
|
10 |
+
- 示例:https://huggingface.co/uer/gpt2-chinese-lyric
|
11 |
+
- 参考 https://github.com/huggingface/hub-docs/blob/main/js/src/lib/components/InferenceWidget/shared/WidgetTextarea/WidgetTextarea.svelte
|
12 |
+
3. 待开放参数:No Repeat Ngram Size、Length Penalty、Number of Beams。topk-sampling, topp-sampling,
|
13 |
+
|
14 |
+
|
15 |
+
num_beam_groups = return_sequences数吗?
|
16 |
+
|
17 |
+
## badcase:
|
18 |
+
|
19 |
+
1. 结尾容易出多个句号。为啥?
|
20 |
+
|
21 |
+
## 参考
|
22 |
+
|
23 |
+
- generate官方文档:https://huggingface.co/blog/how-to-generate
|
24 |
+
- generate参数介绍:https://github.com/huggingface/transformers/blob/7f1cdf18958efef6339040ba91edb32ae7377720/src/transformers/generation/utils.py#L470
|
25 |
+
- https://huggingface.co/spaces/THUDM/GLM-130B
|
26 |
+
|
27 |
"""
|
28 |
|
29 |
import torch
|
35 |
model = BartForConditionalGeneration.from_pretrained("eson/kplug-base-cepsum-jiadian") # cnn指的是cnn daily mail
|
36 |
tokenizer = BertTokenizer.from_pretrained("eson/kplug-base-cepsum-jiadian")
|
37 |
|
38 |
+
gen_mode_params = {
|
39 |
+
"greedy": {
|
40 |
+
"num_beams": 1,
|
41 |
+
"do_sample": False,
|
42 |
+
},
|
43 |
+
# 核心:next_tokens = torch.multinomial(next_token_probs, num_samples=1)
|
44 |
+
"sampling": {
|
45 |
+
"num_beams": 1,
|
46 |
+
"do_sample": True,
|
47 |
+
},
|
48 |
+
"beam search": {
|
49 |
+
"num_beams": 10,
|
50 |
+
"do_sample": False,
|
51 |
+
},
|
52 |
+
"contrastive search": {
|
53 |
+
"top_k": 4,
|
54 |
+
"penalty_alpha": 0.2,
|
55 |
+
},
|
56 |
+
"diverse beam search": {
|
57 |
+
"num_beams": 5,
|
58 |
+
"num_beam_groups": 5,
|
59 |
+
"num_return_sequences": 5,
|
60 |
+
"diversity_penalty": 1.0,
|
61 |
+
}
|
62 |
+
}
|
63 |
+
|
64 |
+
all_decoding_strategys = list(gen_mode_params.keys())
|
65 |
+
|
66 |
+
|
67 |
+
def summarize(text, prefix_text, constrained_text, decoding_strategys):
|
68 |
+
"""
|
69 |
+
prefix_text: 能叫 prompt吗?
|
70 |
+
constrained_text: 受限解码效果怎么这么差.
|
71 |
+
decoding_strategys: Search Strategy、Decoding strategy、
|
72 |
+
"""
|
73 |
+
# bad_words_ids num_return_sequences=1, no_repeat_ngram_size=1, remove_invalid_values=True,
|
74 |
+
common_params = {"min_length": 20, "max_length": 100}
|
75 |
inputs = tokenizer([text], max_length=512, return_tensors="pt")
|
76 |
+
|
77 |
+
# prompt_text = GPT2里的参数. 这里是 decoder_input_ids。 shape=(batch_size, n)
|
78 |
+
if prefix_text:
|
79 |
+
decoder_input_ids = tokenizer([prefix_text], max_length=30, return_tensors="pt")
|
80 |
+
# decoder_input_ids = tokenizer(["采用优质的"], max_length=30, return_tensors="pt")
|
81 |
+
decoder_input_ids = decoder_input_ids.input_ids[:, :-1]
|
82 |
+
decoder_input_ids[:, 0] = model.config.decoder_start_token_id
|
83 |
+
common_params["decoder_input_ids"] = decoder_input_ids
|
84 |
+
|
85 |
+
#
|
86 |
+
if constrained_text:
|
87 |
+
common_params["force_words_ids"] = tokenizer(
|
88 |
+
[constrained_text], add_special_tokens=False, max_length=30).input_ids
|
89 |
+
|
90 |
+
result = {}
|
91 |
+
print(decoding_strategys)
|
92 |
+
for strategy in decoding_strategys:
|
93 |
+
if constrained_text and strategy in ["greedy", "sampling", "diverse beam search"]:
|
94 |
+
# `num_beams` needs to be greater than 1 for constrained generation.
|
95 |
+
# `num_beam_groups` not supported yet for constrained generation.
|
96 |
+
result[strategy] = "不支持受限解码"
|
97 |
+
continue
|
98 |
+
|
99 |
+
summary_ids = model.generate(inputs["input_ids"][:, 1:], **common_params, **gen_mode_params[strategy])
|
100 |
+
summary = tokenizer.batch_decode(summary_ids, skip_special_tokens=True,
|
101 |
+
clean_up_tokenization_spaces=False)
|
102 |
+
print(strategy, summary)
|
103 |
+
result[strategy] = summary[0]
|
104 |
+
|
105 |
+
return result
|
106 |
+
# return pd.DataFrame([result])
|
107 |
+
|
108 |
|
109 |
sum_examples = [
|
110 |
+
[
|
111 |
+
"美的对开门风冷无霜家用智能电冰箱波光金纤薄机身高颜值助力保鲜,美的家居风,尺寸说明:M以上的距离尤其是左右两侧距离必须保证。关于尺寸的更多问题可,LED冷光源,纤薄机身,风冷无霜,智能操控,远程调温,节能静音,照亮你的视野,535L大容量,系统散热和使用的便利性,建议左右两侧、顶部和背部需要预留10C,电源线和调平脚等。冰箱放置时为保证,菜谱推荐,半开门俯视图,全开门俯视图,预留参考图",
|
112 |
+
"", "", all_decoding_strategys],
|
113 |
+
[
|
114 |
+
"美的对开门风冷无霜家用智能电冰箱波光金纤薄机身高颜值助力保鲜,美的家居风,尺寸说明:M以上的距离尤其是左右两侧距离必须保证。关于尺寸的更多问题可,LED冷光源,纤薄机身,风冷无霜,智能操控,远程调温,节能静音,照亮你的视野,535L大容量,系统散热和使用的便利性,建议左右两侧、顶部和背部需要预留10C,电源线和调平脚等。冰箱放置时为保证,菜谱推荐,半开门俯视图,全开门俯视图,预留参考图",
|
115 |
+
"智能", "", all_decoding_strategys],
|
116 |
+
[
|
117 |
+
"美的对开门风冷无霜家用智能电冰箱波光金纤薄机身高颜值助力保鲜,美的家居风,尺寸说明:M以上的距离尤其是左右两侧距离必须保证。关于尺寸的更多问题可,LED冷光源,纤薄机身,风冷无霜,智能操控,远程调温,节能静音,照亮你的视野,535L大容量,系统散热和使用的便利性,建议左右两侧、顶部和背部需要预留10C,电源线和调平脚等。冰箱放置时为保证,菜谱推荐,半开门俯视图,全开门俯视图,预留参考图",
|
118 |
+
"", "风冷无霜", all_decoding_strategys],
|
119 |
+
|
120 |
+
[
|
121 |
+
"爱家乐新加坡电风扇静音无叶风扇健康空气循环扇儿童球形风扇落地扇外观,宁静节能,产品结构,现代科技的结晶,品质,气家,未来风新时代,动里,空让,健康,低至13分贝/DC直流马达/低耗24,亲密玩伴,24W功率,/低耗,别加坡国民品牌,气流通道,增强室内空气运动,过尘栅网,1-12档风力调速,涡轮风扇,吸气口,大于6米随心掌控,电源适配暑,装箱明细,摆头角度,手动摇摆轨道,操作方式,与空调同时使用不仅可以让室温快速均衡作,电源插口,适用环境,还可以在短时间内,导引出风口,产品类型,快件重量,电机,暖空气向上冷空气向下,线长,使房间温度均衡,省电环保,定时,功率,将凉风或热风送给到附近的房间,轻松享受生活,左右自动(上下手动)摇摆9度,进风口,能够很快中和空气温度差",
|
122 |
+
"", "", all_decoding_strategys],
|
123 |
+
[
|
124 |
+
"海尔8公斤节能静音高温消毒烫烫净全自动滚筒洗衣机靠实力说话,一掌控时间掌控自由,i-time智能时间洗,8公斤容量全家衣物一次清洗,细节绝不含糊,真正实力派,自动添加洗衣盒,洗羽绒服,就要专属程序,羊毛,牛仔,习绒,海尔洗衣机蓝晶系列滚筒,个性范儿,按照程序需求自动冲入洗衣机内,灵活旋钮,创新下排水洁净不残留,强力筋内筒,AMT防霉窗垫,LED大屏显示,洗衣液,消毒剂分别置放在洗衣盒中,从根本上解决污水残留问题避免,全新LD面板显示,更宽阔更大气操作信息一目了然,宽阔大气操作信息一目了然,右槽:消毒剂,简化洗衣程序,弹力筋中间的凹槽内分布,无残留排水模块,海尔洗衣机具有专业级羽绒洗护程序,为羽绒服营造洗护,一体化环境彻底告别手洗或者机洗,左槽:洗涤剂,我的智慧生活,中槽:柔顺剂,满足各种洗涤需求,告别昂贵洗衣店,自家",
|
125 |
+
"", "", all_decoding_strategys],
|
126 |
]
|
127 |
|
128 |
sum_iface = gr.Interface(
|
129 |
fn=summarize,
|
130 |
+
inputs=[
|
131 |
+
gr.Textbox(
|
132 |
+
label="商品信息(Product Info)",
|
133 |
+
value="美的对开门风冷无霜家用智能电冰箱波光金纤薄机身高颜值助力保鲜,美的家居风,尺寸说明:"
|
134 |
+
"M以上的距离尤其是左右两侧距离必须保证。关于尺寸的更多问题可,LED冷光源,纤薄机身,风冷"
|
135 |
+
"无霜,智能操控,远程调温,节能静音,照亮你的视野,535L大容量,系统散热和使用的便利性,"
|
136 |
+
"建议左右两侧、顶部和背部需要预留10C,电源线和调平脚等。冰箱放置时为保证,菜谱推荐,半开"
|
137 |
+
"门俯视图,全开门俯视图,预留参考图"),
|
138 |
+
gr.Textbox(
|
139 |
+
"",
|
140 |
+
label="prefix text"
|
141 |
+
),
|
142 |
+
gr.Textbox(
|
143 |
+
"",
|
144 |
+
label="constrained text"
|
145 |
+
),
|
146 |
+
gr.Checkboxgroup(
|
147 |
+
all_decoding_strategys, value=all_decoding_strategys[0:1],
|
148 |
+
label="decoding strategy"
|
149 |
+
),
|
150 |
+
],
|
151 |
+
# outputs=gr.Textbox(
|
152 |
+
# label="文本摘要(Summarization)",
|
153 |
+
# lines=4,
|
154 |
+
# ),
|
155 |
+
# outputs=gr.DataFrame(
|
156 |
+
# label="文本摘要(Summarization)",
|
157 |
+
# ),
|
158 |
+
outputs=gr.JSON(
|
159 |
label="文本摘要(Summarization)",
|
|
|
160 |
),
|
161 |
+
|
162 |
examples=sum_examples,
|
163 |
title="生成式摘要(Abstractive Summarization)",
|
164 |
description='生成式摘要,用于电商领域的商品营销文案写作。输入商品信息,输出商品的营销文案。',
|
demo_sum_diverse.py
DELETED
@@ -1,67 +0,0 @@
|
|
1 |
-
# coding=utf-8
|
2 |
-
# author: xusong <xusong28@jd.com>
|
3 |
-
# time: 2022/8/23 12:58
|
4 |
-
|
5 |
-
"""
|
6 |
-
TODO:
|
7 |
-
1. 下拉框,选择类目。 gr.Radio(['服饰','箱包', '鞋靴']
|
8 |
-
2. 支持NER、LM、Corrector
|
9 |
-
beam search参数
|
10 |
-
prompt参数
|
11 |
-
"""
|
12 |
-
import json
|
13 |
-
|
14 |
-
import torch
|
15 |
-
import gradio as gr
|
16 |
-
from kplug import modeling_kplug_s2s_patch
|
17 |
-
from transformers import BertTokenizer, BartForConditionalGeneration
|
18 |
-
|
19 |
-
# 改成 huggingface-model自动模型
|
20 |
-
model_dir = "models/ft_cepsum_jiadian/"
|
21 |
-
model = BartForConditionalGeneration.from_pretrained(model_dir) # cnn指的是cnn daily mail
|
22 |
-
tokenizer = BertTokenizer.from_pretrained(model_dir)
|
23 |
-
|
24 |
-
def summarize(text):
|
25 |
-
inputs = tokenizer([text], max_length=512, return_tensors="pt")
|
26 |
-
# no_repeat_ngram_size: 3
|
27 |
-
# length_penalty
|
28 |
-
# prefix
|
29 |
-
# remove_invalid_values=True,
|
30 |
-
summary_ids = model.generate(inputs["input_ids"][:, 1:], num_beams=4, min_length=20, max_length=100,
|
31 |
-
num_return_sequences=2, num_beam_groups=4, diversity_penalty=2.0) # beam_groups=4,
|
32 |
-
summary = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
|
33 |
-
return summary
|
34 |
-
|
35 |
-
sum_examples = [
|
36 |
-
"美的对开门风冷无霜家用智能电冰箱波光金纤薄机身高颜值助力保鲜,美的家居风,尺寸说明:M以上的距离尤其是左右两侧距离必须保证。关于尺寸的更多问题可,LED冷光源,纤薄机身,风冷无霜,智能操控,远程调温,节能静音,照亮你的视野,535L大容量,系统散热和使用的便利性,建议左右两侧、顶部和背部需要预留10C,电源线和调平脚等。冰箱放置时为保证,菜谱推荐,半开门俯视图,全开门俯视图,预留参考图",
|
37 |
-
"爱家乐新加坡电风扇静音无叶风扇健康空气循环扇儿童球形风扇落地扇外观,宁静节能,产品结构,现代科技的结晶,品质,气家,未来风新时代,动里,空让,健康,低至13分贝/DC直流马达/低耗24,亲密玩伴,24W功率,/低耗,别加坡国民品牌,气流通道,增强室内空气运动,过尘栅网,1-12档风力调速,涡轮风扇,吸气口,大于6米随心掌控,电源适配暑,装箱明细,摆头角度,手动摇摆轨道,操作方式,与空调同时使用不仅可以让室温快速均衡作,电源插口,适用环境,还可以在短时间内,导引出风口,产品类型,快件重量,电机,暖空气向上冷空气向下,线长,使房间温度均衡,省电环保,定时,功率,将凉风或热风送给到附近的房间,轻松享受生活,左右自动(上下手动)摇摆9度,进风口,能够很快中和空气温度差",
|
38 |
-
"海尔8公斤节能静音高温消毒烫烫净全自动滚筒洗衣机靠实力说话,一掌控时间掌控自由,i-time智能时间洗,8公斤容量全家衣物一次清洗,细节绝不含糊,真正实力派,自动添加洗衣盒,洗羽绒服,就要专属程序,羊毛,牛仔,习绒,海尔洗衣机蓝晶系列滚筒,个性范儿,按照程序需求自动冲入洗衣机内,灵活旋钮,创新下排水洁净不残留,强力筋内筒,AMT防霉窗垫,LED大屏显示,洗衣液,消毒剂分别置放在洗衣盒中,从根本上解决污水残留问题避免,全新LD面板显示,更宽阔更大气操作信息一目了然,宽阔大气操作信息一目了然,右槽:消毒剂,简化洗衣程序,弹力筋中间的凹槽内分布,无残留排水模块,海尔洗衣机具有专业级羽绒洗护程序,为羽绒服营造洗护,一体化环境彻底告别手洗或者机洗,左槽:洗涤剂,我的智慧生活,中槽:柔顺剂,满足各种洗涤需求,告别昂贵洗衣店,自家",
|
39 |
-
]
|
40 |
-
|
41 |
-
sum_iface = gr.Interface(
|
42 |
-
fn=summarize,
|
43 |
-
inputs=gr.Textbox(
|
44 |
-
label="商品信息(Product Info)",
|
45 |
-
default="美的对开门风冷无霜家用智能电冰箱波光金纤薄机身高颜值助力保鲜,美的家居风,尺寸说明:"
|
46 |
-
"M以上的距离尤其是左右两侧距离必须保证。关于尺寸的更多问题可,LED冷光源,纤薄机身,风冷"
|
47 |
-
"无霜,智能操控,远程调温,节能静音,照亮你的视野,535L大容量,系统散热和使用的便利性,"
|
48 |
-
"建议左右两侧、顶部和背部需要预留10C,电源线和调平脚等。冰箱放置时为保证,菜谱推荐,半开"
|
49 |
-
"门俯视图,全开门俯视图,预留参考图"),
|
50 |
-
outputs=gr.JSON(
|
51 |
-
label="文本摘要(Summarization)- diverse beam search"
|
52 |
-
),
|
53 |
-
# gr.Textbox(
|
54 |
-
# label="文本摘要(Summarization)",
|
55 |
-
# lines=4,
|
56 |
-
# ),
|
57 |
-
examples=sum_examples,
|
58 |
-
title="生成式摘要(Abstractive Summarization)",
|
59 |
-
description='<div>这是一个生成式摘要的demo,用于电商领域的商品营销文案写作。'
|
60 |
-
'该demo基于KPLUG预训练语言模型,输入商品信息,输出商品的营销文案。</div>'
|
61 |
-
'<div> Paper: <a href="https://aclanthology.org/2021.findings-emnlp.1/"> K-PLUG: Knowledge-injected Pre-trained Language Model for Natural Language Understanding'
|
62 |
-
' and Generation in E-Commerce (Findings of EMNLP 2021) </a> </div>'
|
63 |
-
'<div>Github: <a href="https://github.com/xu-song/k-plug">https://github.com/xu-song/k-plug </a> </div>'
|
64 |
-
)
|
65 |
-
|
66 |
-
if __name__ == "__main__":
|
67 |
-
sum_iface.launch()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
info.py
CHANGED
@@ -1,2 +1,4 @@
|
|
1 |
|
2 |
article = "<p style='text-align: center'><a href='https://aclanthology.org/2021.findings-emnlp.1/'>K-PLUG: Knowledge-injected Pre-trained Language Model for Natural Language Understanding and Generation in E-Commerce</a> | <a href='https://github.com/xu-song/k-plug'>Github Repo</a></p>"
|
|
|
|
1 |
|
2 |
article = "<p style='text-align: center'><a href='https://aclanthology.org/2021.findings-emnlp.1/'>K-PLUG: Knowledge-injected Pre-trained Language Model for Natural Language Understanding and Generation in E-Commerce</a> | <a href='https://github.com/xu-song/k-plug'>Github Repo</a></p>"
|
3 |
+
|
4 |
+
info = "KPLUG是多任务预训练,知识预训练"
|