import random
import re
import sys
import unicodedata

import numpy as np
import rouge
import six
import torch

sys.path.append('../../../')

rouge = rouge.Rouge()

is_py2 = six.PY2

if not is_py2:
    basestring = str
|
|
|
|
|
def _is_chinese_char(cp):
    """Checks whether CP is the codepoint of a CJK character."""
    # This defines a "Chinese character" as anything in the CJK Unicode block:
    #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
    if ((cp >= 0x4E00 and cp <= 0x9FFF) or (cp >= 0x3400 and cp <= 0x4DBF)
            or (cp >= 0x20000 and cp <= 0x2A6DF)
            or (cp >= 0x2A700 and cp <= 0x2B73F)
            or (cp >= 0x2B740 and cp <= 0x2B81F)
            or (cp >= 0x2B820 and cp <= 0x2CEAF)
            or (cp >= 0xF900 and cp <= 0xFAFF)
            or (cp >= 0x2F800 and cp <= 0x2FA1F)):
        return True

    return False
|
|
|
|
|
def _is_whitespace(char):
    """Checks whether `char` is a whitespace character."""
    # \t, \n, and \r are technically control characters but we treat them
    # as whitespace since they are generally considered as such.
    if char == " " or char == "\t" or char == "\n" or char == "\r":
        return True
    cat = unicodedata.category(char)
    if cat == "Zs":
        return True
    return False
|
|
|
|
|
def _is_control(char):
    """Checks whether `char` is a control character."""
    # \t, \n, and \r are counted as whitespace here, not as control characters.
    if char == "\t" or char == "\n" or char == "\r":
        return False
    cat = unicodedata.category(char)
    if cat.startswith("C"):
        return True
    return False
|
|
|
|
|
def _is_punctuation(char):
    """Checks whether `char` is a punctuation character."""
    cp = ord(char)
    # Treat all non-letter/number ASCII as punctuation. Characters such as
    # "^", "$", and "`" are not in the Unicode Punctuation category but are
    # treated as punctuation anyway, for consistency.
    if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or (
            cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126):
        return True
    cat = unicodedata.category(char)
    if cat.startswith("P"):
        return True
    return False
|
|
|
|
|
def is_string(s):
    """Check whether `s` is a string."""
    return isinstance(s, basestring)
|
|
|
|
|
def is_stopwords(word, stopwords):
    return word in stopwords
|
|
|
|
|
def text_segmentate(text):
    """Split `text` into sentences on English and Chinese sentence-final punctuation."""
    en_seg_pattern = r'((?:\!|\?|\.|\n)+(?:\s)+)'
    ch_seg_pattern = r'((?:？|！|。|\n)+)'
    try:
        text = re.sub(en_seg_pattern, r'\1[SEP]', text)
    except Exception as e:
        print("input: ", text)
        raise e
    text = re.sub(ch_seg_pattern, r'\1[SEP]', text)

    text_list = text.split("[SEP]")
    text_list = list(filter(lambda x: len(x) != 0, text_list))
    return text_list
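
# Illustrative example (added for clarity; the sample sentence is made up):
# text_segmentate splits on both English and Chinese sentence-final punctuation
# and keeps the punctuation (and any trailing whitespace) with each piece, e.g.
#   text_segmentate("天气不错。出去走走吧！Sounds good. Let's go!\n")
#   -> ["天气不错。", "出去走走吧！", "Sounds good. ", "Let's go!\n"]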
|
|
|
|
|
def load_stopwords(stopwords_path):
    """Load stopwords (one per line) into a dict used for fast membership tests."""
    stopwords_dict = {}
    with open(stopwords_path, "r") as rf:
        for line in rf:
            line = line.strip()
            if line not in stopwords_dict:
                stopwords_dict[line] = 0
    return stopwords_dict
|
|
|
|
|
def text_process(text, max_length):
    """Split `text` and yield chunks of at least 3 sentences, each roughly `max_length` long."""
    texts = text_segmentate(text)

    result, length = [], 0
    for text in texts:
        if length + len(text) > max_length * 1.3 and len(result) >= 3:
            yield result
            result, length = [], 0
        result.append(text)
        length += len(text)
    if result and len(result) >= 3:
        yield result
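
# Illustrative note (added): text_process is a generator; every yielded chunk is
# a list of consecutive sentences whose combined length is roughly bounded by
# max_length * 1.3, and chunks with fewer than 3 sentences are dropped.
# A hypothetical usage sketch:
#   for chunk in text_process(raw_document, max_length=512):
#       passage = "".join(chunk)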
|
|
|
|
|
def text_process_split_long_content(text, max_length):
    """Split a long text and yield passages of at least 3 sentences as single strings."""
    texts = text_segmentate(text)

    result, sentence_num = "", 0
    for text in texts:
        if len(text) > 500:
            # Skip overly long sentences; flush the passage collected so far
            # if it is long enough, otherwise discard it.
            if len(result) > 300 and sentence_num >= 3:
                yield result
                result, sentence_num = "", 0
            else:
                result, sentence_num = "", 0
            continue
        else:
            if len(result) + len(text) > max_length * 1.1 and sentence_num >= 3:
                yield result
                result, sentence_num = "", 0
            result += text
            sentence_num += 1

    if result and sentence_num >= 3:
        yield result
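
# Illustrative note (added): unlike text_process, this generator joins the
# sentences of each passage into a single string, skips any sentence longer
# than 500 characters (flushing the passage collected so far if it is long
# enough), and only yields passages made of at least 3 sentences.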
|
|
|
|
|
def gather_join(texts, idxs):
    """Select the texts at the given indices and concatenate them into one string."""
    return ''.join([texts[i] for i in idxs])
|
|
|
|
|
def gather_join_f1(texts_token, idsx):
    """Select the token lists at the given indices and concatenate them into one list."""
    join_texts = []
    for idx in idsx:
        join_texts.extend(texts_token[idx])
    return join_texts
|
|
|
|
|
def compute_rouge(source, target):
    """Compute the ROUGE-1, ROUGE-2 and ROUGE-L F1 scores between two token sequences."""
    source, target = ' '.join(source), ' '.join(target)
    try:
        scores = rouge.get_scores(hyps=source, refs=target)
        return {
            'rouge-1': scores[0]['rouge-1']['f'],
            'rouge-2': scores[0]['rouge-2']['f'],
            'rouge-l': scores[0]['rouge-l']['f'],
        }
    except ValueError:
        return {
            'rouge-1': 0.0,
            'rouge-2': 0.0,
            'rouge-l': 0.0,
        }
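
# Illustrative example (added): compute_rouge expects token sequences and joins
# them with spaces before scoring with the `rouge` package, e.g.
#   compute_rouge(["the", "cat", "sat"], ["the", "cat"])
#   -> {'rouge-1': ..., 'rouge-2': ..., 'rouge-l': ...}
# If the package raises ValueError (e.g. for an empty hypothesis), all three
# scores fall back to 0.0.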
|
|
|
|
|
def remove_stopwords(texts, stopwords_dict):
    """Remove stopwords from each token list in `texts` (in place) and return it."""
    for i, text in enumerate(texts):
        texts[i] = list(filter(lambda x: x not in stopwords_dict, text))
    return texts
|
|
|
|
|
def pseudo_summary_f1(texts,
                      stopwords,
                      tokenizer,
                      max_length,
                      rouge_strategy="rouge-l"):
    """Build a pseudo-summary sample by greedily selecting the sentences that
    maximize the ROUGE score against the remaining text."""
    summary_rate = 0.25
    max_length = max_length - 1
    texts_tokens = []
    sentence_idxs_vec = []
    for text in texts:
        if len(text) == 0:
            continue
        try:
            ids = tokenizer.encode(text.strip())[:-1]
        except ValueError:
            print("error, input : ", text)
            raise
        sentence_idxs_vec.append(ids)
        tokens = [tokenizer._convert_id_to_token(token) for token in ids]
        texts_tokens.append(tokens)

    texts_tokens_rm = remove_stopwords(texts_tokens, stopwords)
    source_idxs, target_idxs = list(range(len(texts))), []

    assert len(texts_tokens) == len(texts)

    while True:
        # Greedily move the sentence that best improves the ROUGE score between
        # the candidate summary (target) and the remaining text (source).
        sims = []
        for i in source_idxs:
            new_source_idxs = [j for j in source_idxs if j != i]
            new_target_idxs = sorted(target_idxs + [i])
            new_source = gather_join_f1(texts_tokens_rm, new_source_idxs)
            new_target = gather_join_f1(texts_tokens_rm, new_target_idxs)
            sim = compute_rouge(new_source, new_target)[rouge_strategy]
            sims.append(sim)
        new_idx = source_idxs[np.argmax(sims)]
        del sims
        source_idxs.remove(new_idx)
        target_idxs = sorted(target_idxs + [new_idx])
        source = gather_join(texts, source_idxs)
        target = gather_join(texts, target_idxs)
        try:
            # Stop once only one source sentence is left or the summary reaches
            # the desired compression rate.
            if (len(source_idxs) == 1
                    or 1.0 * len(target) / len(source) > summary_rate):
                break
        except ZeroDivisionError as e:
            print(e)
            print(texts)
            print("source: ", source)
            print("target: ", target)

    if len(source) < len(target):
        source, target = target, source
        source_idxs, target_idxs = target_idxs, source_idxs

    return sentence_idxs_vec, source, target, source_idxs, target_idxs
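
# Illustrative usage sketch (added; `my_tokenizer` and `raw_document` are
# hypothetical, not part of this module). The greedy ROUGE-based selection above
# resembles PEGASUS-style gap-sentence selection: `target` is the pseudo
# summary, `source` is the rest of the document.
#   stopwords = load_stopwords("stopwords.txt")  # hypothetical path
#   sentences = text_segmentate(raw_document)
#   ids_vec, source, target, src_idxs, tgt_idxs = pseudo_summary_f1(
#       sentences, stopwords, my_tokenizer, max_length=512,
#       rouge_strategy="rouge-l")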
|
|
|
|
|
def get_input_mask(sentence_id_vec, indexs):
    """Mask the sentences selected as pseudo summary and build encoder input / decoder target ids."""
    target_idxs = []
    input_idxs = []
    kMaskSentenceTokenId = 2
    kEosTokenId = 1
    # Cumulative probabilities for the masking options:
    # 90% mask the sentence, 0% replace it with a random sentence,
    # 10% keep it unchanged, 0% drop it.
    mask_sentence_options_cumulative_prob = [0.9, 0.9, 1, 1]
    for index in indexs:
        target_idxs.extend(sentence_id_vec[index])
        choice = random.uniform(0, 1)
        if choice < mask_sentence_options_cumulative_prob[0]:
            # Replace the whole sentence with the sentence-mask token.
            sentence_id_vec[index] = [kMaskSentenceTokenId]
        elif choice < mask_sentence_options_cumulative_prob[1]:
            # Replace the sentence with a randomly chosen sentence.
            replace_id = random.randint(0, len(sentence_id_vec) - 1)
            sentence_id_vec[index] = sentence_id_vec[replace_id]
        elif choice < mask_sentence_options_cumulative_prob[2]:
            # Keep the sentence unchanged.
            pass
        else:
            # Drop the sentence entirely.
            sentence_id_vec[index] = []

    target_idxs.append(kEosTokenId)

    for index, sentence_id in enumerate(sentence_id_vec):
        if len(sentence_id) == 0:
            continue
        input_idxs.extend(sentence_id_vec[index])

    input_idxs.append(kEosTokenId)
    return input_idxs, target_idxs
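
# Illustrative note (added): `indexs` are the sentence positions chosen as the
# pseudo summary (e.g. `tgt_idxs` from pseudo_summary_f1). Their token ids form
# the decoder target (terminated by EOS id 1), while in the encoder input each
# selected sentence is, with high probability, replaced by the sentence-mask
# token id 2. Note that sentence_id_vec is modified in place.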
|
|
|
|
|
def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int,
                       decoder_start_token_id: int):
    """Shift input ids one token to the right."""
    shifted_input_ids = input_ids.new_zeros(input_ids.shape)
    shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
    shifted_input_ids[:, 0] = decoder_start_token_id

    if pad_token_id is None:
        raise ValueError("pad_token_id has to be defined.")

    # Replace possible -100 values (ignored label positions) with pad_token_id.
    shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)

    return shifted_input_ids
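
# Worked example (added for illustration):
#   input_ids = torch.tensor([[5, 6, 7, 1]])
#   shift_tokens_right(input_ids, pad_token_id=0, decoder_start_token_id=0)
#   -> tensor([[0, 5, 6, 7]])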
|
|
|
|
|
def padding_to_maxlength(ids, max_length, pad_id):
    """Pad `ids` to `max_length` with `pad_id`; return (padded_ids, attention_mask)."""
    cur_len = len(ids)
    len_diff = max_length - cur_len
    return ids + [pad_id] * len_diff, [1] * cur_len + [0] * len_diff
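
# Worked example (added for illustration):
#   padding_to_maxlength([101, 8, 9], 5, 0)
#   -> ([101, 8, 9, 0, 0], [1, 1, 1, 0, 0])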
|
|