Spaces:

Hann99
/

paper_recom

Build error

App Files Files Community

Hann99 committed on Mar 5, 2023

Commit

cf9bbdf

•

1 Parent(s): 45ff3f2

Upload data_utils.py

Browse files

Files changed (1) hide show

data_utils.py +319 -0

data_utils.py ADDED Viewed

	@@ -0,0 +1,319 @@

+# -*- coding: utf-8 -*-
+import re
+import six
+import unicodedata
+import torch
+import rouge
+import numpy as np
+import random
+# from fengshen.examples.pegasus.pegasus_utils import text_segmentate
+import sys
+sys.path.append('../../../')
# ``basestring`` is only a builtin on Python 2; everywhere else alias it to
# ``str`` so ``is_string`` can use one name on both interpreters.
is_py2 = six.PY2
if not is_py2:
    basestring = str
# Rebind the name ``rouge`` from the imported module to one shared scorer
# instance; after this line the module itself is no longer reachable as
# ``rouge``.
rouge = rouge.Rouge()
+def _is_chinese_char(cp):
+    """Checks whether CP is the codepoint of a CJK character."""
+    # This defines a "chinese character" as anything in the CJK Unicode block:
+    #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
+    #
+    # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
+    # despite its name. The modern Korean Hangul alphabet is a different block,
+    # as is Japanese Hiragana and Katakana. Those alphabets are used to write
+    # space-separated words, so they are not treated specially and handled
+    # like the all of the other languages.
+    if ((cp >= 0x4E00 and cp <= 0x9FFF) or (cp >= 0x3400 and cp <= 0x4DBF)
+            or (cp >= 0x20000 and cp <= 0x2A6DF)
+            or (cp >= 0x2A700 and cp <= 0x2B73F)
+            or (cp >= 0x2B740 and cp <= 0x2B81F)
+            or (cp >= 0x2B820 and cp <= 0x2CEAF)
+            or (cp >= 0xF900 and cp <= 0xFAFF)
+            or (cp >= 0x2F800 and cp <= 0x2FA1F)):
+        return True
+    return False
+def _is_whitespace(char):
+    """Checks whether `char` is a whitespace character."""
+    # \t, \n, and \r are technically control characters but we treat them
+    # as whitespace since they are generally considered as such.
+    if char == " " or char == "\t" or char == "\n" or char == "\r":
+        return True
+    cat = unicodedata.category(char)
+    if cat == "Zs":
+        return True
+    return False
+def _is_control(char):
+    """Checks whether `char` is a control character."""
+    # These are technically control characters but we count them as whitespace
+    # characters.
+    if char == "\t" or char == "\n" or char == "\r":
+        return False
+    cat = unicodedata.category(char)
+    if cat.startswith("C"):
+        return True
+    return False
+def _is_punctuation(char):
+    """Checks whether `char` is a punctuation character."""
+    cp = ord(char)
+    # We treat all non-letter/number ASCII as punctuation.
+    # Characters such as "^", "$", and "`" are not in the Unicode
+    # Punctuation class but we treat them as punctuation anyways, for
+    # consistency.
+    if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or (
+            cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126):
+        return True
+    cat = unicodedata.category(char)
+    if cat.startswith("P"):
+        return True
+    return False
def is_string(s):
    """Return True if ``s`` is a string.

    The check is made against the module-level name ``basestring``.
    """
    string_type = basestring
    return isinstance(s, string_type)
def is_stopwords(word, stopwords):
    """Return True if ``word`` is contained in ``stopwords``, else False."""
    return word in stopwords
def text_segmentate(text):
    """Split ``text`` into sentence-like pieces, keeping the delimiters.

    A ``[SEP]`` marker is inserted after every sentence boundary and the
    text is then split on that marker. Two boundary kinds are recognised:

    * a run of ``!``, ``?``, ``.`` or newline followed by whitespace;
    * a run of full-width ``？``, ``！``, ``。`` or newline.

    Each boundary stays attached to the end of the piece it terminates.
    Empty pieces are dropped.

    Returns:
        list of non-empty string pieces, in their original order.
    """
    en_boundary = '((?:\\!|\\?|\\.|\\n)+(?:\\s)+)'
    ch_boundary = '((?:？|！|。|\\n)+)'
    try:
        marked = re.sub(en_boundary, r'\1[SEP]', text)
    except Exception:
        # Show the offending input before letting the error propagate.
        print("input: ", text)
        raise
    marked = re.sub(ch_boundary, r'\1[SEP]', marked)
    return [piece for piece in marked.split("[SEP]") if piece]
def load_stopwords(stopwords_path):
    """Load a stopword file into a dict used as a membership table.

    The file is read as UTF-8 (the encoding this module declares for itself)
    rather than the platform default, so non-ASCII stopword lists load the
    same way on every system.

    Args:
        stopwords_path: path to a text file with one stopword per line.

    Returns:
        dict mapping each whitespace-stripped line to ``0``. Duplicate lines
        collapse to one key; keys keep the order of first appearance.
    """
    with open(stopwords_path, "r", encoding="utf-8") as rf:
        return {line.strip(): 0 for line in rf}
def text_process(text, max_length):
    """Split ``text`` into groups of consecutive sentences.

    The text is segmented with ``text_segmentate``. Sentences accumulate in
    the current group; before a sentence is added, the group is emitted and
    restarted if adding it would push the group's character count above
    ``max_length * 1.3`` and the group already holds at least 3 sentences.
    The final group is emitted only if it holds at least 3 sentences.

    Yields:
        lists of sentence strings.
    """
    group, group_chars = [], 0
    for sentence in text_segmentate(text):
        if group_chars + len(sentence) > max_length * 1.3 and len(group) >= 3:
            yield group
            group, group_chars = [], 0
        group.append(sentence)
        group_chars += len(sentence)
    if len(group) >= 3:
        yield group
def text_process_split_long_content(text, max_length):
    """Split a long text into concatenated chunks of consecutive sentences.

    The text is segmented with ``text_segmentate`` and sentences are
    appended to a running string buffer:

    * A sentence longer than 500 characters is never added. It also ends the
      current buffer: the buffer is emitted if it is longer than 300
      characters and holds at least 3 sentences, otherwise it is discarded.
    * For any other sentence, the buffer is first emitted and restarted if
      adding the sentence would exceed ``max_length * 1.1`` characters and
      the buffer holds at least 3 sentences; then the sentence is appended.

    The final buffer is emitted only if it is non-empty and holds at least
    3 sentences.

    Yields:
        chunk strings.
    """
    buffer, count = "", 0
    for sentence in text_segmentate(text):
        if len(sentence) > 500:
            if len(buffer) > 300 and count >= 3:
                yield buffer
            # Whether or not the buffer was emitted, an over-long sentence
            # starts a fresh buffer and is itself skipped.
            buffer, count = "", 0
            continue
        if len(buffer) + len(sentence) > max_length * 1.1 and count >= 3:
            yield buffer
            buffer, count = "", 0
        buffer += sentence
        count += 1
    if buffer and count >= 3:
        yield buffer
def gather_join(texts, idxs):
    """Pick ``texts[i]`` for every ``i`` in ``idxs`` and concatenate them.

    The pieces are joined in the order given by ``idxs``, with no separator.
    """
    selected = (texts[position] for position in idxs)
    return ''.join(selected)
def gather_join_f1(texts_token, idsx):
    """Concatenate the token lists selected by ``idsx`` into one flat list.

    For each index in ``idsx`` (in the given order) the tokens of
    ``texts_token[index]`` are appended to the result.
    """
    return [token for position in idsx for token in texts_token[position]]
def compute_rouge(source, target):
    """Compute ROUGE-1, ROUGE-2 and ROUGE-L F-scores between two token lists.

    Both sequences are joined with single spaces and scored with the
    module-level ``rouge`` scorer, using ``source`` as the hypothesis and
    ``target`` as the reference.

    Returns:
        dict with keys ``'rouge-1'``, ``'rouge-2'`` and ``'rouge-l'`` holding
        the ``'f'`` value of the first score entry. If the scorer raises
        ``ValueError`` (presumably for empty input; confirm against the
        rouge package), all three values are ``0.0``.
    """
    hypothesis, reference = ' '.join(source), ' '.join(target)
    try:
        first_score = rouge.get_scores(hyps=hypothesis, refs=reference)[0]
        return {
            metric: first_score[metric]['f']
            for metric in ('rouge-1', 'rouge-2', 'rouge-l')
        }
    except ValueError:
        return dict.fromkeys(('rouge-1', 'rouge-2', 'rouge-l'), 0.0)
def remove_stopwords(texts, stopwords_dict):
    """Strip stopwords from every token sequence in ``texts``, in place.

    Each element of ``texts`` is replaced by a new list holding only the
    tokens that are not in ``stopwords_dict``. The same ``texts`` object is
    returned, so the caller's list is modified.
    """
    for position, tokens in enumerate(texts):
        texts[position] = [tok for tok in tokens if tok not in stopwords_dict]
    return texts
def pseudo_summary_f1(texts,
                      stopwords,
                      tokenizer,
                      max_length,
                      rouge_strategy="rouge-l",
                      summary_rate=0.25):
    """Build a pseudo-summary (source, target) pair from a list of sentences.

    Sentences are moved greedily from the source set to the target set. At
    each step every remaining source sentence is tried as the next target
    sentence, and the one giving the highest ``rouge_strategy`` score between
    the stopword-filtered remaining source tokens and the candidate target
    tokens is moved. Selection stops when at most one source sentence is
    left, or when ``len(target) / len(source)`` (in characters) exceeds
    ``summary_rate``. Finally the two sides are swapped if needed so that
    ``source`` is the longer string.

    Args:
        texts: sized sequence of sentence strings.
        stopwords: container of stopword tokens used for filtering.
        tokenizer: object providing ``encode(str)`` and
            ``_convert_id_to_token(id)``.
        max_length: currently unused; kept for interface compatibility.
        rouge_strategy: key of the ``compute_rouge`` result to maximise.
        summary_rate: target/source character ratio that ends selection.

    Returns:
        Tuple ``(sentece_idxs_vec, source, target, source_idxs,
        target_idxs)``: per-sentence token ids (with the last id returned by
        ``tokenizer.encode`` removed), the joined source and target strings,
        and the sentence indices making up each side. Empty ``texts`` gives
        ``([], "", "", [], [])``.

    Raises:
        ValueError: re-raised from ``tokenizer.encode`` after printing the
            offending sentence.
    """
    # Nothing to select from; the selection loop needs at least one sentence.
    if len(texts) == 0:
        return [], "", "", [], []

    sentece_idxs_vec = []
    texts_tokens = []
    for text in texts:
        try:
            ids = tokenizer.encode(text.strip())[:-1]
        except ValueError:
            print("error, input : ", text)
            raise
        sentece_idxs_vec.append(ids)
        texts_tokens.append(
            [tokenizer._convert_id_to_token(token) for token in ids])
    texts_tokens_rm = remove_stopwords(texts_tokens, stopwords)

    source_idxs, target_idxs = list(range(len(texts))), []
    source, target = gather_join(texts, source_idxs), ""
    while source_idxs:
        sims = []
        for i in source_idxs:
            new_source_idxs = [j for j in source_idxs if j != i]
            new_target_idxs = sorted(target_idxs + [i])
            new_source = gather_join_f1(texts_tokens_rm, new_source_idxs)
            new_target = gather_join_f1(texts_tokens_rm, new_target_idxs)
            sims.append(compute_rouge(new_source, new_target)[rouge_strategy])
        new_idx = source_idxs[int(np.argmax(sims))]
        source_idxs.remove(new_idx)
        target_idxs = sorted(target_idxs + [new_idx])
        source = gather_join(texts, source_idxs)
        target = gather_join(texts, target_idxs)
        # ``<= 1`` also covers single-sentence input, where the source side
        # is already empty after the first move.
        if len(source_idxs) <= 1:
            break
        if not source:
            # The remaining source sentences are all empty strings, so the
            # ratio is undefined; keep moving sentences instead of dividing
            # by zero.
            continue
        if len(target) / len(source) > summary_rate:
            break
    if len(source) < len(target):
        source, target = target, source
        source_idxs, target_idxs = target_idxs, source_idxs
    return sentece_idxs_vec, source, target, source_idxs, target_idxs
def get_input_mask(sentence_id_vec, indexs,
                   mask_sentence_options_cumulative_prob=(0.9, 0.9, 1, 1)):
    """Build encoder input ids and decoder target ids by altering sentences.

    For every index in ``indexs`` the sentence's token ids are appended to
    the target, and the sentence is then altered according to a uniform
    random draw compared against the cumulative thresholds:

    * below threshold 0: replaced by the single mask token id ``2``;
    * below threshold 1: replaced by a randomly chosen sentence of
      ``sentence_id_vec``;
    * below threshold 2: left unchanged;
    * otherwise: replaced by an empty list (dropped from the input).

    With the default thresholds only the mask branch (draw below 0.9) and
    the keep branch are reachable.

    NOTE: ``sentence_id_vec`` is modified in place.

    Args:
        sentence_id_vec: list of per-sentence token-id lists.
        indexs: indices of the sentences that form the target.
        mask_sentence_options_cumulative_prob: sequence of cumulative
            thresholds; the first three entries are used.

    Returns:
        Tuple ``(input_idxs, target_idxs)``: the flattened ids of all
        sentences after alteration, and the flattened ids of the selected
        sentences, each terminated with the end-of-sequence id ``1``.
    """
    kMaskSentenceTokenId = 2
    kEosTokenId = 1
    mask_below, replace_below, keep_below = (
        mask_sentence_options_cumulative_prob[:3])

    target_idxs = []
    for index in indexs:
        target_idxs.extend(sentence_id_vec[index])
        choice = random.uniform(0, 1)
        if choice < mask_below:
            sentence_id_vec[index] = [kMaskSentenceTokenId]
        elif choice < replace_below:
            # randrange excludes its upper bound. randint(0, len(...)) could
            # return len(...) and index past the end of the list.
            replace_id = random.randrange(len(sentence_id_vec))
            sentence_id_vec[index] = sentence_id_vec[replace_id]
        elif choice < keep_below:
            pass
        else:
            sentence_id_vec[index] = []
    target_idxs.append(kEosTokenId)

    # Flattening skips the emptied (dropped) sentences naturally.
    input_idxs = [
        token for sentence_id in sentence_id_vec for token in sentence_id
    ]
    input_idxs.append(kEosTokenId)
    return input_idxs, target_idxs
def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int,
                       decoder_start_token_id: int):
    """Shift ``input_ids`` one position to the right along the last axis.

    Column 0 of the result is ``decoder_start_token_id``; the remaining
    columns are ``input_ids`` without its last column. Any ``-100`` in the
    result is replaced by ``pad_token_id``. The input tensor is not modified.

    Raises:
        ValueError: if ``pad_token_id`` is ``None``.
    """
    result = input_ids.new_zeros(input_ids.shape)
    result[:, 0] = decoder_start_token_id
    result[:, 1:] = input_ids[:, :-1].clone()
    if pad_token_id is None:
        raise ValueError("self.model.config.pad_token_id has to be defined.")
    # Replace possible -100 values carried over from labels by the pad id.
    ignored_positions = result == -100
    result.masked_fill_(ignored_positions, pad_token_id)
    return result
def padding_to_maxlength(ids, max_length, pad_id):
    """Right-pad the list ``ids`` with ``pad_id`` up to ``max_length``.

    Returns:
        Tuple ``(padded_ids, mask)``. ``mask`` holds ``1`` for each original
        id and ``0`` for each padding position. If ``ids`` already has
        ``max_length`` or more elements, nothing is added and nothing is
        truncated; the mask is then all ones.
    """
    real_count = len(ids)
    pad_count = max_length - real_count
    padded_ids = ids + [pad_id] * pad_count
    mask = [1] * real_count + [0] * pad_count
    return padded_ids, mask