# Source: Hugging Face Space "svjack/English-Context-Dialogue-Generator" (status: Running)
# File size: 7,862 Bytes
# Revision: 32fcb0c

#### English scope
# Device selector used by the model setup in this module.
# Must start with "cpu" or "cuda"; switch the two lines below to run on GPU.
#device = "cuda:0"
device = "cpu"
assert device.startswith(("cpu", "cuda"))

import sys
from predict import *

from transformers import (
    T5ForConditionalGeneration,
    MT5ForConditionalGeneration,
    ByT5Tokenizer,
    PreTrainedTokenizer,
    T5TokenizerFast as T5Tokenizer,
    MT5TokenizerFast as MT5Tokenizer,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    BertTokenizer,
    GPT2LMHeadModel,
)

import pandas as pd
import numpy as np
import re
from rapidfuzz import fuzz
from tqdm import tqdm
import numpy as np
from transformers import pipeline
import os

def shorten_exists(l, sim_threshold = 80, slice_size = 5):
    """Drop items whose leading characters fuzzy-match an item already kept.

    Items are visited in order. An item is kept only when ``fuzz.ratio``
    between its first ``slice_size`` characters and the first ``slice_size``
    characters of every previously kept item is below ``sim_threshold``.
    The first item is always kept.

    Returns a new list; ``l`` itself is not modified.
    """
    kept = []
    for candidate in l:
        # all() over an empty ``kept`` is True, so the first item always passes
        # without any similarity being computed.
        is_novel = all(
            fuzz.ratio(previous[:slice_size], candidate[:slice_size]) < sim_threshold
            for previous in kept
        )
        if is_novel:
            kept.append(candidate)
    return kept

# Load the summary -> dialogue T5 checkpoint and wrap it in ``Obj``
# (``Obj`` is not defined in this file; presumably it comes from the
# ``predict`` star import -- confirm).
model_path = "svjack/summary-dialogue-eng"
tokenizer0 = T5Tokenizer.from_pretrained(model_path)
model0 = T5ForConditionalGeneration.from_pretrained(model_path)

# Any "cuda*" setting maps to the first GPU ("cuda:0"); everything else is CPU.
model = Obj(model0, tokenizer0, device = "cuda:0" if device.startswith("cuda") else "cpu")

# Text-generation pipeline used to expand a short prompt. ``device = 0`` is
# passed only on CUDA; on CPU the pipeline is built without a device argument.
prompt_expand_model = pipeline(
    'text-generation', model='daspartho/prompt-extend',
    **({"device": 0} if device.startswith("cuda") else {})
)

def loop_add(l, names = ["Tom", "Jack"]):
    """Prefix each utterance with a speaker name, cycling through ``names``.

    The i-th element of ``l`` is rendered as ``"<name>:<utterance>"`` where the
    name is ``names[i % len(names)]``. Returns a new list of strings.
    """
    speaker_count = len(names)
    return [
        "{}:{}".format(names[int(position % speaker_count)], utterance)
        for position, utterance in enumerate(l)
    ]

#### NOTE: names mentioned in the context without a trailing ":" (as in the example below) are not picked up and may need separate handling.
#### '艾米-亚当斯在《沉睡的空洞》中，全身，双色大眼睛，咬牙切齿，恐怖，复杂的细节，电影，史诗，现实，解剖，汤姆-哈努卡，上光，艺术站，逼真，可怕'
def guess_name_candidates(context, cnt_threshold = 1):
    """Guess the speaker names used as ``name:`` labels inside ``context``.

    A label is any run of CJK (U+4E00-U+9FA5) or ASCII letters immediately
    followed by ``":"``. The name of a label (the label without the colon) is
    returned when at least one of the following holds:

    * the label occurs more than ``cnt_threshold`` times in ``context``;
    * the name is at most 3 characters long;
    * the name consists only of ASCII letters.

    Args:
        context: text to scan; must be a string.
        cnt_threshold: occurrence count a label has to exceed to qualify on
            frequency alone.

    Returns:
        A list of unique names in order of first appearance in ``context``
        (an empty list when no label is found).

    Raises:
        AssertionError: if ``context`` is not a string.
    """
    from collections import Counter
    import re
    assert isinstance(context, str)
    labels = re.findall(r"[\u4e00-\u9fa5a-zA-Z]+:", context)
    if not labels:
        return []
    # Counter keeps keys in insertion order, i.e. order of first appearance,
    # which makes the result deterministic (the previous set() round-trip
    # returned the names in arbitrary order).
    label_counts = Counter(labels)
    names = []
    for label, occurrences in label_counts.items():
        name = label[:-1]  # strip the trailing ":"
        is_frequent = occurrences > cnt_threshold
        is_short = len(name) <= 3
        is_ascii_word = re.fullmatch(r"[a-zA-Z]+", name) is not None
        if is_frequent or is_short or is_ascii_word:
            names.append(name)
    return names

def stdf_prompt_expander(x):
    """Expand the prompt ``x`` with the module-level text-generation pipeline.

    Requests a single generated sequence and returns its ``"generated_text"``
    field. ``x`` must be a plain ``str``.
    """
    assert type(x) == str
    generations = prompt_expand_model(x, num_return_sequences=1)
    first_generation = generations[0]
    return first_generation["generated_text"]

def simple_pred(summary, candidates = ["Tom", "Jack"], shorten_it = False,
summary_expander = lambda _:_, do_sample = True):
    """Generate a dialogue for ``summary`` and return it as a list of utterances.

    The (optionally expanded) summary is sent to the module-level ``model``
    together with the candidate speaker names. Speaker labels (``name:``) found
    in the generated text are used as split points and are not part of the
    returned utterances.

    Args:
        summary: text describing what the dialogue should be about.
        candidates: speaker names offered to the model (joined with spaces).
        shorten_it: when true, near-duplicate utterances are removed with
            ``shorten_exists``.
        summary_expander: callable applied to ``summary`` before prediction.
        do_sample: forwarded to ``model.predict``.

    Returns:
        List of stripped, non-blank utterance strings.

    Raises:
        AssertionError: if ``summary_expander`` is not callable.
    """
    assert callable(summary_expander)
    summary = summary_expander(summary)
    pred_text = model.predict(
        "{}\nCandidates:{}".format(summary, " ".join(candidates)),
        do_sample = do_sample
    )[0]
    speakers = guess_name_candidates(pred_text)
    if speakers:
        # Escape the names so they are always matched literally.
        label_pattern = "|".join("{}:".format(re.escape(name)) for name in speakers)
        pieces = re.split(label_pattern, pred_text)
    else:
        # With no detected speaker the pattern would be empty, and re.split on
        # an empty pattern cuts the text between every character. Keep the
        # generated text as a single utterance instead.
        pieces = [pred_text]
    pieces = [piece for piece in pieces if piece.strip()]
    if shorten_it:
        pieces = shorten_exists(pieces)
    return [piece.strip() for piece in pieces]

def percentile_sort(df, perc_num = 101):
    """Order the rows of ``df`` by how high a percentile their score tuple dominates.

    ``df["score_tuple"]`` holds one equally sized numeric tuple per row. For
    ``perc_num`` evenly spaced percentiles between 0 and 100, the per-component
    percentile of all tuples is computed. A row's rank is the position, counted
    from the highest percentile downwards, of the first percentile vector that
    the row meets or exceeds in every component (``perc_num`` if there is none).
    Rows are returned sorted by ascending rank.
    """
    scores = np.asarray(df["score_tuple"].values.tolist())
    percentiles = np.linspace(0, 100, perc_num).tolist()
    # One row of per-component thresholds per percentile, highest percentile first.
    thresholds_desc = np.stack(
        [np.percentile(scores, p, axis = 0) for p in percentiles]
    )[::-1]

    def rank_of(score_row):
        for position, threshold in enumerate(thresholds_desc):
            # Dominated when no component falls below the threshold.
            if min(score_row - threshold) >= 0:
                return position
        return len(thresholds_desc)

    ranks = [rank_of(scores[row_idx, :]) for row_idx in range(scores.shape[0])]
    order = np.argsort(pd.Series(ranks).values)
    return df.iloc[order]

def repeat_score(l, slice_size = 200 ,sim_threshold = 70):
    """Count utterances that fuzzy-repeat an earlier one.

    The items of ``l`` are processed in sorted order. For items containing
    ``":"``, everything up to the first colon is dropped and the remaining
    colons are removed. An item counts as a repeat when ``fuzz.ratio`` between
    its first ``slice_size`` characters and those of any previously seen item
    exceeds ``sim_threshold``.

    ``l`` must be a ``list`` and is not modified. Returns the repeat count.
    """
    assert type(l) == list
    seen = set()
    repeats = 0
    # sorted() already builds a new list, so the caller's list stays untouched.
    for utterance in sorted(l):
        if ":" in utterance:
            utterance = utterance.partition(":")[2].replace(":", "")
        if seen and any(
            fuzz.ratio(previous[:slice_size], utterance[:slice_size]) > sim_threshold
            for previous in seen
        ):
            repeats += 1
        seen.add(utterance)
    return repeats

def sample_pred(context, times = 5, stdf_prompt_expander = lambda _: _):
    """Sample several dialogues for ``context`` and rank them.

    For each of ``times`` rounds the context is passed through
    ``stdf_prompt_expander`` and a dialogue is generated with ``simple_pred``.
    Every sample is scored with:

    * ``fuzz``      - ``fuzz.ratio`` between ``context`` and the joined dialogue;
    * ``max_fuzz``  - highest ``fuzz.ratio`` between ``context`` and any single
      utterance (0 for an empty dialogue);
    * ``length``    - number of utterances;
    * ``rpt_score`` - ``repeat_score`` of the dialogue.

    ``score_tuple`` is ``(fuzz, -max_fuzz, length, -rpt_score)`` and the frame is
    ordered with ``percentile_sort``.

    Args:
        context: source text the dialogue should be about.
        times: number of samples to draw; values <= 0 yield an empty frame.
        stdf_prompt_expander: callable applied to ``context`` before each sample.

    Returns:
        DataFrame with columns ``context``, ``dialogue``, ``fuzz``,
        ``max_fuzz``, ``length``, ``rpt_score`` and ``score_tuple``.
    """
    result_columns = [
        "context", "dialogue", "fuzz", "max_fuzz", "length", "rpt_score", "score_tuple",
    ]
    if times <= 0:
        # Nothing to sample: return an empty frame with the usual columns
        # instead of failing on the column assignment below.
        return pd.DataFrame(columns = result_columns)

    df_req = []
    for _ in tqdm(range(times)):
        ele = stdf_prompt_expander(context)
        l = simple_pred(ele, do_sample = True)
        df_req.append([ele, l])
    df = pd.DataFrame(df_req, columns = ["context", "dialogue"])
    df["fuzz"] = df["dialogue"].map(
        lambda x: fuzz.ratio(context, " ".join(x))
    )
    # default = 0 keeps an empty dialogue from raising "max() arg is an empty sequence".
    df["max_fuzz"] = df["dialogue"].map(
        lambda x: max(map(lambda y: fuzz.ratio(y, context), x), default = 0)
    )
    df["length"] = df["dialogue"].map(len)
    df["rpt_score"] = df["dialogue"].map(repeat_score)
    # max_fuzz and rpt_score are negated so that a larger tuple component is
    # always the preferred one when percentile_sort compares tuples.
    df["score_tuple"] = list(zip(
        df["fuzz"], -1 * df["max_fuzz"], df["length"], -1 * df["rpt_score"]
    ))
    df = percentile_sort(df)
    return df

def sample_pred_wrapper(context, i2c_obj, times = 5, extend_by_diffusion = False):
    """Run ``sample_pred`` on a text context or on the caption of an image.

    When ``context`` ends with ``.jpg``, ``.png`` or ``.jpeg`` it is treated as
    an image location: ``i2c_obj.predict_to_df`` is called with it and the first
    value of the resulting ``"caption"`` column becomes the context.

    Args:
        context: text, or an image location with one of the suffixes above.
        i2c_obj: object exposing ``predict_to_df(list_of_images)``.
        times: number of dialogues to sample.
        extend_by_diffusion: when true, contexts are expanded with
            ``stdf_prompt_expander`` before each sample; otherwise they are
            used unchanged.

    Returns:
        The DataFrame produced by ``sample_pred``.
    """
    assert type(context) == str
    if context.endswith((".jpg", ".png", ".jpeg")):
        caption_df = i2c_obj.predict_to_df([context])
        assert caption_df.size > 0
        context = caption_df["caption"].iloc[0]
    # The caption taken from the frame must be a plain string as well.
    assert type(context) == str
    expander = stdf_prompt_expander if extend_by_diffusion else (lambda _: _)
    return sample_pred(context, times = times, stdf_prompt_expander = expander)

# Module-level image-captioning helper, built at import time on the configured
# ``device``. ``Image2Caption`` is not defined in this file; presumably it is
# provided by the star import below -- confirm.
from image2caption import *
i2c_obj = Image2Caption(device = device)

if __name__ == "__main__":
    # Demo run. The module-level ``i2c_obj`` created above is reused here;
    # importing image2caption again and building a second Image2Caption only
    # repeated the model load.

    def _show(result_df):
        # Print the sampled dialogues in the order returned by the ranking.
        print(result_df["dialogue"].values.tolist())

    # Local image -> caption -> dialogue.
    # Other local samples: "../pic/bug.jpg", "../pic/baobao.jpeg", "../pic/cat0.jpg"
    img_path = "../pic/cat.jpg"
    if os.path.exists(img_path):
        df = sample_pred_wrapper(img_path, i2c_obj = i2c_obj)
        _show(df)
    else:
        print("skip local image demo, file not found: {}".format(img_path))

    # Remote image -> caption -> dialogue.
    # Other remote sample:
    # "https://datasets-server.huggingface.co/assets/metashift/--/metashift/train/2/image/image.jpg"
    img_url = "https://datasets-server.huggingface.co/assets/metashift/--/metashift/train/6/image/image.jpg"

    df = sample_pred_wrapper(img_url, i2c_obj = i2c_obj)
    _show(df)

    # Plain text context.
    text = "Goldfinger is the seventh novel in Ian Fleming's James Bond series. First published in 1959, it centres on Bond's investigation into the gold-smuggling activities of Auric Goldfinger, who is suspected of being connected to Soviet counter-intelligence. "

    df = sample_pred_wrapper(text, i2c_obj = i2c_obj, times = 6)
    _show(df)

    # Short caption-like contexts, without and with prompt expansion.
    en_l = ['a statue of a bird on top of a rock',
     'a woman standing in front of a flower arrangement',
     'people walking down a dirt road',
     'two pictures of a man with a beard',
     'a sign that is on top of a sign',
     'a woman dressed in a costume holding an umbrella',
     'a woman in a red dress holding a flower in her hand',
     'a little girl in a pink dress with a pink flower in her hair']

    df = sample_pred(en_l[0], 5)
    _show(df)

    df = sample_pred(en_l[0], 5, stdf_prompt_expander = stdf_prompt_expander)
    _show(df)