svjack committed
Commit 32fcb0c
1 Parent(s): 87feba0

Upload with huggingface_hub

Files changed (5):
  1. app.py +33 -0
  2. image2caption.py +70 -0
  3. predict.py +47 -0
  4. requirements.txt +5 -0
  5. summary_reverse_pred_eng_native.py +227 -0
app.py ADDED
@@ -0,0 +1,33 @@
+ from summary_reverse_pred_eng_native import *
+ import gradio as gr
+
+ #text0 = "Hurricane Gert was a large tropical cyclone that caused severe flooding in Mexico and across Central America in September 1993. It formed from a tropical wave over the southwestern Caribbean Sea on September 14, made landfall in Nicaragua the next day, crossed Honduras, and regained tropical-storm strength over the Gulf of Honduras on September 17, only to weaken back to a tropical depression over Belize a day later. After crossing the Yucatan Peninsula it strengthened into a Category 2 hurricane on September 20 and struck Mexico near Tuxpan, Veracruz. It was downgraded to a tropical depression as it entered the Pacific from Nayarit on September 21 and dissipated over open water five days later."
+ #text1 = "Shanhuba is a floodplain shoal in the Yangtze River, lying on the left side of the main channel in the Yuzhong District reach of Chongqing, close to the Yuzhong Peninsula. Formerly split between the Caiyuanba and Shibanpo subdistricts of Chongqing's Shizhong District, it now belongs to the Shibanpo community of Caiyuanba subdistrict, Yuzhong District. A naturally deposited alluvial sandbar in a slack reach of the upper Yangtze, roughly spindle- or ellipse-shaped, it is about 1800 m long and 600 m wide and covered with pebbles and waterweed. It is submerged by the summer floods every year, stands above water the rest of the time, and joins the left bank during the dry season."
+
+ text0 = "The Wisconsin Territorial Centennial half dollar was designed by David Parsons and Benjamin Hawkins and minted by the United States Bureau of the Mint in 1936. The obverse (pictured) depicts a pick axe and lead ore, referring to the lead mining in early Wisconsin"
+ #text1 = ""
+
+ example_sample = [
+     [text0, False],
+     #[text1, False],
+ ]
+
+ def demo_func(prefix, do_sample):
+     # simple_pred is provided by summary_reverse_pred_eng_native
+     l = simple_pred(prefix, do_sample=do_sample)
+     return {
+         "Dialogue Context": l
+     }
+
+ demo = gr.Interface(
+     fn=demo_func,
+     inputs=[gr.Text(label="Context"),
+             gr.Checkbox(label="do sample"),
+             ],
+     outputs="json",
+     title="English Context Dialogue Generator 🦅 demonstration",
+     examples=example_sample if example_sample else None,
+     cache_examples=False,
+ )
+
+ demo.launch(server_name=None, server_port=None)
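
For a quick sanity check without launching the web UI, the predictor that app.py wraps can be called directly. A minimal sketch, assuming the checkpoints download successfully (note that importing the module also loads the captioning and prompt-extend models):

    # hypothetical smoke test, not part of the commit
    from summary_reverse_pred_eng_native import simple_pred

    turns = simple_pred(
        "The Wisconsin Territorial Centennial half dollar was designed by David Parsons.",
        do_sample=False,
    )
    print(turns)  # a list of strings, one dialogue turn per element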
image2caption.py ADDED
@@ -0,0 +1,70 @@
+ ##### image pred
+ from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer
+ import torch
+ from PIL import Image
+ import pandas as pd
+ import requests
+
+ class Image2Caption(object):
+     def __init__(self, model_path="nlpconnect/vit-gpt2-image-captioning",
+                  device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
+                  overwrite_encoder_checkpoint_path=None,
+                  overwrite_token_model_path=None
+                  ):
+         assert isinstance(overwrite_token_model_path, str) or overwrite_token_model_path is None
+         assert isinstance(overwrite_encoder_checkpoint_path, str) or overwrite_encoder_checkpoint_path is None
+         if overwrite_token_model_path is None:
+             overwrite_token_model_path = model_path
+         if overwrite_encoder_checkpoint_path is None:
+             overwrite_encoder_checkpoint_path = model_path
+         self.device = device
+         self.model = VisionEncoderDecoderModel.from_pretrained(model_path)
+         self.feature_extractor = ViTFeatureExtractor.from_pretrained(overwrite_encoder_checkpoint_path)
+         self.tokenizer = AutoTokenizer.from_pretrained(overwrite_token_model_path)
+         self.model = self.model.to(self.device)
+
+     def predict_to_df(self, image_paths):
+         img_caption_pred = self.predict_step(image_paths)
+         img_caption_df = pd.DataFrame(list(zip(image_paths, img_caption_pred)))
+         img_caption_df.columns = ["img", "caption"]
+         return img_caption_df
+         #img_caption_df.to_html(escape=False, formatters=dict(Country=path_to_image_html))
+
+     def predict_step(self, image_paths, max_length=128, num_beams=4):
+         gen_kwargs = {"max_length": max_length, "num_beams": num_beams}
+         images = []
+         for image_path in image_paths:
+             # accept either an http(s) URL or a local file path
+             if image_path.startswith("http"):
+                 i_image = Image.open(
+                     requests.get(image_path, stream=True).raw
+                 )
+             else:
+                 i_image = Image.open(image_path)
+
+             if i_image.mode != "RGB":
+                 i_image = i_image.convert(mode="RGB")
+             images.append(i_image)
+
+         pixel_values = self.feature_extractor(images=images, return_tensors="pt").pixel_values
+         pixel_values = pixel_values.to(self.device)
+
+         output_ids = self.model.generate(pixel_values, **gen_kwargs)
+
+         preds = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)
+         preds = [pred.strip() for pred in preds]
+         return preds
+
+ def path_to_image_html(path):
+     return '<img src="' + path + '" width="60" >'
+
+ if __name__ == "__main__":
+     i2c_obj = Image2Caption()
+     i2c_tiny_zh_obj = Image2Caption("svjack/vit-gpt-diffusion-zh",
+                                     overwrite_encoder_checkpoint_path="google/vit-base-patch16-224",
+                                     overwrite_token_model_path="IDEA-CCNL/Wenzhong-GPT2-110M"
+                                     )
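
A minimal usage sketch for the class; the image URL below is only a placeholder, and the default nlpconnect/vit-gpt2-image-captioning weights are fetched on first use:

    # hypothetical usage, not part of the commit
    from image2caption import Image2Caption

    i2c = Image2Caption()
    df = i2c.predict_to_df(["https://example.com/cat.jpg"])  # placeholder URL
    print(df["caption"].iloc[0])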
predict.py ADDED
@@ -0,0 +1,47 @@
+ class Obj:
+     def __init__(self, model, tokenizer, device="cpu"):
+         self.model = model
+         self.tokenizer = tokenizer
+         self.device = device
+         self.model = self.model.to(self.device)
+
+     def predict(
+         self,
+         source_text: str,
+         max_length: int = 512,
+         num_return_sequences: int = 1,
+         num_beams: int = 2,
+         top_k: int = 50,
+         top_p: float = 0.95,
+         do_sample: bool = True,
+         repetition_penalty: float = 2.5,
+         length_penalty: float = 1.0,
+         early_stopping: bool = True,
+         skip_special_tokens: bool = True,
+         clean_up_tokenization_spaces: bool = True,
+     ):
+         input_ids = self.tokenizer.encode(
+             source_text, return_tensors="pt", add_special_tokens=True
+         )
+         input_ids = input_ids.to(self.device)
+         generated_ids = self.model.generate(
+             input_ids=input_ids,
+             num_beams=num_beams,
+             max_length=max_length,
+             repetition_penalty=repetition_penalty,
+             length_penalty=length_penalty,
+             early_stopping=early_stopping,
+             top_p=top_p,
+             top_k=top_k,
+             num_return_sequences=num_return_sequences,
+             do_sample=do_sample,
+         )
+         preds = [
+             self.tokenizer.decode(
+                 g,
+                 skip_special_tokens=skip_special_tokens,
+                 clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+             )
+             for g in generated_ids
+         ]
+         return preds
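
Obj is a thin wrapper pairing a seq2seq model with its tokenizer. summary_reverse_pred_eng_native.py constructs it with the svjack/summary-dialogue-eng T5 checkpoint; a condensed sketch of that call path:

    # mirrors the construction in summary_reverse_pred_eng_native.py
    from transformers import T5ForConditionalGeneration, T5TokenizerFast
    from predict import Obj

    tok = T5TokenizerFast.from_pretrained("svjack/summary-dialogue-eng")
    mdl = T5ForConditionalGeneration.from_pretrained("svjack/summary-dialogue-eng")
    model = Obj(mdl, tok, device="cpu")
    print(model.predict("A short summary.\nCandidates:Tom Jack", do_sample=False)[0])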
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ torch
+ transformers
+ jieba
+ rapidfuzz
+ ipykernel
+ # also imported by this commit but not listed here: gradio, pandas, tqdm, Pillow, requests
summary_reverse_pred_eng_native.py ADDED
@@ -0,0 +1,227 @@
+ #### English scope
+ #device = "cuda:0"
+ device = "cpu"
+ assert device.startswith("cpu") or device.startswith("cuda")
+
+ from predict import *
+
+ from transformers import (
+     T5ForConditionalGeneration,
+     MT5ForConditionalGeneration,
+     ByT5Tokenizer,
+     PreTrainedTokenizer,
+     T5TokenizerFast as T5Tokenizer,
+     MT5TokenizerFast as MT5Tokenizer,
+     AutoModelForSeq2SeqLM,
+     AutoTokenizer,
+     BertTokenizer,
+     GPT2LMHeadModel,
+ )
+
+ import pandas as pd
+ import numpy as np
+ import re
+ from rapidfuzz import fuzz
+ from tqdm import tqdm
+ from transformers import pipeline
+ import os
+
+ def shorten_exists(l, sim_threshold=80, slice_size=5):
+     # drop entries whose first slice_size characters nearly duplicate an earlier entry's
+     req = []
+     for ele in l:
+         if not req:
+             req.append(ele)
+         else:
+             if max(map(lambda x: fuzz.ratio(x[:slice_size], ele[:slice_size]), req)) < sim_threshold:
+                 req.append(ele)
+     return req
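+ # e.g. (illustrative) shorten_exists(["Tom: hello", "Tom: hello!", "Jack: bye"])
+ # returns ["Tom: hello", "Jack: bye"]: the second entry's 5-character prefix
+ # "Tom: " matches the first entry's exactly (ratio 100, not below 80), so it is dropped.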
40
+
41
+ model_path = "svjack/summary-dialogue-eng"
42
+ tokenizer0 = T5Tokenizer.from_pretrained(model_path)
43
+ model0 = T5ForConditionalGeneration.from_pretrained(model_path)
44
+
45
+ if device.startswith("cuda"):
46
+ model = Obj(model0, tokenizer0, device = "cuda:0")
47
+ else:
48
+ model = Obj(model0, tokenizer0, device = "cpu")
49
+
50
+ if device.startswith("cuda"):
51
+ prompt_expand_model = pipeline('text-generation', model='daspartho/prompt-extend',
52
+ device = 0
53
+ )
54
+ else:
55
+ prompt_expand_model = pipeline('text-generation', model='daspartho/prompt-extend',
56
+ )
57
+
58
+ def loop_add(l, names = ["Tom", "Jack"]):
59
+ req = []
60
+ for i in range(len(l)):
61
+ ii = int(i % len(names))
62
+ req.append(
63
+ "{}:{}".format(names[ii], l[i])
64
+ )
65
+ return req
66
+
67
+ #### need some names drop in context(may not have ":")
68
+ #### '艾米-亚当斯在《沉睡的空洞》中,全身,双色大眼睛,咬牙切齿,恐怖,复杂的细节,电影,史诗,现实,解剖,汤姆-哈努卡,上光,艺术站,逼真,可怕'
69
+ def guess_name_candidates(context, cnt_threshold = 1):
70
+ from copy import deepcopy
71
+ assert type(context) == type("")
72
+ import re
73
+ l = re.findall(r"[\u4e00-\u9fa5a-zA-Z]+:", context)
74
+ l = list(filter(lambda x: x.strip(), l))
75
+ ori_l = deepcopy(l)
76
+ if not l:
77
+ return []
78
+ s = pd.Series(l).value_counts()
79
+ l = pd.Series(s[s > cnt_threshold].index.values.tolist()).map(lambda x: x[:-1]).values.tolist()
80
+ for ele in ori_l:
81
+ if len(ele[:-1]) not in l and (len(ele[:-1]) <= 3 or (
82
+ sum(map(len ,re.findall(r"[a-zA-Z]+:", ele))) == len(ele)
83
+ )):
84
+ l.append(ele[:-1])
85
+ l = list(set(l))
86
+ return l
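+ # e.g. (illustrative) guess_name_candidates("Tom: hi\nJack: bye\nTom: again")
+ # returns ["Tom", "Jack"] (set order): "Tom:" clears the count threshold, and
+ # "Jack:" qualifies as ASCII letters followed by ":".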
87
+
88
+ def stdf_prompt_expander(x):
89
+ assert type(x) == type("")
90
+ return prompt_expand_model(x, num_return_sequences=1)[0]["generated_text"]
91
+
92
+ def simple_pred(summary, candidates = ["Tom", "Jack"], shorten_it = False,
93
+ summary_expander = lambda _:_, do_sample = True):
94
+ assert callable(summary_expander)
95
+ summary = summary_expander(summary)
96
+ pred_text = model.predict(
97
+ "{}\nCandidates:{}".format(summary, " ".join(candidates)),
98
+ do_sample = do_sample
99
+ )[0]
100
+ candidates_ = guess_name_candidates(pred_text)
101
+ l = re.split("{}".format("|".join(map(lambda x: "{}:".format(x), candidates_))) ,pred_text)
102
+ l = list(filter(lambda x: x.strip(), l))
103
+ if shorten_it:
104
+ l = shorten_exists(l)
105
+ #l = loop_add(l, candidates)
106
+ l = list(map(lambda x: x.strip(), l))
107
+ return l
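+ # The model input is the summary plus a "Candidates:" line naming the speakers,
+ # e.g. "Some summary\nCandidates:Tom Jack"; the raw prediction is then split back
+ # into one string per dialogue turn.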
108
+
109
+ def percentile_sort(df, perc_num = 101):
110
+ score_tuple_s = df["score_tuple"]
111
+ score_array = np.asarray(score_tuple_s.values.tolist())
112
+ perc_list = np.linspace(0, 100, perc_num).tolist()
113
+ low_to_high_perc_array = np.stack(list(map(lambda p: np.percentile(score_array, p, axis = 0), perc_list)))
114
+
115
+ def get_rank(array_):
116
+ lookup_list = pd.DataFrame(array_ - low_to_high_perc_array[::-1]).apply(lambda s: min(s) >= 0, axis = 1).tolist()
117
+ if True not in lookup_list:
118
+ return len(lookup_list)
119
+ return lookup_list.index(True)
120
+
121
+ rank_list = []
122
+ for i in range(score_array.shape[0]):
123
+ rank_list.append(get_rank(score_array[i, :]))
124
+
125
+ rank_s = pd.Series(rank_list)
126
+ return df.iloc[np.argsort(rank_s.values)]
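+ # A row ranks higher the more percentile thresholds its whole score tuple clears
+ # at once, so dialogues that are good on every criterion simultaneously come first.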
127
+
128
+ def repeat_score(l, slice_size = 200 ,sim_threshold = 70):
129
+ from copy import deepcopy
130
+ assert type(l) == type([])
131
+ l = deepcopy(l)
132
+ l = sorted(l)
133
+ cnt_num = 0
134
+ set0 = set([])
135
+ for ele in l:
136
+ if ":" in ele:
137
+ ele = "".join(ele.split(":")[1:])
138
+ if set0 and max(map(lambda x: fuzz.ratio(x[:slice_size], ele[:slice_size]), set0)) > sim_threshold:
139
+ #if ele in set0:
140
+ cnt_num += 1
141
+ set0.add(ele)
142
+ return cnt_num
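+ # e.g. (illustrative) repeat_score(["Tom: hi there", "Jack: hi there!"]) == 1:
+ # once the speaker prefixes are stripped, the two turns are > 70% similar.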
143
+
144
+ def sample_pred(context, times = 5, stdf_prompt_expander = lambda _: _):
145
+ df_req = []
146
+ for i in tqdm(range(times)):
147
+ ele = stdf_prompt_expander(context)
148
+ #ele = context
149
+ l = simple_pred(ele, do_sample = True)
150
+ df_req.append(
151
+ [ele, l]
152
+ )
153
+ df = pd.DataFrame(df_req)
154
+ df.columns = ["context", "dialogue"]
155
+ df["fuzz"] = df["dialogue"].map(
156
+ lambda x: fuzz.ratio(context, " ".join(x))
157
+ )
158
+ df["max_fuzz"] = df["dialogue"].map(
159
+ lambda x: max(map(lambda y: fuzz.ratio(y, context), x))
160
+ )
161
+ df["length"] = df["dialogue"].map(len)
162
+ df["rpt_score"] = df["dialogue"].map(repeat_score)
163
+ df["score_tuple"] = df.apply(
164
+ lambda x: (x["fuzz"], -1 * x["max_fuzz"], x["length"], -1 * x["rpt_score"]), axis = 1
165
+ )
166
+ df = percentile_sort(df)
167
+ return df
168
+
169
+ def sample_pred_wrapper(context, i2c_obj, times = 5, extend_by_diffusion = False):
170
+ assert type(context) == type("")
171
+ if any(map(lambda x: context.endswith(x), [".jpg", ".png", ".jpeg"])):
172
+ img_path = context
173
+ i2c_df = i2c_obj.predict_to_df([img_path])
174
+ assert i2c_df.size > 0
175
+ context = i2c_df["caption"].iloc[0]
176
+ else:
177
+ pass
178
+ assert type(context) == type("")
179
+ if extend_by_diffusion:
180
+ req_df = sample_pred(context, times = times, stdf_prompt_expander = stdf_prompt_expander)
181
+ else:
182
+ req_df = sample_pred(context, times = times, stdf_prompt_expander = lambda _: _)
183
+ return req_df
+
+ from image2caption import *
+ i2c_obj = Image2Caption(device=device)
+
+ if __name__ == "__main__":
+     from image2caption import *
+     i2c_obj = Image2Caption(device=device)
+
+     img_path = "../pic/bug.jpg"
+     img_path = "../pic/baobao.jpeg"
+     img_path = "../pic/cat0.jpg"
+     img_path = "../pic/cat.jpg"
+     assert os.path.exists(img_path)
+
+     df = sample_pred_wrapper(img_path, i2c_obj=i2c_obj)
+     print(df["dialogue"].values.tolist())
+
+     img_url = "https://datasets-server.huggingface.co/assets/metashift/--/metashift/train/2/image/image.jpg"
+     img_url = "https://datasets-server.huggingface.co/assets/metashift/--/metashift/train/6/image/image.jpg"
+
+     df = sample_pred_wrapper(img_url, i2c_obj=i2c_obj)
+     print(df["dialogue"].values.tolist())
+
+     text = "Goldfinger is the seventh novel in Ian Fleming's James Bond series. First published in 1959, it centres on Bond's investigation into the gold-smuggling activities of Auric Goldfinger, who is suspected of being connected to Soviet counter-intelligence. "
+
+     df = sample_pred_wrapper(text, i2c_obj=i2c_obj, times=6)
+     print(df["dialogue"].values.tolist())
+
+     en_l = ['a statue of a bird on top of a rock',
+             'a woman standing in front of a flower arrangement',
+             'people walking down a dirt road',
+             'two pictures of a man with a beard',
+             'a sign that is on top of a sign',
+             'a woman dressed in a costume holding an umbrella',
+             'a woman in a red dress holding a flower in her hand',
+             'a little girl in a pink dress with a pink flower in her hair']
+
+     df = sample_pred(en_l[0], 5)
+     print(df["dialogue"].values.tolist())
+
+     df = sample_pred(en_l[0], 5, stdf_prompt_expander=stdf_prompt_expander)
+     print(df["dialogue"].values.tolist())