Spaces:

cameltech
/

japanese-gpt-1b-PII-masking-demo

Sleeping

File size: 3,425 Bytes

e7545b6
ba8eb8b
 
e7545b6
ba8eb8b
 
 
 
5ec7bfb
ba8eb8b
 
 
 
 
 
 
 
 
 
0c074a8
 
 
 
 
 
 
 
 
 
ba8eb8b
5ec7bfb
56dc3c9
ba8eb8b
 
 
 
 
 
 
0c074a8
ba8eb8b
 
 
 
157400d
1abc3e9
 
7916eca
1abc3e9
 
 
 
 
 
 
 
 
 
7916eca
 
1abc3e9
7916eca
 
 
157400d
 
 
ba8eb8b
 
3f4d713
157400d
1abc3e9
 
 
157400d
 
 
 
 
 
ba8eb8b
e7545b6

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hub repository id; the same id is used to load both the weights and the tokenizer.
model_name = "cameltech/japanese-gpt-1b-PII-masking"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Fixed prompt header that generate() places in front of the user's text.
# In English: "# Task / Mask the personal information in the input text / # Input text".
instruction = (
    "# タスク\n"
    "入力文中の個人情報をマスキングせよ\n"
    "\n"
    "# 入力文\n"
)

# Use the GPU when CUDA is usable; otherwise keep the model where it was loaded.
model = model.to("cuda") if torch.cuda.is_available() else model

def preprocess(text: str) -> str:
    """Encode line breaks as the literal ``<LB>`` marker.

    Each line-feed character in *text* is replaced by the four-character
    string ``<LB>``. Every other character (including carriage returns) is
    passed through unchanged.
    """
    lines = text.split("\n")
    return "<LB>".join(lines)

def postprocess(text: str) -> str:
    """Turn every literal ``<LB>`` marker in *text* back into a line feed.

    The match is case-sensitive; text without the marker is returned
    with the same content.
    """
    segments = text.split("<LB>")
    return "\n".join(segments)

# Keyword arguments that generate() forwards unchanged to model.generate().
generation_config = dict(
    max_new_tokens=256,
    num_beams=3,
    num_return_sequences=1,
    early_stopping=True,
    # End-of-sequence and padding ids are taken from the loaded tokenizer.
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
    repetition_penalty=3.0,
)

def generate(input_text: str) -> str:
    """Run the masking model on *input_text* and return its output text.

    The prompt is the module-level ``instruction`` header, the user text and
    a trailing ``<SEP>``, with line feeds encoded by ``preprocess``. Only the
    tokens produced after the prompt are decoded, and ``<LB>`` markers in the
    decoded text are converted back to line feeds by ``postprocess``.
    """
    prompt = preprocess(instruction + input_text + "<SEP>")

    # No gradients are needed for inference.
    with torch.no_grad():
        prompt_ids = tokenizer.encode(
            prompt,
            add_special_tokens=False,
            return_tensors="pt",
        )
        generated = model.generate(prompt_ids.to(model.device), **generation_config)

    # The returned sequence starts with the prompt tokens; drop them so that
    # only the newly generated part is decoded.
    prompt_length = prompt_ids.size(1)
    first_sequence = generated.tolist()[0]
    decoded = tokenizer.decode(first_sequence[prompt_length:], skip_special_tokens=True)
    return postprocess(decoded)

description: str = """本デモサイトでは個人情報自動マスキングモデル「japanese-gpt-1b-PII-masking」を体験できます。
以下の`input`に、個人情報をマスキングしたいテキストデータを入力し「Submit」ボタンを押すと、数秒で処理が完了し`output`に個人情報がマスキングされたテキストデータが表示されます。
"""
# Markdown passed to gr.Interface as ``article``: a table mapping each mask
# tag to the kind of personal information it stands for (name, birthday,
# phone number, mail address, customer id, address, post code, company),
# followed by a note that all personal data used here is fictitious.
#
# This is a raw string on purpose: the Markdown escapes "\<" and "\>" are not
# valid Python escape sequences, so in a normal string literal they trigger an
# invalid-escape warning. The raw literal yields exactly the same text.
article: str = r"""個人情報は以下の対応関係でマスキングされます。
| タグ | 項目 |
| ---- | ---- |
| \<name\> | 氏名 |
| \<birthday\> | 生年月日 |
| \<phone-number\> | 電話番号 |
| \<mail-address\> | メールアドレス |
| \<customer-id\> | 会員番号・ID |
| \<address\> | 住所 |
| \<post-code\> | 郵便番号 |
| \<company\> | 会社名 |

※なお、本モデルの学習・評価・デモで用いている個人情報データはすべて架空のものを使用しています。
"""
example_input1 = """yabe13@example.co.jpですね。ご確認ありがとうございます。お住まいは東京都江戸川区西瑞江3-1-7、郵便番号は168-5329でよろしいでしょうか？"""
example_input2 = """東尾亮介さま、生年月日は2013年8月1日ということですね。お問い合わせの内容について教えていただけますか？"""
example_input3 = """オペレーター：ありがとうございます。電話番号を03-2788-7631、住所を東京都台東区浅草橋4-3-8プラネ111とお聞きしました。郵便番号もお持ちでしょうか？
中島純治様：あ、郵便番号は、137-6077です。
オペレーター：137-6077、了解しました。それでは、中島様のご質問について伺ってもよろしいでしょうか？"""

# Build the demo UI: one text box in, one text box out, wired to generate().
iface = gr.Interface(
    fn=generate,
    inputs=gr.Textbox(label="input"),
    outputs=gr.Textbox(label="output"),
    title="Japanese GPT 1B PII Masking DEMO",
    description=description,
    article=article,
    # Each example row is a one-element list because there is a single input.
    examples=[
        [sample]
        for sample in (example_input1, example_input2, example_input3)
    ],
    cache_examples=True,
)

# Start serving the interface.
iface.launch()