Spaces:

cameltech
/

japanese-gpt-1b-PII-masking-demo

Sleeping

App Files Files Community

japanese-gpt-1b-PII-masking-demo / app.py

ksuzuki01

Update app.py

7916eca verified 8 months ago

raw

history blame

3.43 kB

	import gradio as gr
	import torch
	from transformers import AutoModelForCausalLM, AutoTokenizer

	# Hugging Face Hub identifier used for both the model weights and the tokenizer.
	model_name = "cameltech/japanese-gpt-1b-PII-masking"

	model = AutoModelForCausalLM.from_pretrained(model_name)
	# Move the model to the GPU when CUDA is available; otherwise leave it
	# wherever from_pretrained placed it.
	if torch.cuda.is_available():
	    model = model.to("cuda")

	tokenizer = AutoTokenizer.from_pretrained(model_name)

	# Prompt header that generate() puts in front of the user's text. English gist:
	# "# Task / Mask the personal information in the input text / # Input text".
	instruction = "# タスク\n入力文中の個人情報をマスキングせよ\n\n# 入力文\n"

	def preprocess(text):
	    """Return *text* with every newline character replaced by the marker ``<LB>``.

	    All other characters are passed through unchanged.
	    """
	    pieces = text.split("\n")
	    return "<LB>".join(pieces)

	def postprocess(text):
	    """Return *text* with every ``<LB>`` marker replaced by a newline character.

	    All other characters are passed through unchanged.
	    """
	    pieces = text.split("<LB>")
	    return "\n".join(pieces)

	# Keyword arguments that generate() forwards unchanged to model.generate().
	# The end-of-sequence and padding ids are taken from the loaded tokenizer.
	generation_config = dict(
	    max_new_tokens=256,
	    num_beams=3,
	    num_return_sequences=1,
	    early_stopping=True,
	    eos_token_id=tokenizer.eos_token_id,
	    pad_token_id=tokenizer.pad_token_id,
	    repetition_penalty=3.0,
	)

	def generate(input_text):
	    """Run the masking model on *input_text* and return the generated text.

	    The prompt is the module-level ``instruction`` followed by *input_text* and
	    a ``<SEP>`` marker, with line breaks encoded by ``preprocess``. Only the
	    token ids located after the prompt are decoded, and ``postprocess`` is
	    applied to the decoded text.

	    Args:
	        input_text: Text in which personal information should be masked.

	    Returns:
	        The decoded continuation with ``<LB>`` markers turned back into
	        newlines.
	    """
	    prompt = preprocess(instruction + input_text + "<SEP>")

	    # Inference only, so gradient tracking is switched off.
	    with torch.no_grad():
	        prompt_ids = tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt")
	        generated_ids = model.generate(prompt_ids.to(model.device), **generation_config)

	    # Skip the first `prompt_length` positions of the first returned sequence;
	    # this assumes the output begins with the prompt tokens.
	    prompt_length = prompt_ids.size(1)
	    continuation = generated_ids.tolist()[0][prompt_length:]
	    decoded = tokenizer.decode(continuation, skip_special_tokens=True)
	    return postprocess(decoded)

	# Introductory text passed to gr.Interface as `description=`.
	# English gist: "This demo site lets you try the automatic PII-masking model
	# 'japanese-gpt-1b-PII-masking'. Enter the text to mask in `input` and press
	# Submit; within a few seconds the masked text is shown in `output`."
	description: str = """本デモサイトでは個人情報自動マスキングモデル「japanese-gpt-1b-PII-masking」を体験できます。
	以下の`input`に、個人情報をマスキングしたいテキストデータを入力し「Submit」ボタンを押すと、数秒で処理が完了し`output`に個人情報がマスキングされたテキストデータが表示されます。
	"""
	# Markdown passed to gr.Interface as `article=`: a table mapping each mask
	# tag to the kind of personal information it replaces (name, birthday, phone
	# number, mail address, customer id, address, post code, company), followed by
	# a note that all personal data used for training, evaluation and the demo is
	# fictitious.
	#
	# The column delimiters are plain `|` so the rows form a Markdown table; a
	# backslash-escaped pipe would be treated as literal text. The angle brackets
	# of each tag stay backslash-escaped for Markdown (so they are not read as
	# HTML tags) and are spelled `\\<` / `\\>` here because `\<` is not a valid
	# escape sequence in a non-raw Python string.
	article: str = """個人情報は以下の対応関係でマスキングされます。
	| タグ | 項目 |
	| ---- | ---- |
	| \\<name\\> | 氏名 |
	| \\<birthday\\> | 生年月日 |
	| \\<phone-number\\> | 電話番号 |
	| \\<mail-address\\> | メールアドレス |
	| \\<customer-id\\> | 会員番号・ID |
	| \\<address\\> | 住所 |
	| \\<post-code\\> | 郵便番号 |
	| \\<company\\> | 会社名 |

	※なお、本モデルの学習・評価・デモで用いている個人情報データはすべて架空のものを使用しています。
	"""
	# Sample inputs offered as examples in the interface (see `examples=` in the
	# gr.Interface call). Per the note in `article`, the personal data is fictitious.
	# Sample 1: a single utterance containing a mail address, a street address and a post code.
	example_input1 = """yabe13@example.co.jpですね。ご確認ありがとうございます。お住まいは東京都江戸川区西瑞江3-1-7、郵便番号は168-5329でよろしいでしょうか？"""
	# Sample 2: a single utterance containing a person's name and a birthday.
	example_input2 = """東尾亮介さま、生年月日は2013年8月1日ということですね。お問い合わせの内容について教えていただけますか？"""
	# Sample 3: a three-line operator/customer dialogue with a phone number, an
	# address, a name and a post code; exercises multi-line input.
	example_input3 = """オペレーター：ありがとうございます。電話番号を03-2788-7631、住所を東京都台東区浅草橋4-3-8プラネ111とお聞きしました。郵便番号もお持ちでしょうか？
	中島純治様：あ、郵便番号は、137-6077です。
	オペレーター：137-6077、了解しました。それでは、中島様のご質問について伺ってもよろしいでしょうか？"""

	# Single-textbox-in, single-textbox-out UI around generate(). Each example is
	# wrapped in its own one-element list because the interface has exactly one
	# input component.
	iface = gr.Interface(
	    fn=generate,
	    inputs=gr.Textbox(label="input"),
	    outputs=gr.Textbox(label="output"),
	    title="Japanese GPT 1B PII Masking DEMO",
	    description=description,
	    article=article,
	    examples=[[sample] for sample in (example_input1, example_input2, example_input3)],
	    cache_examples=True,
	)

	# Start serving the interface as soon as the script is executed.
	iface.launch()