Spaces:

llm-book
/

simcse-demo

Running

App Files Files Community

singletongue committed on Jul 1, 2023

Commit

5b29d9a

•

1 Parent(s): 27212ed

Add files

Browse files

Files changed (19) hide show

app.py +97 -0
outputs_unsup_simcse/embedded_paragraphs/data-00000-of-00010.arrow +3 -0
outputs_unsup_simcse/embedded_paragraphs/data-00001-of-00010.arrow +3 -0
outputs_unsup_simcse/embedded_paragraphs/data-00002-of-00010.arrow +3 -0
outputs_unsup_simcse/embedded_paragraphs/data-00003-of-00010.arrow +3 -0
outputs_unsup_simcse/embedded_paragraphs/data-00004-of-00010.arrow +3 -0
outputs_unsup_simcse/embedded_paragraphs/data-00005-of-00010.arrow +3 -0
outputs_unsup_simcse/embedded_paragraphs/data-00006-of-00010.arrow +3 -0
outputs_unsup_simcse/embedded_paragraphs/data-00007-of-00010.arrow +3 -0
outputs_unsup_simcse/embedded_paragraphs/data-00008-of-00010.arrow +3 -0
outputs_unsup_simcse/embedded_paragraphs/data-00009-of-00010.arrow +3 -0
outputs_unsup_simcse/embedded_paragraphs/dataset_info.json +83 -0
outputs_unsup_simcse/embedded_paragraphs/state.json +40 -0
outputs_unsup_simcse/encoder/config.json +25 -0
outputs_unsup_simcse/encoder/pytorch_model.bin +3 -0
outputs_unsup_simcse/encoder/special_tokens_map.json +7 -0
outputs_unsup_simcse/encoder/tokenizer_config.json +21 -0
outputs_unsup_simcse/encoder/vocab.txt +0 -0
requirements.txt +5 -0

app.py ADDED Viewed

	@@ -0,0 +1,97 @@

+import datasets
+import faiss
+import numpy as np
+import streamlit as st
+import torch
+from datasets import Dataset
+from transformers import FeatureExtractionPipeline, pipeline
+@st.cache_resource
+def load_encoder_pipeline(encoder_path: str) -> FeatureExtractionPipeline:
+    """訓練済みの教師なしSimCSEのエンコーダを読み込む"""
+    encoder_pipeline = pipeline("feature-extraction", model=encoder_path)
+    return encoder_pipeline
+@st.cache_resource
+def load_dataset(dataset_dir: str) -> Dataset:
+    """文埋め込み適用済みのデータセットを読み込み、Faissのインデックスを構築"""
+    # ディスクに保存されたデータセットを読み込む
+    dataset = datasets.load_from_disk(dataset_dir)
+    # データセットの"embeddings"フィールドの値からFaissのインデックスを構築する
+    emb_dim = len(dataset[0]["embeddings"])
+    index = faiss.IndexFlatIP(emb_dim)
+    dataset.add_faiss_index("embeddings", custom_index=index)
+    return dataset
+def embed_text(
+    text: str, encoder_pipeline: FeatureExtractionPipeline
+) -> np.ndarray:
+    """教師なしSimCSEのエンコーダを用いてテキストの埋め込みを計算"""
+    with torch.inference_mode():
+        # encoder_pipelineが返すTensorのsizeは(1, トークン数, 埋め込みの次元数)
+        encoded_text = encoder_pipeline(text, return_tensors="pt")[0][0]
+    # ベクトルをNumPyのarrayに変換
+    emb = encoded_text.cpu().numpy().astype(np.float32)
+    # ベクトルのノルムが1になるように正規化
+    emb = emb / np.linalg.norm(emb)
+    return emb
+def search_similar_texts(
+    query_text: str,
+    dataset: Dataset,
+    encoder_pipeline: FeatureExtractionPipeline,
+    k: int = 5,
+) -> list[dict[str, float | str]]:
+    """モデルとデータセットを用いてクエリの類似文検索を実行"""
+    # クエリに対して類似テキストをk件取得する
+    scores, retrieved_examples = dataset.get_nearest_examples(
+        "embeddings", embed_text(query_text, encoder_pipeline), k=k
+    )
+    titles = retrieved_examples["title"]
+    texts = retrieved_examples["text"]
+    # 検索された類似テキストをdictのlistにして返す
+    results = [
+        {"score": score, "title": title, "text": text}
+        for score, title, text in zip(scores, titles, texts)
+    ]
+    return results
+# 訓練済みの教師なしSimCSEのモデルを読み込む
+encoder_pipeline = load_encoder_pipeline("outputs_unsup_simcse/encoder")
+# 文埋め込み適用済みのデータセットを読み込む
+dataset = load_dataset("outputs_unsup_simcse/embedded_paragraphs")
+# デモページのタイトルを表示する
+st.title(":mag: Wikipedia Paragraph Search")
+# デモページのフォームを表示する
+with st.form("input_form"):
+    # クエリの入力欄を表示し、入力された値を受け取る
+    query_text = st.text_input(
+        "クエリを入力:", value="日本語は、主に日本で話されている言語である。", max_chars=150
+    )
+    # 検索する段落数のスライダーを表示し、設定された値を受け取る
+    k = st.slider("検索する段落数:", min_value=1, max_value=100, value=10)
+    # 検索を実行するボタンを表示し、押下されたらTrueを受け取る
+    is_submitted = st.form_submit_button("Search")
+# 検索結果を表示する
+if is_submitted and len(query_text) > 0:
+    # クエリに対して類似文検索を実行し、検索結果を受け取る
+    serach_results = search_similar_texts(
+        query_text, dataset, encoder_pipeline, k=k
+    )
+    # 検索結果を表示する
+    st.subheader("検索結果")
+    st.dataframe(serach_results, use_container_width=True)
+    st.caption("セルのダブルクリックで全体が表示されます")

outputs_unsup_simcse/embedded_paragraphs/data-00000-of-00010.arrow ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:57b90263da12e6f9eaa44d91172dd1f5f015ef6c0c54d61e7d54bccc6b79b759
+size 458351816

outputs_unsup_simcse/embedded_paragraphs/data-00001-of-00010.arrow ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b3fde1ef4827099de7d8689bdbf02e3180c5227f0ca2b03e2c24da46bacbb49d
+size 458002304

outputs_unsup_simcse/embedded_paragraphs/data-00002-of-00010.arrow ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:91f72a3f71b068f9008a289e4a361cba0a880bb25aa4da5453bb2463d3b3f454
+size 456771176

outputs_unsup_simcse/embedded_paragraphs/data-00003-of-00010.arrow ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5807ae91181b76a6836b65cf5c1092314cc126935be10bafc7e85b79500bc76a
+size 457297584

outputs_unsup_simcse/embedded_paragraphs/data-00004-of-00010.arrow ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:305256fabd2246f74dcd4a980d9ab6c3dced5327e64f7f992f2ee0eebb8a8d18
+size 456882896

outputs_unsup_simcse/embedded_paragraphs/data-00005-of-00010.arrow ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:76cb3d35b85b02119e0d5de32782f1b53ce20e166b312f028288b95fdce6e2e5
+size 456954640

outputs_unsup_simcse/embedded_paragraphs/data-00006-of-00010.arrow ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3936501890f32e56a54c5ec091891421e1511e6b0c3d43d7a5511c326182998f
+size 458542088

outputs_unsup_simcse/embedded_paragraphs/data-00007-of-00010.arrow ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:81167192200edbb2e2b947d5a6bd0437ddf42b01679bf8f34e3b5067f86ed53a
+size 457251296

outputs_unsup_simcse/embedded_paragraphs/data-00008-of-00010.arrow ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7a2bf1d222cf15cb91d8789b7a1bbf17e349c39292303d70a0cdc2d29966d29f
+size 458474520

outputs_unsup_simcse/embedded_paragraphs/data-00009-of-00010.arrow ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:027b919374c985209a89cd99a849de70fb75ffcf3c5c4b610cac21d938c59d3e
+size 458407928

outputs_unsup_simcse/embedded_paragraphs/dataset_info.json ADDED Viewed

	@@ -0,0 +1,83 @@

+{
+  "builder_name": "jawiki-paragraphs",
+  "citation": "",
+  "config_name": "default",
+  "dataset_size": 4417130987,
+  "description": "\u66f8\u7c4d\u300e\u5927\u898f\u6a21\u8a00\u8a9e\u30e2\u30c7\u30eb\u5165\u9580\u300f\u3067\u4f7f\u7528\u3059\u308b Wikipedia \u6bb5\u843d\u306e\u30c7\u30fc\u30bf\u30bb\u30c3\u30c8\u3067\u3059\u3002GitHub \u30ea\u30dd\u30b8\u30c8\u30ea singletongue/wikipedia-utils \u3067\u516c\u958b\u3055\u308c\u3066\u3044\u308b\u30c7\u30fc\u30bf\u30bb\u30c3\u30c8\u3092\u5229\u7528\u3057\u3066\u3044\u307e\u3059\u3002",
+  "download_checksums": {
+    "https://github.com/singletongue/wikipedia-utils/releases/download/2023-04-03/paragraphs-jawiki-20230403.json.gz": {
+      "num_bytes": 1489512230,
+      "checksum": null
+    }
+  },
+  "download_size": 1489512230,
+  "features": {
+    "id": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "pageid": {
+      "dtype": "int64",
+      "_type": "Value"
+    },
+    "revid": {
+      "dtype": "int64",
+      "_type": "Value"
+    },
+    "paragraph_index": {
+      "dtype": "int64",
+      "_type": "Value"
+    },
+    "title": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "section": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "text": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "html_tag": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "embeddings": {
+      "feature": {
+        "dtype": "float32",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    }
+  },
+  "homepage": "https://github.com/singletongue/wikipedia-utils",
+  "license": "\u672c\u30c7\u30fc\u30bf\u30bb\u30c3\u30c8\u3067\u4f7f\u7528\u3057\u3066\u3044\u308b Wikipedia \u306e\u30b3\u30f3\u30c6\u30f3\u30c4\u306f\u3001\u30af\u30ea\u30a8\u30a4\u30c6\u30a3\u30d6\u30fb\u30b3\u30e2\u30f3\u30ba\u8868\u793a\u30fb\u7d99\u627f\u30e9\u30a4\u30bb\u30f3\u30b9 3.0 (CC BY-SA 3.0) \u304a\u3088\u3073 GNU \u81ea\u7531\u6587\u66f8\u30e9\u30a4\u30bb\u30f3\u30b9 (GFDL) \u306e\u4e0b\u306b\u914d\u5e03\u3055\u308c\u3066\u3044\u308b\u3082\u306e\u3067\u3059\u3002",
+  "size_in_bytes": 5906643217,
+  "splits": {
+    "train": {
+      "name": "train",
+      "num_bytes": 4417130987,
+      "num_examples": 9668476,
+      "shard_lengths": [
+        984321,
+        1031799,
+        1101914,
+        1132906,
+        1123001,
+        1143878,
+        1138063,
+        1139173,
+        873421
+      ],
+      "dataset_name": "jawiki-paragraphs"
+    }
+  },
+  "version": {
+    "version_str": "1.0.0",
+    "major": 1,
+    "minor": 0,
+    "patch": 0
+  }
+}

outputs_unsup_simcse/embedded_paragraphs/state.json ADDED Viewed

	@@ -0,0 +1,40 @@

+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00010.arrow"
+    },
+    {
+      "filename": "data-00001-of-00010.arrow"
+    },
+    {
+      "filename": "data-00002-of-00010.arrow"
+    },
+    {
+      "filename": "data-00003-of-00010.arrow"
+    },
+    {
+      "filename": "data-00004-of-00010.arrow"
+    },
+    {
+      "filename": "data-00005-of-00010.arrow"
+    },
+    {
+      "filename": "data-00006-of-00010.arrow"
+    },
+    {
+      "filename": "data-00007-of-00010.arrow"
+    },
+    {
+      "filename": "data-00008-of-00010.arrow"
+    },
+    {
+      "filename": "data-00009-of-00010.arrow"
+    }
+  ],
+  "_fingerprint": "8ff2a1214e978197",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": "train"
+}

outputs_unsup_simcse/encoder/config.json ADDED Viewed

	@@ -0,0 +1,25 @@

+{
+  "_name_or_path": "cl-tohoku/bert-base-japanese-v3",
+  "architectures": [
+    "BertModel"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "classifier_dropout": null,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 0,
+  "position_embedding_type": "absolute",
+  "torch_dtype": "float32",
+  "transformers_version": "4.30.2",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 32768
+}

outputs_unsup_simcse/encoder/pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:aca39ff56e5bdf8e331de99f48bc049bd2763b327f64457aa98c79bc8e98367e
+size 444899885

outputs_unsup_simcse/encoder/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "cls_token": "[CLS]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": "[UNK]"
+}

outputs_unsup_simcse/encoder/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,21 @@

+{
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "[CLS]",
+  "do_lower_case": false,
+  "do_subword_tokenize": true,
+  "do_word_tokenize": true,
+  "jumanpp_kwargs": null,
+  "mask_token": "[MASK]",
+  "mecab_kwargs": {
+    "mecab_dic": "unidic_lite"
+  },
+  "model_max_length": 512,
+  "never_split": null,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "subword_tokenizer_type": "wordpiece",
+  "sudachi_kwargs": null,
+  "tokenizer_class": "BertJapaneseTokenizer",
+  "unk_token": "[UNK]",
+  "word_tokenizer_type": "mecab"
+}

outputs_unsup_simcse/encoder/vocab.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+datasets
+faiss-cpu
+numpy
+torch
+transformers[ja,torch]