kenichiro committed on
Commit
46a030d
1 Parent(s): b615e10
LICENSE ADDED
@@ -0,0 +1,201 @@
+                                  Apache License
+                            Version 2.0, January 2004
+                         http://www.apache.org/licenses/
+
+    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+    1. Definitions.
+
+       "License" shall mean the terms and conditions for use, reproduction,
+       and distribution as defined by Sections 1 through 9 of this document.
+
+       "Licensor" shall mean the copyright owner or entity authorized by
+       the copyright owner that is granting the License.
+
+       "Legal Entity" shall mean the union of the acting entity and all
+       other entities that control, are controlled by, or are under common
+       control with that entity. For the purposes of this definition,
+       "control" means (i) the power, direct or indirect, to cause the
+       direction or management of such entity, whether by contract or
+       otherwise, or (ii) ownership of fifty percent (50%) or more of the
+       outstanding shares, or (iii) beneficial ownership of such entity.
+
+       "You" (or "Your") shall mean an individual or Legal Entity
+       exercising permissions granted by this License.
+
+       "Source" form shall mean the preferred form for making modifications,
+       including but not limited to software source code, documentation
+       source, and configuration files.
+
+       "Object" form shall mean any form resulting from mechanical
+       transformation or translation of a Source form, including but
+       not limited to compiled object code, generated documentation,
+       and conversions to other media types.
+
+       "Work" shall mean the work of authorship, whether in Source or
+       Object form, made available under the License, as indicated by a
+       copyright notice that is included in or attached to the work
+       (an example is provided in the Appendix below).
+
+       "Derivative Works" shall mean any work, whether in Source or Object
+       form, that is based on (or derived from) the Work and for which the
+       editorial revisions, annotations, elaborations, or other modifications
+       represent, as a whole, an original work of authorship. For the purposes
+       of this License, Derivative Works shall not include works that remain
+       separable from, or merely link (or bind by name) to the interfaces of,
+       the Work and Derivative Works thereof.
+
+       "Contribution" shall mean any work of authorship, including
+       the original version of the Work and any modifications or additions
+       to that Work or Derivative Works thereof, that is intentionally
+       submitted to Licensor for inclusion in the Work by the copyright owner
+       or by an individual or Legal Entity authorized to submit on behalf of
+       the copyright owner. For the purposes of this definition, "submitted"
+       means any form of electronic, verbal, or written communication sent
+       to the Licensor or its representatives, including but not limited to
+       communication on electronic mailing lists, source code control systems,
+       and issue tracking systems that are managed by, or on behalf of, the
+       Licensor for the purpose of discussing and improving the Work, but
+       excluding communication that is conspicuously marked or otherwise
+       designated in writing by the copyright owner as "Not a Contribution."
+
+       "Contributor" shall mean Licensor and any individual or Legal Entity
+       on behalf of whom a Contribution has been received by Licensor and
+       subsequently incorporated within the Work.
+
+    2. Grant of Copyright License. Subject to the terms and conditions of
+       this License, each Contributor hereby grants to You a perpetual,
+       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+       copyright license to reproduce, prepare Derivative Works of,
+       publicly display, publicly perform, sublicense, and distribute the
+       Work and such Derivative Works in Source or Object form.
+
+    3. Grant of Patent License. Subject to the terms and conditions of
+       this License, each Contributor hereby grants to You a perpetual,
+       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+       (except as stated in this section) patent license to make, have made,
+       use, offer to sell, sell, import, and otherwise transfer the Work,
+       where such license applies only to those patent claims licensable
+       by such Contributor that are necessarily infringed by their
+       Contribution(s) alone or by combination of their Contribution(s)
+       with the Work to which such Contribution(s) was submitted. If You
+       institute patent litigation against any entity (including a
+       cross-claim or counterclaim in a lawsuit) alleging that the Work
+       or a Contribution incorporated within the Work constitutes direct
+       or contributory patent infringement, then any patent licenses
+       granted to You under this License for that Work shall terminate
+       as of the date such litigation is filed.
+
+    4. Redistribution. You may reproduce and distribute copies of the
+       Work or Derivative Works thereof in any medium, with or without
+       modifications, and in Source or Object form, provided that You
+       meet the following conditions:
+
+       (a) You must give any other recipients of the Work or
+           Derivative Works a copy of this License; and
+
+       (b) You must cause any modified files to carry prominent notices
+           stating that You changed the files; and
+
+       (c) You must retain, in the Source form of any Derivative Works
+           that You distribute, all copyright, patent, trademark, and
+           attribution notices from the Source form of the Work,
+           excluding those notices that do not pertain to any part of
+           the Derivative Works; and
+
+       (d) If the Work includes a "NOTICE" text file as part of its
+           distribution, then any Derivative Works that You distribute must
+           include a readable copy of the attribution notices contained
+           within such NOTICE file, excluding those notices that do not
+           pertain to any part of the Derivative Works, in at least one
+           of the following places: within a NOTICE text file distributed
+           as part of the Derivative Works; within the Source form or
+           documentation, if provided along with the Derivative Works; or,
+           within a display generated by the Derivative Works, if and
+           wherever such third-party notices normally appear. The contents
+           of the NOTICE file are for informational purposes only and
+           do not modify the License. You may add Your own attribution
+           notices within Derivative Works that You distribute, alongside
+           or as an addendum to the NOTICE text from the Work, provided
+           that such additional attribution notices cannot be construed
+           as modifying the License.
+
+       You may add Your own copyright statement to Your modifications and
+       may provide additional or different license terms and conditions
+       for use, reproduction, or distribution of Your modifications, or
+       for any such Derivative Works as a whole, provided Your use,
+       reproduction, and distribution of the Work otherwise complies with
+       the conditions stated in this License.
+
+    5. Submission of Contributions. Unless You explicitly state otherwise,
+       any Contribution intentionally submitted for inclusion in the Work
+       by You to the Licensor shall be under the terms and conditions of
+       this License, without any additional terms or conditions.
+       Notwithstanding the above, nothing herein shall supersede or modify
+       the terms of any separate license agreement you may have executed
+       with Licensor regarding such Contributions.
+
+    6. Trademarks. This License does not grant permission to use the trade
+       names, trademarks, service marks, or product names of the Licensor,
+       except as required for reasonable and customary use in describing the
+       origin of the Work and reproducing the content of the NOTICE file.
+
+    7. Disclaimer of Warranty. Unless required by applicable law or
+       agreed to in writing, Licensor provides the Work (and each
+       Contributor provides its Contributions) on an "AS IS" BASIS,
+       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+       implied, including, without limitation, any warranties or conditions
+       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+       PARTICULAR PURPOSE. You are solely responsible for determining the
+       appropriateness of using or redistributing the Work and assume any
+       risks associated with Your exercise of permissions under this License.
+
+    8. Limitation of Liability. In no event and under no legal theory,
+       whether in tort (including negligence), contract, or otherwise,
+       unless required by applicable law (such as deliberate and grossly
+       negligent acts) or agreed to in writing, shall any Contributor be
+       liable to You for damages, including any direct, indirect, special,
+       incidental, or consequential damages of any character arising as a
+       result of this License or out of the use or inability to use the
+       Work (including but not limited to damages for loss of goodwill,
+       work stoppage, computer failure or malfunction, or any and all
+       other commercial damages or losses), even if such Contributor
+       has been advised of the possibility of such damages.
+
+    9. Accepting Warranty or Additional Liability. While redistributing
+       the Work or Derivative Works thereof, You may choose to offer,
+       and charge a fee for, acceptance of support, warranty, indemnity,
+       or other liability obligations and/or rights consistent with this
+       License. However, in accepting such obligations, You may act only
+       on Your own behalf and on Your sole responsibility, not on behalf
+       of any other Contributor, and only if You agree to indemnify,
+       defend, and hold each Contributor harmless for any liability
+       incurred by, or claims asserted against, such Contributor by reason
+       of your accepting any such warranty or additional liability.
+
+    END OF TERMS AND CONDITIONS
+
+    APPENDIX: How to apply the Apache License to your work.
+
+       To apply the Apache License to your work, attach the following
+       boilerplate notice, with the fields enclosed by brackets "[]"
+       replaced with your own identifying information. (Don't include
+       the brackets!) The text should be enclosed in the appropriate
+       comment syntax for the file format. We also recommend that a
+       file or class name and description of purpose be included on the
+       same "printed page" as the copyright notice for easier
+       identification within third-party archives.
+
+    Copyright [yyyy] [name of copyright owner]
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
README.md CHANGED
@@ -1,6 +1,6 @@
  ---
  title: Clinical Segnemt
- emoji: 🚑
+ emoji: 🌖
  colorFrom: purple
  colorTo: yellow
  sdk: streamlit
__pycache__/chat.cpython-38.pyc DELETED
Binary file (1.46 kB)

__pycache__/functionforDownloadButtons.cpython-36.pyc ADDED
Binary file (4.54 kB)

__pycache__/functionforDownloadButtons.cpython-38.pyc ADDED
Binary file (4.59 kB)

__pycache__/model.cpython-36.pyc ADDED
Binary file (7.24 kB)

__pycache__/model.cpython-38.pyc ADDED
Binary file (7.26 kB)

__pycache__/model2.cpython-36.pyc ADDED
Binary file (7.04 kB)

__pycache__/run_segbot.cpython-36.pyc ADDED
Binary file (1.9 kB)

__pycache__/run_segbot.cpython-38.pyc ADDED
Binary file (1.9 kB)

__pycache__/solver.cpython-36.pyc ADDED
Binary file (4.81 kB)

__pycache__/solver.cpython-38.pyc ADDED
Binary file (4.83 kB)

__pycache__/solver2.cpython-36.pyc ADDED
Binary file (4.43 kB)
app.py CHANGED
@@ -1,19 +1,122 @@
- from flask import Flask, render_template, request, jsonify
-
- from chat import get_response
-
- app = Flask(__name__)
-
- @app.get("/")
- def index_get():
-     return render_template("base.html")
-
- @app.post("/predict")
- def predict():
-     text = request.get_json().get("message")
-     response = get_response(text)
-     message = {"answer": response}
-     return jsonify(message)
-
- if __name__=="__main__":
-     app.run(debug=True)
+ import streamlit as st
+ import numpy as np
+ from pandas import DataFrame
+ import run_segbot
+ from functionforDownloadButtons import download_button
+ import os
+ import json
+
+ st.set_page_config(
+     page_title="Clinical segment generater",
+     page_icon="🚑",
+     layout="wide"
+ )
+
+
+ def _max_width_():
+     max_width_str = f"max-width: 1400px;"
+     st.markdown(
+         f"""
+         <style>
+         .reportview-container .main .block-container{{
+             {max_width_str}
+         }}
+         </style>
+         """,
+         unsafe_allow_html=True,
+     )
+
+
+ #_max_width_()
+
+ #c30 = st.columns([1,])
+
+ #with c30:
+ #    st.image("logo.png", width=400)
+ st.title("🚑 Clinical segment generater")
+ st.header("")
+
+
+ with st.expander("ℹ️ - About this app", expanded=True):
+
+     st.write(
+         """
+ - The *Clinical segment generater* app is an implementation of [our paper](https://journals.plos.org/digitalhealth/article?id=10.1371/journal.pdig.0000099).
+ - It automatically splits Japanese sentences into smaller units representing medical meanings.
+         """
+     )
+
+     st.markdown("")
+
+ st.markdown("")
+ st.markdown("## 📌 Paste document")
+ @st.cache(allow_output_mutation=True)
+ def model_load():
+     return run_segbot.setup()
+ model,fm,index = model_load()
+ with st.form(key="my_form"):
+
+     ce, c1, ce, c2, c3 = st.columns([0.07, 1, 0.07, 5, 0.07])
+     with c1:
+         ModelType = st.radio(
+             "Choose the method of sentence split",
+             ["fullstop & linebreak (Default)", "pySBD"],
+             help="""
+             At present, you can choose between 2 methods to split your text into sentences.
+
+             The fullstop & linebreak is naive and robust to noise, but has low accuracy.
+             pySBD is more accurate, but more complex and less robust to noise.
+             """,
+         )
+
+         if ModelType == "fullstop & linebreak (Default)":
+             split_method = "fullstop"
+
+         else:
+             split_method = "pySBD"
+
+     with c2:
+         doc = st.text_area(
+             "Paste your text below",
+             height=510,
+         )
+
+     submit_button = st.form_submit_button(label="👍 Go to split!")
+
+
+ if not submit_button:
+     st.stop()
+
+ keywords = run_segbot.generate(doc, model, fm, index, split_method)
+
+
+ st.markdown("## 🎈 Check & download results")
+
+ st.header("")
+
+
+ cs, c1, c2, c3, cLast = st.columns([2, 1.5, 1.5, 1.5, 2])
+
+ with c1:
+     CSVButton2 = download_button(keywords, "Data.csv", "📥 Download (.csv)")
+ with c2:
+     CSVButton2 = download_button(keywords, "Data.txt", "📥 Download (.txt)")
+ with c3:
+     CSVButton2 = download_button(keywords, "Data.json", "📥 Download (.json)")
+
+ st.header("")
+
+ #df = DataFrame(keywords, columns=["Keyword/Keyphrase", "Relevancy"])
+ df = DataFrame(keywords)
+ df.index += 1
+ df.columns = ['Segment']
+ print(df)
+ # Add styling
+
+ #c1, c2, c3 = st.columns([1, 3, 1])
+
+ #with c2:
+ st.table(df)
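The "fullstop & linebreak" option offered in the form above can be sketched as follows. This is a hypothetical illustration only (the function name and regex are assumptions, not the repo's `run_segbot` implementation): cut at line breaks and after every Japanese full stop, keeping the full stop attached to its sentence.

```python
import re

def split_fullstop_linebreak(text):
    """Naive sentence split: cut at line breaks and after each Japanese
    full stop (。). Hypothetical sketch, not the repo's implementation."""
    parts = []
    for line in text.splitlines():
        # "[^。]*。" grabs a sentence ending in 。; "[^。]+$" grabs a trailing
        # fragment with no full stop (e.g. the last line of a note).
        for sent in re.findall(r"[^。]*。|[^。]+$", line):
            if sent.strip():
                parts.append(sent.strip())
    return parts

print(split_fullstop_linebreak("今日は晴れ。明日は雨。\n検査は正常"))
# → ['今日は晴れ。', '明日は雨。', '検査は正常']
```

The pySBD option would instead delegate to `pysbd.Segmenter(language="ja")`, which handles abbreviations and quoted text but, as the help text notes, is less robust to noisy clinical input.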
credata.py ADDED
@@ -0,0 +1,653 @@
1
+ import gensim
2
+ import MeCab
3
+ import pickle
4
+ from gensim.models.wrappers.fasttext import FastText
5
+ #import fasttext as ft
6
+ import random
7
+ import mojimoji
8
+ import numpy as np
9
+ from tqdm import tqdm
10
+
11
+ def ymyi(lis):
12
+ wakati = MeCab.Tagger("-Owakati -b 81920 -u /clwork/ando/SEGBOT/MANBYO_201907_Dic-utf8.dic")
13
+
14
+ with open('fm_space.pickle', 'rb') as f:
15
+ fm = pickle.load(f)
16
+ #model = gensim.models.KeyedVectors.load_word2vec_format("/clwork/ando/SEGBOT/cc.ja.300.vec", binary=False)
17
+ model = ft.load_model("/clwork/ando/SEGBOT/fast/cc.ja.300.bin")
18
+ texts = []
19
+ sent = ""
20
+ sparate = []
21
+ label = []
22
+ ruiseki = 0
23
+ ruiseki2 = 0
24
+ alls = []
25
+ labels, text, num = [], [], []
26
+ for n, line in enumerate(open(lis)):
27
+ line = line.strip("\t").rstrip("\n")
28
+ #print(line)
29
+ if line == "":
30
+ if sent == "":
31
+ continue
32
+ sent = wakati.parse(sent).split(" ")[:-1]
33
+ flag = 0
34
+ for i in sent:
35
+ for j in sparate:
36
+ if ruiseki+len(i) > j and ruiseki < j:
37
+ label.append(1)
38
+ flag = 1
39
+ elif ruiseki+len(i) == j:
40
+ label.append(1)
41
+ flag = 1
42
+ if flag == 0:
43
+ label.append(0)
44
+ flag = 0
45
+ ruiseki += len(i)
46
+ #texts += i + " "
47
+ try:
48
+ texts.append(model[i])
49
+ #texts.append(np.array(fm.vocab[i]))
50
+ #texts += str(fm.vocab[i].index) + " "
51
+ #print(i,str(fm.vocab[i].index))
52
+ except KeyError:
53
+ texts.append(fm["<unk>"])
54
+ label[-1] = 1
55
+ #texts = texts.rstrip() + "\t"
56
+ #texts += " ".join(label) + "\n"
57
+ #alls.append((n,texts,label))
58
+ labels.append(label)
59
+ text.append(texts)
60
+ num.append(n)
61
+ sent = ""
62
+ sparate = []
63
+ texts = []
64
+ label = []
65
+ ruiseki = 0
66
+ ruiseki2 = 0
67
+ continue
68
+ sent += mojimoji.han_to_zen(line, digit=False, ascii=False)
69
+ ruiseki2 += len(line)
70
+ sparate.append(ruiseki2)
71
+ return num,text,labels
72
+
73
+ def nmni(lis):
74
+ #wakati = MeCab.Tagger("-Owakati -b 81920 -u /clwork/ando/SEGBOT/MANBYO_201907_Dic-utf8.dic")
75
+ wakati = MeCab.Tagger("-Owakati -b 81920")
76
+
77
+ with open('fm_space.pickle', 'rb') as f:
78
+ fm = pickle.load(f)
79
+ #model = gensim.models.KeyedVectors.load_word2vec_format("/clwork/ando/SEGBOT/cc.ja.300.vec", binary=False)
80
+ #model = ft.load_model("/clwork/ando/SEGBOT/fast/cc.ja.300.bin")
81
+ texts = []
82
+ sent = ""
83
+ sparate = []
84
+ label = []
85
+ ruiseki = 0
86
+ ruiseki2 = 0
87
+ alls = []
88
+ labels, text, num = [], [], []
89
+ for n, line in enumerate(open(lis)):
90
+ line = line.strip("\t").rstrip("\n")
91
+ #print(line)
92
+ if line == "":
93
+ if sent == "":
94
+ continue
95
+ sent = wakati.parse(sent).split(" ")[:-1]
96
+ flag = 0
97
+ for i in sent:
98
+ for j in sparate:
99
+ if ruiseki+len(i) > j and ruiseki < j:
100
+ label.append(1)
101
+ flag = 1
102
+ elif ruiseki+len(i) == j:
103
+ label.append(1)
104
+ flag = 1
105
+ if flag == 0:
106
+ label.append(0)
107
+ flag = 0
108
+ ruiseki += len(i)
109
+ #texts += i + " "
110
+ try:
111
+ #texts.append(model[i])
112
+ texts.append(fm[i])
113
+ #texts += str(fm.vocab[i].index) + " "
114
+ #print(i,str(fm.vocab[i].index))
115
+ except KeyError:
116
+ texts.append(fm["<unk>"])
117
+ label[-1] = 1
118
+ #texts = texts.rstrip() + "\t"
119
+ #texts += " ".join(label) + "\n"
120
+ #alls.append((n,texts,label))
121
+ labels.append(label)
122
+ text.append(texts)
123
+ num.append(n)
124
+ sent = ""
125
+ sparate = []
126
+ texts = []
127
+ label = []
128
+ ruiseki = 0
129
+ ruiseki2 = 0
130
+ continue
131
+ sent += mojimoji.han_to_zen(line, digit=False, ascii=False)
132
+ ruiseki2 += len(line)
133
+ sparate.append(ruiseki2)
134
+ return num,text,labels
135
+
136
+ def nmni_finetune(lis):
137
+ #wakati = MeCab.Tagger("-Owakati -b 81920 -u /clwork/ando/SEGBOT/MANBYO_201907_Dic-utf8.dic")
138
+ wakati = MeCab.Tagger("-Owakati -b 81920")
139
+ #fm = gensim.models.KeyedVectors.load_word2vec_format('/clwork/ando/SEGBOT/cc.ja.300.vec', binary=False)
140
+ with open('fm.pickle', 'rb') as f:
141
+ fm = pickle.load(f)
142
+ #fm = gensim.models.KeyedVectors.load_word2vec_format('cc.ja.300.vec', binary=False)
143
+ #with open('fm.pickle', 'wb') as f:
144
+ # pickle.dump(fm, f)
145
+ #model = gensim.models.KeyedVectors.load_word2vec_format("/clwork/ando/SEGBOT/cc.ja.300.vec", binary=False)
146
+ #model = ft.load_model("/clwork/ando/SEGBOT/fast/cc.ja.300.bin")
147
+ texts = []
148
+ sent = ""
149
+ sparate = []
150
+ label = []
151
+ ruiseki = 0
152
+ ruiseki2 = 0
153
+ alls = []
154
+ labels, text, num = [], [], []
155
+ for n, line in enumerate(open(lis)):
156
+ line = line.strip("\t").rstrip("\n")
157
+ #print(line)
158
+ if line == "":
159
+ if sent == "":
160
+ continue
161
+ sent = wakati.parse(sent).split(" ")[:-1]
162
+ flag = 0
163
+ for i in sent:
164
+ for j in sparate:
165
+ if ruiseki+len(i) > j and ruiseki < j:
166
+ label.append(1)
167
+ flag = 1
168
+ elif ruiseki+len(i) == j:
169
+ label.append(1)
170
+ flag = 1
171
+ if flag == 0:
172
+ label.append(0)
173
+ flag = 0
174
+ ruiseki += len(i)
175
+ #texts += i + " "
176
+ try:
177
+ #texts.append(model[i])
178
+ #texts.append(fm[i])
179
+ texts.append(fm.vocab[i].index)
180
+ #print(i,str(fm.vocab[i].index))
181
+ except KeyError:
182
+ texts.append(fm.vocab["<unk>"].index)
183
+ label[-1] = 1
184
+ #texts = texts.rstrip() + "\t"
185
+ #texts += " ".join(label) + "\n"
186
+ #alls.append((n,texts,label))
187
+ labels.append(np.array(label))
188
+ text.append(np.array(texts))
189
+ num.append(n)
190
+ sent = ""
191
+ sparate = []
192
+ texts = []
193
+ label = []
194
+ ruiseki = 0
195
+ ruiseki2 = 0
196
+ continue
197
+ sent += mojimoji.han_to_zen(line, digit=False, ascii=False)
198
+ ruiseki2 += len(line)
199
+ sparate.append(ruiseki2)
200
+ return text,labels
201
+
202
+
203
+
204
+ def nmni_carte(lis):
205
+ #wakati = MeCab.Tagger("-Owakati -b 81920 -u /clwork/ando/SEGBOT/MANBYO_201907_Dic-utf8.dic")
206
+ wakati = MeCab.Tagger("-Owakati -b 81920")
207
+ #fm = gensim.models.KeyedVectors.load_word2vec_format('/clwork/ando/SEGBOT/cc.ja.300.vec', binary=False)
208
+ #fm = gensim.models.KeyedVectors.load_word2vec_format('cc.ja.300.vec', binary=False)
209
+ #with open('fm.pickle', 'wb') as f:
210
+ # pickle.dump(fm, f)
211
+ #model = gensim.models.KeyedVectors.load_word2vec_format("/clwork/ando/SEGBOT/cc.ja.300.vec", binary=False)
212
+ #model = ft.load_model("/clwork/ando/SEGBOT/fast/cc.ja.300.bin")
213
+ with open('fm.pickle', 'rb') as f:
214
+ fm = pickle.load(f)
215
+ texts = []
216
+ sent = ""
217
+ sparate = []
218
+ label = []
219
+ ruiseki = 0
220
+ ruiseki2 = 0
221
+ alls = []
222
+ labels, text, num = [], [], []
223
+ allab, altex, fukugenss = [], [], []
224
+ #for n in tqdm(range(26431)):
225
+ for n in tqdm(range(108)):
226
+ fukugens = []
227
+ for line in open(lis+str(n)+".txt"):
228
+ line = line.strip()
229
+ if line == "":
230
+ continue
231
+ sent = wakati.parse(line).split(" ")[:-1]
232
+ flag = 0
233
+ label = []
234
+ texts = []
235
+ fukugen = []
236
+ for i in sent:
237
+ try:
238
+ texts.append(fm.vocab[i].index)
239
+ except KeyError:
240
+ texts.append(fm.vocab["<unk>"].index)
241
+ fukugen.append(i)
242
+ label.append(0)
243
+ label[-1] = 1
244
+ labels.append(np.array(label))
245
+ text.append(np.array(texts))
246
+ #labels.append(label)
247
+ #text.append(texts)
248
+ fukugens.append(fukugen)
249
+ allab.append(labels)
250
+ altex.append(text)
251
+ fukugenss.append(fukugens)
252
+ labels, text, fukugens= [], [], []
253
+ return altex, allab, fukugenss
254
+
255
+
256
+ def nmni_finetune_s(lis):
257
+ #wakati = MeCab.Tagger("-Owakati -b 81920 -u /clwork/ando/SEGBOT/MANBYO_201907_Dic-utf8.dic")
258
+ wakati = MeCab.Tagger("-Owakati -b 81920")
259
+ #fm = gensim.models.KeyedVectors.load_word2vec_format('/clwork/ando/SEGBOT/cc.ja.300.vec', binary=False)
260
+ fm = gensim.models.KeyedVectors.load_word2vec_format('cc.ja.300.vec', binary=False)
261
+ with open('fm.pickle', 'wb') as f:
262
+ pickle.dump(fm, f)
263
+ #model = gensim.models.KeyedVectors.load_word2vec_format("/clwork/ando/SEGBOT/cc.ja.300.vec", binary=False)
264
+ #model = ft.load_model("/clwork/ando/SEGBOT/fast/cc.ja.300.bin")
265
+ texts = []
266
+ sent = ""
267
+ sparate = []
268
+ label = []
269
+ ruiseki = 0
270
+ ruiseki2 = 0
271
+ alls = []
272
+ labels, text, num = [], [], []
273
+ for n, line in enumerate(open(lis)):
274
+ line = line.strip("\t").rstrip("\n")
275
+ sent = wakati.parse(line).split(" ")[:-1]
276
+ flag = 0
277
+ label = []
278
+ texts = []
279
+ for i in sent:
280
+ try:
281
+ texts.append(fm.vocab[i].index)
282
+ except KeyError:
283
+ texts.append(fm.vocab["<unk>"].index)
284
+ label.append(0)
285
+ label[-1] = 1
286
+ labels.append(np.array(label))
287
+ text.append(np.array(texts))
288
+ return text,labels
289
+
290
+
291
+ def nmni_finetune_ss(lis):
292
+ #wakati = MeCab.Tagger("-Owakati -b 81920 -u /clwork/ando/SEGBOT/MANBYO_201907_Dic-utf8.dic")
293
+ wakati = MeCab.Tagger("-Owakati -b 81920")
294
+ fm = gensim.models.KeyedVectors.load_word2vec_format('cc.ja.300.vec', binary=False)
295
+ with open('fm.pickle', 'wb') as f:
296
+ pickle.dump(fm, f)
297
+ #with open('fm.pickle', 'rb') as f:
298
+ # fm = pickle.load(f)
299
+ #model = gensim.models.KeyedVectors.load_word2vec_format("/clwork/ando/SEGBOT/cc.ja.300.vec", binary=False)
300
+ #model = ft.load_model("/clwork/ando/SEGBOT/fast/cc.ja.300.bin")
301
+ t,l =[],[]
302
+ for i in range(108):
303
+ texts = []
304
+ sent = ""
305
+ sparate = []
306
+ label = []
307
+ ruiseki = 0
308
+ ruiseki2 = 0
309
+ alls = []
310
+ labels, text, num = [], [], []
311
+ for n, line in enumerate(open(lis+str(i)+".txt")):
312
+ line = line.strip("\t").rstrip("\n")
313
+ if line == "":
314
+ continue
315
+ sent = wakati.parse(line).split(" ")[:-1]
316
+ flag = 0
317
+ label = []
318
+ texts = []
319
+ for i in sent:
320
+ try:
321
+ texts.append(fm.vocab[i].index)
322
+ except KeyError:
323
+ texts.append(fm.vocab["<unk>"].index)
324
+ label.append(0)
325
+ label[-1] = 1
326
+ labels.append(np.array(label))
327
+ text.append(np.array(texts))
328
+ t.append(text)
329
+ l.append(labels)
330
+ return t,l
331
+
332
+ #model = ft.load_model("/clwork/ando/SEGBOT/fast/cc.ja.300.bin")
333
+ #print(model.get_subwords("間質性肺炎"))
334
+ #print(model.get_subwords("誤嚥性肺炎"))
335
+ #print(model.get_subwords("談話ユニット分割"))
336
+
337
+ """
338
+ texts = []
339
+ sent = ""
340
+ sparate = []
341
+ label = []
342
+ ruiseki = 0
343
+ ruiseki2 = 0
344
+ alls = []
345
+ for n, line in enumerate(open("/clwork/ando/SEGBOT/randomdata.tsv")):
346
+ line = line.strip("\t").rstrip("\n")
347
+ if line == "":
348
+ if sent == "":
349
+ continue
350
+ alls.append(sent)
351
+ sent = ""
352
+ continue
353
+ else:
354
+ sent += line
355
+ if len(sent) != 0:
356
+ alls.append(sent)
357
+ random.shuffle(alls)
358
+ #v = random.sample(alls, 300)
359
+ #for i in v:
360
+ # alls.remove(i)
361
+ #t = random.sample(alls, 300)
362
+ #for i in t:
363
+ # alls.remove(i)
364
+ with open("randomdata_concat.tsv","a")as f:
365
+ f.write("\n".join())
366
+ #with open("dev_fix.tsv","a")as f:
367
+ # for i in v:
368
+ # f.write("\n".join(i))
369
+ # f.write("\n\n")
370
+ #with open("test_fix.tsv","a")as f:
371
+ # for i in t:
372
+ # f.write("\n".join(i))
373
+ # f.write("\n\n")
374
+ """
375
+
376
+ """
377
+ out = ""
378
+ for line in open("/clwork/ando/SEGBOT_BERT/alldata2_bert.tsv"):
379
+ line = line.split("\t")
380
+ line = line[0].strip()
381
+ if line == "" or "サマリ" in line:
382
+ continue
383
+ out += line + "\n"
384
+ with open("alldata3.tsv","w")as f:
385
+ f.write(out)
386
+ """
387
+ """
388
+ #wakati = MeCab.Tagger("-Owakati -b 81920 -u /clwork/ando/SEGBOT/MANBYO_201907_Dic-utf8.dic")
389
+ wakati = MeCab.Tagger("-Owakati -b 81920")
390
+
391
+ with open('fm_space.pickle', 'rb') as f:
392
+ fm = pickle.load(f)
393
+ #model = gensim.models.KeyedVectors.load_word2vec_format("/clwork/ando/SEGBOT/cc.ja.300.vec", binary=False)
394
+ #model = ft.load_model("/clwork/ando/SEGBOT/fast/cc.ja.300.bin")
395
+ texts = []
396
+ sent = ""
397
+ sparate = []
398
+ label = []
399
+ ruiseki = 0
400
+ ruiseki2 = 0
401
+ alls = []
402
+ for n, line in enumerate(open("/clwork/ando/SEGBOT/train_fix.tsv")):
403
+ line = line.strip("\t").rstrip("\n")
404
+ #print(line)
405
+ if line == "":
406
+ if sent == "":
407
+ continue
408
+ sent = wakati.parse(sent).split(" ")[:-1]
409
+ flag = 0
410
+ for i in sent:
411
+ for j in sparate:
412
+ if ruiseki+len(i) > j and ruiseki < j:
413
+ label.append(1)
414
+ flag = 1
415
+ elif ruiseki+len(i) == j:
416
+ label.append(1)
417
+ flag = 1
418
+ if flag == 0:
419
+ label.append(0)
420
+ flag = 0
421
+ ruiseki += len(i)
422
+ #texts += i + " "
423
+ try:
424
+ #texts.append(model[i])
425
+ texts.append(fm.vocab[i])
426
+ #texts += str(fm.vocab[i].index) + " "
427
+ #print(i,str(fm.vocab[i].index))
428
+ except KeyError:
429
+ texts.append(fm.vocab["<unk>"])
430
+ print(i)
431
+ label[-1] = 1
432
+ #texts = texts.rstrip() + "\t"
433
+ #texts += " ".join(label) + "\n"
434
+ alls.append((str(n),texts,label))
435
+ sent = ""
436
+ sparate = []
437
+ texts = []
438
+ label = []
439
+ ruiseki = 0
440
+ ruiseki2 = 0
441
+ continue
442
+ sent += mojimoji.han_to_zen(line, digit=False, ascii=False)
443
+ ruiseki2 += len(line)
444
+ sparate.append(ruiseki2)
445
+ with open('nm_ni/train.pickle', 'wb') as f:
446
+ pickle.dump(alls, f)
447
+ #print(alls)
448
+ #with open("resepdata_seped.tsv","w")as f:
449
+ # f.write(texts)
+ """
+ wakati = MeCab.Tagger("-Owakati")
+ 
+ #fm = gensim.models.KeyedVectors.load_word2vec_format('/clwork/ando/SEGBOT/cc.ja.300.vec', binary=False)
+ #with open('fm.pickle', 'wb') as f:
+ #    pickle.dump(fm, f)
+ texts = ""
+ sent = ""
+ sparate = []
+ label = []
+ ruiseki = 0
+ ruiseki2 = 0
+ for line in open("alldata.tsv"):
+     line = line.split("\t")
+     line = line[0].strip()
+     if line == "" or "サマリ" in line:
+         if sent == "":
+             continue
+         sent = wakati.parse(sent).split(" ")[:-1]
+         flag = 0
+         #print(sent,sparate)
+         for i in sent:
+             #print(i)
+             for j in sparate:
+                 if ruiseki+len(i) > j and ruiseki < j:
+                     #print(j)
+                     label.append("1")
+                     flag = 1
+                 elif ruiseki+len(i) == j:
+                     #print(j)
+                     label.append("1")
+                     flag = 1
+             if flag == 0:
+                 label.append("0")
+             flag = 0
+             ruiseki += len(i)
+             #texts += i + " "
+             try:
+                 texts += str(0) + " "
+             except KeyError:
+                 print(i)
+                 #texts += str(fm.vocab["<unk>"].index) + " "
+         label[-1] = "1"
+         texts = texts.rstrip() + "\t"
+         texts += " ".join(label) + "\n"
+         sent = ""
+         sparate = []
+         label = []
+         ruiseki = 0
+         ruiseki2 = 0
+         #print(texts)
+         continue
+     sent += line.strip()
+     ruiseki2 += len(line.strip())
+     sparate.append(ruiseki2)
+ with open("random_labbeled.tsv","w") as f:
+     f.write(texts)
+ """
+ wakati = MeCab.Tagger("-Owakati -u /clwork/ando/SEGBOT/MANBYO_201907_Dic-utf8.dic")
+ 
+ #fm = gensim.models.KeyedVectors.load_word2vec_format('/clwork/ando/SEGBOT/cc.ja.300_space.vec', binary=False)
+ #with open('fm_space.pickle', 'wb') as f:
+ #    pickle.dump(fm, f)
+ with open('fm_space.pickle', 'rb') as f:
+     fm = pickle.load(f)
+ texts = ""
+ sent = ""
+ sparate = []
+ label = []
+ ruiseki = 0
+ ruiseki2 = 0
+ for line in open("/clwork/ando/SEGBOT/alldata_resep.tsv"):
+     line = line.split("\t")
+     line = line[0].strip("\t").rstrip("\n")
+     #print(line)
+     if line == "" or "サマリ" in line:
+         if sent == "":
+             continue
+         print(sent)
+         sent = sent.replace(" ","<space>")
+         sent = wakati.parse(sent).split(" ")[:-1]
+         print(sent)
+         flag = 0
+         #print(sent,sparate)
+         for i in sent:
+             #print(i)
+             for j in sparate:
+                 if ruiseki+len(i) > j and ruiseki < j:
+                     #print(j)
+                     label.append("1")
+                     flag = 1
+                 elif ruiseki+len(i) == j:
+                     #print(j)
+                     label.append("1")
+                     flag = 1
+             if flag == 0:
+                 label.append("0")
+             flag = 0
+             ruiseki += len(i)
+             #texts += i + " "
+             try:
+                 texts += str(fm.vocab[i].index) + " "
+                 #print(i,str(fm.vocab[i].index))
+             except KeyError:
+                 texts += str(fm.vocab["<unk>"].index) + " "
+         label[-1] = "1"
+         texts = texts.rstrip() + "\t"
+         texts += " ".join(label) + "\n"
+         sent = ""
+         sparate = []
+         label = []
+         ruiseki = 0
+         ruiseki2 = 0
+         #print(texts)
+         continue
+     sent += line.strip("\t")
+     ruiseki2 += len(line)
+     sparate.append(ruiseki2)
+ with open("alldata2_space.tsv","w") as f:
+     f.write(texts)
+ """
+ 
+ """
+ wakati = MeCab.Tagger("-Owakati")
+ 
+ fm = gensim.models.KeyedVectors.load_word2vec_format('/clwork/ando/SEGBOT/cc.ja.300.vec', binary=False)
+ texts = ""
+ sent = ""
+ cand = ""
+ sparate = []
+ label = []
+ ruiseki = 0
+ ruiseki2 = 0
+ flag2 = 1
+ for line in open("data2.tsv"):
+     line = line.split("\t")
+     if flag2 == 1:
+         cand = line
+         flag2 = 2
+         continue
+     if flag2 == 2:
+         flag2 = 1
+         #print(line,cand)
+         for n,z in enumerate(zip(cand,line)):
+             i = z[0]
+             j = z[1]
+             n = n+1
+             if i == "":
+                 sent = wakati.parse(sent).split(" ")[:-1]
+                 flag = 0
+                 #print(sent,sparate)
+                 for i in sent:
+                     #print(i)
+                     for j in sparate:
+                         if ruiseki+len(i) > j and ruiseki < j:
+                             #print(j)
+                             label.append("1")
+                             flag = 1
+                         elif ruiseki+len(i) == j:
+                             #print(j)
+                             label.append("1")
+                             flag = 1
+                     if flag == 0:
+                         label.append("0")
+                     flag = 0
+                     ruiseki += len(i)
+                     #texts += i + " "
+                     try:
+                         texts += str(fm.vocab[i].index) + " "
+                     except KeyError:
+                         texts += str(fm.vocab["<unk>"].index) + " "
+                 label[-1] = "1"
+                 texts = texts.rstrip() + "\t"
+                 texts += " ".join(label) + "\n"
+                 sent = ""
+                 sparate = []
+                 label = []
+                 ruiseki = 0
+                 ruiseki2 = 0
+                 #print(texts)
+                 break
+             if j == "|":
+                 sparate.append(n)
+             sent += i
+ with open("alldata.tsv","w") as f:
+     f.write(texts)
+ """
fm.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f4c02d5957824106f6217e9a56d89ee5b7ca9ae399c7a49af8dc062e1ea0be99
+ size 2521658187
functionforDownloadButtons.py ADDED
@@ -0,0 +1,171 @@
+ import streamlit as st
+ import pickle
+ import pandas as pd
+ import json
+ import base64
+ import math
+ import uuid
+ import re
+ 
+ import importlib.util
+ 
+ import jupytext
+ from bokeh.models.widgets import Div
+ 
+ 
+ def import_from_file(module_name: str, filepath: str):
+     """
+     Imports a module from file.
+ 
+     Args:
+         module_name (str): Assigned to the module's __name__ parameter (does not
+             influence how the module is named outside of this function)
+         filepath (str): Path to the .py file
+ 
+     Returns:
+         The module
+     """
+     spec = importlib.util.spec_from_file_location(module_name, filepath)
+     module = importlib.util.module_from_spec(spec)
+     spec.loader.exec_module(module)
+     return module
+ 
+ 
+ def notebook_header(text):
+     """
+     Insert section header into a jinja file, formatted as notebook cell.
+ 
+     Leave 2 blank lines before the header.
+     """
+     return f"""# # {text}
+ 
+ """
+ 
+ 
+ def code_header(text):
+     """
+     Insert section header into a jinja file, formatted as Python comment.
+ 
+     Leave 2 blank lines before the header.
+     """
+     seperator_len = (75 - len(text)) / 2
+     seperator_len_left = math.floor(seperator_len)
+     seperator_len_right = math.ceil(seperator_len)
+     return f"# {'-' * seperator_len_left} {text} {'-' * seperator_len_right}"
+ 
+ 
+ def to_notebook(code):
+     """Converts Python code to Jupyter notebook format."""
+     notebook = jupytext.reads(code, fmt="py")
+     return jupytext.writes(notebook, fmt="ipynb")
+ 
+ 
+ def open_link(url, new_tab=True):
+     """Dirty hack to open a new web page with a streamlit button."""
+     # From: https://discuss.streamlit.io/t/how-to-link-a-button-to-a-webpage/1661/3
+     if new_tab:
+         js = f"window.open('{url}')"  # New tab or window
+     else:
+         js = f"window.location.href = '{url}'"  # Current tab
+     html = '<img src onerror="{}">'.format(js)
+     div = Div(text=html)
+     st.bokeh_chart(div)
+ 
+ 
+ def download_button(object_to_download, download_filename, button_text):
+     """
+     Generates a link to download the given object_to_download.
+ 
+     From: https://discuss.streamlit.io/t/a-download-button-with-custom-css/4220
+ 
+     Params:
+     ------
+     object_to_download: The object to be downloaded.
+     download_filename (str): filename and extension of file. e.g. mydata.csv,
+         some_txt_output.txt
+     download_link_text (str): Text to display for download link.
+     button_text (str): Text to display on download button (e.g. 'click here to download file')
+     pickle_it (bool): If True, pickle file.
+ 
+     Returns:
+     -------
+     (str): the anchor tag to download object_to_download
+ 
+     Examples:
+     --------
+     download_link(your_df, 'YOUR_DF.csv', 'Click to download data!')
+     download_link(your_str, 'YOUR_STRING.txt', 'Click to download text!')
+     """
+     # if pickle_it:
+     #     try:
+     #         object_to_download = pickle.dumps(object_to_download)
+     #     except pickle.PicklingError as e:
+     #         st.write(e)
+     #         return None
+ 
+     if isinstance(object_to_download, bytes):
+         pass
+     elif isinstance(object_to_download, pd.DataFrame):
+         object_to_download = object_to_download.to_csv(index=False)
+     # Try JSON encode for everything else
+     else:
+         object_to_download = json.dumps(object_to_download)
+ 
+     try:
+         # some strings <-> bytes conversions necessary here
+         b64 = base64.b64encode(object_to_download.encode()).decode()
+     except AttributeError:
+         b64 = base64.b64encode(object_to_download).decode()
+ 
+     button_uuid = str(uuid.uuid4()).replace("-", "")
+     button_id = re.sub(r"\d+", "", button_uuid)
+ 
+     custom_css = f"""
+         <style>
+             #{button_id} {{
+                 display: inline-flex;
+                 align-items: center;
+                 justify-content: center;
+                 background-color: rgb(255, 255, 255);
+                 color: rgb(38, 39, 48);
+                 padding: .25rem .75rem;
+                 position: relative;
+                 text-decoration: none;
+                 border-radius: 4px;
+                 border-width: 1px;
+                 border-style: solid;
+                 border-color: rgb(230, 234, 241);
+                 border-image: initial;
+             }}
+             #{button_id}:hover {{
+                 border-color: rgb(246, 51, 102);
+                 color: rgb(246, 51, 102);
+             }}
+             #{button_id}:active {{
+                 box-shadow: none;
+                 background-color: rgb(246, 51, 102);
+                 color: white;
+             }}
+         </style> """
+ 
+     dl_link = (
+         custom_css
+         + f'<a download="{download_filename}" id="{button_id}" href="data:file/txt;base64,{b64}">{button_text}</a><br><br>'
+     )
+     # dl_link = f'<a download="{download_filename}" id="{button_id}" href="data:file/txt;base64,{b64}"><input type="button" kind="primary" value="{button_text}"></a><br></br>'
+ 
+     st.markdown(dl_link, unsafe_allow_html=True)
+ 
+ 
+ # def download_link(
+ #     content, label="Download", filename="file.txt", mimetype="text/plain"
+ # ):
+ #     """Create a HTML link to download a string as a file."""
+ #     # From: https://discuss.streamlit.io/t/how-to-download-file-in-streamlit/1806/9
+ #     b64 = base64.b64encode(
+ #         content.encode()
+ #     ).decode()  # some strings <-> bytes conversions necessary here
+ #     href = (
+ #         f'<a href="data:{mimetype};base64,{b64}" download="(unknown)">{label}</a>'
+ #     )
+ #     return href
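The core of `download_button` is the data-URL trick: base64-encode the payload and embed it directly in an `<a download>` anchor's `href`, so no server round-trip is needed. A minimal, framework-free sketch of just that step (`make_download_link` is a hypothetical helper, not part of this file):

```python
import base64

def make_download_link(text: str, filename: str, label: str) -> str:
    # Encode the payload so it can live inside the href itself.
    b64 = base64.b64encode(text.encode()).decode()
    return f'<a download="{filename}" href="data:file/txt;base64,{b64}">{label}</a>'

print(make_download_link("hello", "hello.txt", "Download"))
```

`download_button` wraps the same anchor in per-button CSS (keyed on a UUID-derived `id`) and hands it to `st.markdown` with `unsafe_allow_html=True`.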
logo.png ADDED
model.py ADDED
@@ -0,0 +1,465 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.utils.rnn as R
+ import torch.nn.functional as F
+ from torch.autograd import Variable
+ import numpy as np
+ 
+ 
+ class PointerNetworks(nn.Module):
+     def __init__(self, voca_size, voc_embeddings, word_dim, hidden_dim, is_bi_encoder_rnn, rnn_type, rnn_layers,
+                  dropout_prob, use_cuda, finedtuning, isbanor, batchsize):
+         super(PointerNetworks, self).__init__()
+ 
+         self.word_dim = word_dim
+         self.voca_size = voca_size
+         self.hidden_dim = hidden_dim
+         self.dropout_prob = dropout_prob
+         self.is_bi_encoder_rnn = is_bi_encoder_rnn
+         self.num_rnn_layers = rnn_layers
+         self.rnn_type = rnn_type
+         self.voc_embeddings = voc_embeddings
+         self.finedtuning = finedtuning
+         self.batchsize = batchsize
+ 
+         self.nnDropout = nn.Dropout(dropout_prob)
+         self.isbanor = isbanor
+ 
+         if rnn_type in ['LSTM', 'GRU']:
+             self.decoder_rnn = getattr(nn, rnn_type)(input_size=word_dim,
+                                                      hidden_size=2 * hidden_dim if is_bi_encoder_rnn else hidden_dim,
+                                                      num_layers=rnn_layers,
+                                                      dropout=dropout_prob,
+                                                      batch_first=True)
+             self.encoder_rnn = getattr(nn, rnn_type)(input_size=word_dim,
+                                                      hidden_size=hidden_dim,
+                                                      num_layers=rnn_layers,
+                                                      bidirectional=is_bi_encoder_rnn,
+                                                      dropout=dropout_prob,
+                                                      batch_first=True)
+         else:
+             print('rnn_type should be LSTM or GRU')
+ 
+         self.use_cuda = True
+ 
+         self.nnSELU = nn.SELU()
+ 
+         self.nnEm = nn.Embedding(self.voca_size, self.word_dim, padding_idx=2000001)
+         #self.nnEm = nn.Embedding.from_pretrained(self.voc_embeddings, freeze=self.finedtuning, padding_idx=-1)
+         self.initEmbeddings(self.voc_embeddings)
+         if self.use_cuda:
+             self.nnEm = self.nnEm.cuda()
+ 
+         if self.is_bi_encoder_rnn:
+             self.num_encoder_bi = 2
+         else:
+             self.num_encoder_bi = 1
+ 
+         self.nnW1 = nn.Linear(self.num_encoder_bi * hidden_dim, self.num_encoder_bi * hidden_dim, bias=False)
+         self.nnW2 = nn.Linear(self.num_encoder_bi * hidden_dim, self.num_encoder_bi * hidden_dim, bias=False)
+         self.nnV = nn.Linear(self.num_encoder_bi * hidden_dim, 1, bias=False)
+ 
+     def initEmbeddings(self, weights):
+         self.nnEm.weight.data.copy_(torch.from_numpy(weights))
+         self.nnEm.weight.requires_grad = self.finedtuning
+ 
+     def initHidden(self, hsize, batchsize):
+         #hsize = self.hidden_dim
+         #batchsize = self.batchsize
+         if self.rnn_type == 'LSTM':
+             h_0 = Variable(torch.zeros(self.num_encoder_bi * self.num_rnn_layers, batchsize, hsize))
+             c_0 = Variable(torch.zeros(self.num_encoder_bi * self.num_rnn_layers, batchsize, hsize))
+             if self.use_cuda:
+                 h_0 = h_0.cuda()
+                 c_0 = c_0.cuda()
+             return (h_0, c_0)
+         else:
+             h_0 = Variable(torch.zeros(self.num_encoder_bi * self.num_rnn_layers, batchsize, hsize))
+             if self.use_cuda:
+                 h_0 = h_0.cuda()
+             return h_0
+ 
+     def _run_rnn_packed(self, cell, x, x_lens, h=None):
+         #print(x_lens)
+         x_packed = R.pack_padded_sequence(x, x_lens.data.tolist(),
+                                           batch_first=True, enforce_sorted=False)
+         if h is not None:
+             output, h = cell(x_packed, h)
+         else:
+             output, h = cell(x_packed)
+         output, _ = R.pad_packed_sequence(output, batch_first=True)
+         return output, h
+ 
+     def pointerEncoder(self, Xin, lens):
+         self.bn_inputdata = nn.BatchNorm1d(self.word_dim, affine=False, track_running_stats=False)
+ 
+         batch_size, maxL = Xin.size()
+         X = self.nnEm(Xin)  # N L C
+ 
+         if self.isbanor and maxL > 1:
+             X = X.permute(0, 2, 1)  # N C L
+             X = self.bn_inputdata(X)
+             X = X.permute(0, 2, 1)  # N L C
+ 
+         X = self.nnDropout(X)
+ 
+         encoder_lstm_co_h_o = self.initHidden(self.hidden_dim, batch_size)
+         o, h = self._run_rnn_packed(self.encoder_rnn, X, lens, encoder_lstm_co_h_o)  # batch_first=True
+         o = o.contiguous()
+         o = self.nnDropout(o)
+ 
+         return o, h
+ 
+     def pointerLayer(self, en, di):
+         """
+         :param en: [L,H]
+         :param di: [H,]
+         :return:
+         """
+         WE = self.nnW1(en)
+         exdi = di.expand_as(en)
+         WD = self.nnW2(exdi)
+ 
+         nnV = self.nnV(self.nnSELU(WE + WD))
+         nnV = nnV.permute(1, 0)
+         nnV = self.nnSELU(nnV)
+ 
+         #TODO: for log loss
+         att_weights = F.softmax(nnV, dim=1)
+         logits = F.log_softmax(nnV, dim=1)
+ 
+         return logits, att_weights
+ 
+     def training_decoder(self, hn, hend, X, Xindex, Yindex, lens):
+         loss_function = nn.NLLLoss()
+         batch_loss = 0
+         LoopN = 0
+         batch_size = len(lens)
+         for i in range(len(lens)):  # Loop batch size
+             curX_index = Xindex[i]
+             #print(curX_index)
+             curY_index = Yindex[i]
+             curL = lens[i]
+             curX = X[i]
+             #print(curX)
+ 
+             x_index_var = Variable(torch.from_numpy(curX_index.astype(np.int64)))
+             if self.use_cuda:
+                 x_index_var = x_index_var.cuda()
+             cur_lookup = curX[x_index_var]
+             #print(cur_lookup)
+ 
+             curX_vectors = self.nnEm(cur_lookup)  # output: [seq,features]
+             curX_vectors = curX_vectors.unsqueeze(0)  # [batch, seq, features]
+ 
+             if self.rnn_type == 'LSTM':  # need h_end,c_end
+                 h_end = hend[0].permute(1, 0, 2).contiguous().view(batch_size, self.num_rnn_layers, -1)
+                 c_end = hend[1].permute(1, 0, 2).contiguous().view(batch_size, self.num_rnn_layers, -1)
+                 curh0 = h_end[i].unsqueeze(0).permute(1, 0, 2)
+                 curc0 = c_end[i].unsqueeze(0).permute(1, 0, 2)
+                 h_pass = (curh0, curc0)
+             else:
+                 h_end = hend.permute(1, 0, 2).contiguous().view(batch_size, self.num_rnn_layers, -1)
+                 curh0 = h_end[i].unsqueeze(0).permute(1, 0, 2)
+                 h_pass = curh0
+ 
+             decoder_out, _ = self.decoder_rnn(curX_vectors, h_pass)
+             decoder_out = decoder_out.squeeze(0)  # [seq,features]
+ 
+             curencoder_hn = hn[i, 0:curL, :]  # hn[batch,seq,H] --> [seq,H], i is loop batch size
+ 
+             for j in range(len(decoder_out)):  # Loop di
+                 #print(len(decoder_out),curY_index)
+                 cur_dj = decoder_out[j]
+                 cur_groundy = curY_index[j]
+                 cur_start_index = curX_index[j]
+                 predict_range = list(range(cur_start_index, curL))
+ 
+                 # TODO: make it point backward, only consider predict_range in current time step
+                 # align groundtruth
+                 cur_groundy_var = Variable(torch.LongTensor([int(cur_groundy) - int(cur_start_index)]))
+                 if self.use_cuda:
+                     cur_groundy_var = cur_groundy_var.cuda()
+ 
+                 curencoder_hn_back = curencoder_hn[predict_range, :]
+                 cur_logists, cur_weights = self.pointerLayer(curencoder_hn_back, cur_dj)
+ 
+                 batch_loss = batch_loss + loss_function(cur_logists, cur_groundy_var)
+                 LoopN = LoopN + 1
+ 
+         batch_loss = batch_loss / LoopN
+         return batch_loss
+ 
+     def neg_log_likelihood(self, Xin, index_decoder_x, index_decoder_y, lens):
+         '''
+         :param Xin: stack_x, [allseq,wordDim]
+         :param Yin:
+         :param lens:
+         :return:
+         '''
+         encoder_hn, encoder_h_end = self.pointerEncoder(Xin, lens)
+         loss = self.training_decoder(encoder_hn, encoder_h_end, Xin, index_decoder_x, index_decoder_y, lens)
+         return loss
+ 
+     def test_decoder(self, hn, hend, X, Yindex, lens):
+         loss_function = nn.NLLLoss()
+         batch_loss = 0
+         LoopN = 0
+ 
+         batch_boundary = []
+         batch_boundary_start = []
+         batch_align_matrix = []
+ 
+         batch_size = len(lens)
+ 
+         for i in range(len(lens)):  # Loop batch size
+             curL = lens[i]
+             curY_index = Yindex[i]
+             curX = X[i]
+             cur_end_boundary = curY_index[-1]
+ 
+             cur_boundary = []
+             cur_b_start = []
+             cur_align_matrix = []
+ 
+             cur_sentence_vectors = self.nnEm(curX)  # output: [seq,features]
+ 
+             if self.rnn_type == 'LSTM':  # need h_end,c_end
+                 h_end = hend[0].permute(1, 0, 2).contiguous().view(batch_size, self.num_rnn_layers, -1)
+                 c_end = hend[1].permute(1, 0, 2).contiguous().view(batch_size, self.num_rnn_layers, -1)
+                 curh0 = h_end[i].unsqueeze(0).permute(1, 0, 2)
+                 curc0 = c_end[i].unsqueeze(0).permute(1, 0, 2)
+                 h_pass = (curh0, curc0)
+             else:  # only need h_end
+                 h_end = hend.permute(1, 0, 2).contiguous().view(batch_size, self.num_rnn_layers, -1)
+                 curh0 = h_end[i].unsqueeze(0).permute(1, 0, 2)
+                 h_pass = curh0
+ 
+             curencoder_hn = hn[i, 0:curL, :]  # hn[batch,seq,H] --> [seq,H], i is loop batch size
+ 
+             Not_break = True
+             loop_in = cur_sentence_vectors[0, :].unsqueeze(0).unsqueeze(0)  # [1,1,H]
+             loop_hc = h_pass
+ 
+             loopstart = 0
+             loop_j = 0
+             while (Not_break):  # if not end
+                 loop_o, loop_hc = self.decoder_rnn(loop_in, loop_hc)
+ 
+                 #TODO: make it point backward
+                 predict_range = list(range(loopstart, curL))
+                 curencoder_hn_back = curencoder_hn[predict_range, :]
+                 cur_logists, cur_weights = self.pointerLayer(curencoder_hn_back, loop_o.squeeze(0).squeeze(0))
+ 
+                 cur_align_vector = np.zeros(curL)
+                 cur_align_vector[predict_range] = cur_weights.data.cpu().numpy()[0]
+                 cur_align_matrix.append(cur_align_vector)
+ 
+                 #TODO: align groundtruth
+                 if loop_j > len(curY_index) - 1:
+                     cur_groundy = curY_index[-1]
+                 else:
+                     cur_groundy = curY_index[loop_j]
+ 
+                 cur_groundy_var = Variable(torch.LongTensor([max(0, int(cur_groundy) - loopstart)]))
+                 if self.use_cuda:
+                     cur_groundy_var = cur_groundy_var.cuda()
+ 
+                 batch_loss = batch_loss + loss_function(cur_logists, cur_groundy_var)
+ 
+                 #TODO: get predicted boundary
+                 topv, topi = cur_logists.data.topk(1)
+                 pred_index = topi[0][0]
+ 
+                 #TODO: align pred_index to original seq
+                 ori_pred_index = pred_index + loopstart
+ 
+                 if cur_end_boundary == ori_pred_index:
+                     cur_boundary.append(ori_pred_index)
+                     cur_b_start.append(loopstart)
+                     Not_break = False
+                     loop_j = loop_j + 1
+                     LoopN = LoopN + 1
+                     break
+                 else:
+                     cur_boundary.append(ori_pred_index)
+                     loop_in = cur_sentence_vectors[ori_pred_index + 1, :].unsqueeze(0).unsqueeze(0)
+                     cur_b_start.append(loopstart)
+                     loopstart = ori_pred_index + 1  # start = pred_end + 1
+                     loop_j = loop_j + 1
+                     LoopN = LoopN + 1
+ 
+             # For each instance in batch
+             batch_boundary.append(cur_boundary)
+             batch_boundary_start.append(cur_b_start)
+             batch_align_matrix.append(cur_align_matrix)
+ 
+         batch_loss = batch_loss / LoopN
+ 
+         batch_boundary = np.array(batch_boundary)
+         batch_boundary_start = np.array(batch_boundary_start)
+         batch_align_matrix = np.array(batch_align_matrix)
+ 
+         return batch_loss, batch_boundary, batch_boundary_start, batch_align_matrix
+ 
+     def predict(self, Xin, index_decoder_y, lens):
+         batch_size = index_decoder_y.shape[0]
+         encoder_hn, encoder_h_end = self.pointerEncoder(Xin, lens)
+         batch_loss, batch_boundary, batch_boundary_start, batch_align_matrix = self.test_decoder(encoder_hn, encoder_h_end, Xin, index_decoder_y, lens)
+         return batch_loss, batch_boundary, batch_boundary_start, batch_align_matrix
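The scoring inside `pointerLayer` — `v · SELU(W1·e + W2·d)` for each encoder state `e` against the current decoder state `d`, followed by a log-softmax over the remaining positions — can be sketched in plain NumPy. The random matrices here are stand-ins for the `nnW1`/`nnW2`/`nnV` linear layers, not the trained weights:

```python
import numpy as np

def pointer_scores(encoder_states, decoder_state, W1, W2, v):
    """Log-probability of pointing at each encoder position, one decoder step."""
    def selu(x, a=1.6732632423543772, s=1.0507009873554805):
        return s * np.where(x > 0, x, a * (np.exp(x) - 1))
    # W1·e + W2·d for every encoder position (d is broadcast over rows)
    e = selu(encoder_states @ W1.T + decoder_state @ W2.T)  # [L, H]
    scores = selu(e @ v)                                    # [L], second SELU as in pointerLayer
    return scores - np.log(np.sum(np.exp(scores)))          # log-softmax over positions

rng = np.random.default_rng(0)
H = 4
lp = pointer_scores(rng.normal(size=(3, H)), rng.normal(size=H),
                    rng.normal(size=(H, H)), rng.normal(size=(H, H)),
                    rng.normal(size=H))
print(np.exp(lp))  # attention weights over the 3 candidate positions
```

At decode time `test_decoder` restricts `encoder_states` to `predict_range` (positions from the current segment start onward) and takes the argmax of these log-probabilities as the predicted boundary.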
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ seaborn
+ matplotlib
+ streamlit == 0.87
+ pandas == 1.2.4
+ keybert
+ flair
+ click<8
run_segbot.py CHANGED
@@ -1,5 +1,4 @@
  import re
- from nltk.tokenize import word_tokenize
  import pickle
  import numpy as np
  import random
@@ -8,99 +7,67 @@ from solver import TrainSolver
  
  from model import PointerNetworks
  import gensim
- from tqdm import tqdm
- 
- class Lang:
-     def __init__(self, name):
-         self.name = name
-         self.word2index = {"RE_DIGITS":1,"UNKNOWN":0,"PADDING":2000001}
-         self.word2count = {"RE_DIGITS":1,"UNKNOWN":1,"PADDING":1}
-         self.index2word = {2000001: "PADDING", 1: "RE_DIGITS", 0: "UNKNOWN"}
-         self.n_words = 3  # Count SOS and EOS
- 
-     def addSentence(self, sentence):
-         for word in sentence.strip('\n').strip('\r').split(' '):
-             self.addWord(word)
- 
-     def addWord(self, word):
-         if word not in self.word2index:
-             self.word2index[word] = self.n_words
-             self.word2count[word] = 1
-             self.index2word[self.n_words] = word
-             self.n_words += 1
-         else:
-             self.word2count[word] += 1
- 
- 
- def mytokenizer(inS,all_dict):
- 
-     #repDig = re.sub(r'\d+[\.,/]?\d+','RE_DIGITS',inS)
-     #repDig = re.sub(r'\d*[\d,]*\d+', 'RE_DIGITS', inS)
-     toked = inS
-     or_toked = inS
-     re_unk_list = []
-     ori_list = []
- 
-     for (i,t) in enumerate(toked):
-         if t not in all_dict and t not in ['RE_DIGITS']:
-             re_unk_list.append('UNKNOWN')
-             ori_list.append(or_toked[i])
-         else:
-             re_unk_list.append(t)
-             ori_list.append(or_toked[i])
- 
-     labey_edus = [0]*len(re_unk_list)
-     labey_edus[-1] = 1
- 
-     return ori_list,re_unk_list,labey_edus
- 
- 
- def get_mapping(X,Y,D):
- 
-     X_map = []
-     for w in X:
-         if w in D:
-             X_map.append(D[w])
          else:
-             X_map.append(D['UNKNOWN'])
- 
-     X_map = np.array([X_map])
-     Y_map = np.array([Y])
- 
-     return X_map,Y_map
- 
- 
- def get_model():
      with open('model.pickle', 'rb') as f:
          mysolver = pickle.load(f)
-     return mysolver
- 
-     #for i in tqdm(range(0,26431)):
-     test_batch_ave_loss, test_pre, test_rec, test_f1, visdata = mysolver.check_accuracy(X_tes, Y_tes,index2word, fukugen)
-     #test_batch_ave_loss, test_pre, test_rec, test_f1, visdata = mysolver.check_accuracy(X_tes, Y_tes,0)
-     #with open(str(i)+"seped","w") as f:
-     #    f.write(o)
-     #test_batch_ave_loss, test_pre, test_rec, test_f1, visdata = mysolver.check_accuracy(X_tes, Y_tes,0)
-     print(test_pre, test_rec, test_f1)
-     #start_b = visdata[3][0]
-     #end_b = visdata[2][0] + 1
-     #segments = []
- 
-     #for i, END in enumerate(end_b):
-     #    START = start_b[i]
-     #    segments.append(' '.join(ori_X[START:END]))
- 
-     return test_pre, test_rec, test_f1
- 
  import re
  import pickle
  import numpy as np
  import random
  
  from model import PointerNetworks
  import gensim
+ import MeCab
+ import pysbd
+ 
+ def create_data(doc,fm,split_method):
+     wakati = MeCab.Tagger("-Owakati -b 81920")
+     seg = pysbd.Segmenter(language="ja", clean=False)
+     texts = []
+     sent = ""
+     label = []
+     alls = []
+     labels, text, num = [], [], []
+     allab, altex, fukugenss = [], [], []
+     for n in range(1):
+         fukugens = []
+         if split_method == "pySBD":
+             lines = seg.segment(doc)
+         else:
+             doc = doc.strip().replace("。","。\n").replace(".",".\n")
+             doc = re.sub("(\n)+","\n",doc)
+             lines = doc.split("\n")
+         for line in lines:
+             line = line.strip()
+             if line == "":
+                 continue
+             sent = wakati.parse(line).split(" ")[:-1]
+             flag = 0
+             label = []
+             texts = []
+             fukugen = []
+             for i in sent:
+                 try:
+                     texts.append(fm.vocab[i].index)
+                 except KeyError:
+                     texts.append(fm.vocab["<unk>"].index)
+                 fukugen.append(i)
+                 label.append(0)
+             label[-1] = 1
+             labels.append(np.array(label))
+             text.append(np.array(texts))
+             fukugens.append(fukugen)
+         allab.append(labels)
+         altex.append(text)
+         fukugenss.append(fukugens)
+         labels, text, fukugens = [], [], []
+     return altex, allab, fukugenss
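The non-pySBD branch of `create_data` is a rule-based splitter: insert a newline after each Japanese or Latin full stop, collapse newline runs, and drop empty lines. Isolated, it looks like this (`simple_split` is a hypothetical name for illustration):

```python
import re

def simple_split(doc: str) -> list:
    # Insert a newline after each sentence-ending mark, collapse runs of
    # newlines, then drop empty lines — mirrors the fallback branch above.
    doc = doc.strip().replace("。", "。\n").replace(".", ".\n")
    doc = re.sub(r"\n+", "\n", doc)
    return [line.strip() for line in doc.split("\n") if line.strip()]

print(simple_split("今日は晴れ。明日は雨。"))  # → ['今日は晴れ。', '明日は雨。']
```

Note that splitting on every `.` also breaks on abbreviations and decimals, which is presumably why pySBD is offered as the default method.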
55
+
56
+
57
+ def generate(doc, mymodel, fm, index2word, split_method):
58
+ X_tes, Y_tes, fukugen = create_data(doc,fm,split_method)
59
+ output_texts = mymodel.check_accuracy(X_tes, Y_tes,index2word, fukugen)
60
+
61
+ return output_texts
62
+
63
+
64
+
65
+ def setup():
66
+ with open('index2word.pickle', 'rb') as f:
67
+ index2word = pickle.load(f)
68
  with open('model.pickle', 'rb') as f:
69
  mysolver = pickle.load(f)
70
+ with open('fm.pickle', 'rb') as f:
71
+ fm = pickle.load(f)
72
+
73
+ return mysolver,fm,index2word
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
solver.py CHANGED
@@ -6,7 +6,6 @@ from torch.autograd import Variable
6
  import random
7
  from torch.nn.utils import clip_grad_norm
8
  import copy
9
- from tqdm import tqdm
10
 
11
  import os
12
  import pickle
@@ -56,76 +55,36 @@ def align_variable_numpy(X,maxL,paddingNumber):
56
 
57
 
58
  def sample_a_sorted_batch_from_numpy(numpyX,numpyY,batch_size,use_cuda):
59
-
60
-
61
- if batch_size != None:
62
- select_index = random.sample(range(len(numpyY)), batch_size)
63
- else:
64
- select_index = np.array(range(len(numpyY)))
65
 
66
  select_index = np.array(range(len(numpyX)))
67
 
68
  batch_x = [copy.deepcopy(numpyX[i]) for i in select_index]
69
  batch_y = [copy.deepcopy(numpyY[i]) for i in select_index]
70
 
71
- #print(batch_y)
72
  index_decoder_X,index_decoder_Y = get_decoder_index_XY(batch_y)
73
- #index_decoder = [get_decoder_index_XY(i) for i in batch_y]
74
- #index_decoder_X = [i[0] for i in index_decoder]
75
- #index_decoder_Y = [i[1] for i in index_decoder]
76
- #print(index_decoder_Y)
77
-
78
-
79
- #all_lens = []
80
  all_lens = np.array([len(x) for x in batch_y])
81
- #for x in batch_y:
82
- # print(x)
83
- # try:
84
- # all_lens.append(len(x))
85
- # except:
86
- # all_lens.append(1)
87
- #all_lens = np.array(all_lens)
88
 
89
  maxL = np.max(all_lens)
90
 
91
- #idx = all_lens
92
- #print(idx)
93
  idx = np.argsort(all_lens)
94
  idx = np.sort(idx)
95
- #print(idx)
96
- #idx = idx[::-1] # decreasing
97
- #print(idx)
98
  batch_x = [batch_x[i] for i in idx]
99
  batch_y = [batch_y[i] for i in idx]
100
  all_lens = all_lens[idx]
101
 
102
  index_decoder_X = np.array([index_decoder_X[i] for i in idx])
103
  index_decoder_Y = np.array([index_decoder_Y[i] for i in idx])
104
- #print(index_decoder_Y)
105
 
106
  numpy_batch_x = batch_x
107
 
108
-
109
-
110
  batch_x = align_variable_numpy(batch_x,maxL,2000001)
111
  batch_y = align_variable_numpy(batch_y,maxL,2)
112
-
113
-
114
-
115
-
116
-
117
-
118
-
119
- print(len(batch_x))
120
- #batch_x = Variable(torch.from_numpy(batch_x.astype(np.int64)))
121
  batch_x = Variable(torch.from_numpy(np.array(batch_x, dtype="int64")))
122
 
123
-
124
  if use_cuda:
125
  batch_x = batch_x.cuda()
126
 
127
-
128
-
129
  return numpy_batch_x,batch_x,batch_y,index_decoder_X,index_decoder_Y,all_lens,maxL
130
 
131
 
@@ -144,7 +103,6 @@ class TrainSolver(object):
144
  self.lr_decay_epoch = lr_decay_epoch
145
  self.eval_size = eval_size
146
 
147
-
148
  self.dev_x, self.dev_y = dev_x, dev_y
149
 
150
  self.model = model
@@ -152,294 +110,70 @@ class TrainSolver(object):
  self.weight_decay =weight_decay
 
 
-
-
- def sample_dev(self):
- test_tr_x = []
- test_tr_y = []
- select_index = random.sample(range(len(self.train_y)),self.eval_size)
- test_tr_x = [self.train_x[n] for n in select_index]
- test_tr_y = [self.train_y[n] for n in select_index]
-
- return test_tr_x,test_tr_y
-
-
-
-
-
-
-
-
  def get_batch_micro_metric(self,pre_b, ground_b, x,index2word, fukugen, nloop):
 
  tokendic = {}
- #with open('index2word.pickle', 'rb') as f:
- # index2word = pickle.load(f)
  for n,i in enumerate(index2word):
  tokendic[n] = i
- All_C = []
- All_R = []
- All_G = []
- """
- for i,cur_seq_y in enumerate(zip(ground_b,fukugen[nloop])):
- #print(fukugen[nloop])
- fuku = cur_seq_y[1]
- cur_seq_y = cur_seq_y[0]
- index_of_1 = np.where(cur_seq_y==1)[0]
- #print(index_of_1)
- index_pre = pre_b[i]
- inp = x[i]
- #print(len(inp))
- """
- print(len(pre_b), len(ground_b), len(fukugen))
- #global leng
- #print(fukugen)
  for i,cur_seq_y in enumerate(ground_b):
- #print(fukugen[nloop])
  fuku = fukugen[i]
- #cur_seq_y = cur_seq_y[0]
  index_of_1 = np.where(cur_seq_y==1)[0]
- #print(index_of_1)
  index_pre = pre_b[i]
  inp = x[i]
- #print(len(inp))
 
  index_pre = np.array(index_pre)
  END_B = index_of_1[-1]
  index_pre = index_pre[index_pre != END_B]
  index_of_1 = index_of_1[index_of_1 != END_B]
 
- no_correct = len(np.intersect1d(list(index_of_1), list(index_pre)))
- All_C.append(no_correct)
- All_R.append(len(index_pre))
- All_G.append(len(index_of_1))
 
  index_of_1 = list(index_of_1)
  index_pre = list(index_pre)
 
- FN = []
  FP = []
- TP = []
  sent = []
  ex = ""
- for j in inp:
- sent.append(tokendic[int(j.to('cpu').detach().numpy().copy())])
- for k in index_of_1:
- if k not in index_pre:
- FN.append(k)
- if k in index_pre:
- TP.append(k)
  for k in index_pre:
  if k not in index_of_1:
  FP.append(k)
- #if len(FN) == 0 and len(FP) == 0:
- # continue
- #for n,i in enumerate(sent):
  for n,k in enumerate(zip(sent, fuku)):
  f = k[1]
  i = k[0]
  if k == "<pad>":
  continue
  if n in FP:
- ex += f + "<FP>"
- else:
  ex += f
- """
- if n in FN:
- #ex += i + "<FN>"
- ex += i
- elif n in FP:
- ex += i + "<FP>"
- elif n in TP:
- ex += i + "<TP>"
  else:
- ex += i
- """
- #with open(str(nloop)+"_sep_nounk.txt", "a")as f:
- # f.write(ex+"\n")
- #print(i)
- #leng += 1
-
- return All_C,All_R,All_G
-
-
-
-
-
- def get_batch_metric(self,pre_b, ground_b):
-
- b_pr =[]
- b_re =[]
- b_f1 =[]
- for i,cur_seq_y in enumerate(ground_b):
- index_of_1 = np.where(cur_seq_y==1)[0]
- index_pre = pre_b[i]
-
- no_correct = len(np.intersect1d(index_of_1,index_pre))
-
- cur_pre = no_correct / len(index_pre)
- cur_rec = no_correct / len(index_of_1)
- cur_f1 = 2*cur_pre*cur_rec/ (cur_pre+cur_rec)
-
- b_pr.append(cur_pre)
- b_re.append(cur_rec)
- b_f1.append(cur_f1)
-
- return b_pr,b_re,b_f1
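For reference, the metric code this commit removes counts correct, retrieved, and gold boundary indices and then divides; the surviving call sites micro-average by summing those counts over the whole batch before computing precision, recall, and F1. A minimal standalone sketch of that micro-averaging (the helper name `micro_boundary_f1` is hypothetical; only numpy is assumed):

```python
import numpy as np

def micro_boundary_f1(pred_batch, gold_batch):
    # Sum correct / retrieved / gold boundary counts over the batch,
    # then compute precision, recall and F1 once (micro-averaging).
    n_correct = n_pred = n_gold = 0
    for pred, gold in zip(pred_batch, gold_batch):
        gold_idx = np.where(np.asarray(gold) == 1)[0]   # gold boundary positions
        pred_idx = np.asarray(pred)                      # predicted positions
        n_correct += len(np.intersect1d(gold_idx, pred_idx))
        n_pred += len(pred_idx)
        n_gold += len(gold_idx)
    precision = n_correct / n_pred
    recall = n_correct / n_gold
    f1 = 2 * precision * recall / (precision + recall)
    return precision, recall, f1
```

Unlike the removed per-sequence `get_batch_metric`, this divides only once, so sequences with no predicted or no gold boundaries cannot cause a per-item zero division.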
-
 
 
  def check_accuracy(self,data2X,data2Y,index2word, fukugen2):
- for nloop in tqdm(range(0,108)):
  dataY = data2Y[nloop]
  dataX = data2X[nloop]
  fukugen = fukugen2[nloop]
- #print(len(dataX), len(dataY), len(fukugen))
  need_loop = int(np.ceil(len(dataY) / self.batch_size))
- #need_loop = int(np.ceil(len(dataY) / 1))
- all_ave_loss =[]
- all_boundary =[]
- all_boundary_start = []
- all_align_matrix = []
- all_index_decoder_y =[]
- all_x_save = []
-
- all_C =[]
- all_R =[]
- all_G =[]
 
  for lp in range(need_loop):
  startN = lp*self.batch_size
  endN = (lp+1)*self.batch_size
  if endN > len(dataY):
  endN = len(dataY)
- #print(fukugen)
  fukuge = fukugen[startN:endN]
- #print(startN, endN)
- #print(len(fukugen))
- #print(fukugen)
- #for nloop in tqdm(range(0,26431)):
  numpy_batch_x, batch_x, batch_y, index_decoder_X, index_decoder_Y, all_lens, maxL = sample_a_sorted_batch_from_numpy(
  dataX[startN:endN], dataY[startN:endN], None, self.use_cuda)
- #numpy_batch_x, batch_x, batch_y, index_decoder_X, index_decoder_Y, all_lens, maxL = sample_a_sorted_batch_from_numpy(
- # dataX, dataY, None, self.use_cuda)
-
- batch_ave_loss, batch_boundary, batch_boundary_start, batch_align_matrix = self.model.predict(batch_x,
- index_decoder_Y,
- all_lens)
-
- all_ave_loss.extend([batch_ave_loss.data.item()]) #[batch_ave_loss.data[0]]
- all_boundary.extend(batch_boundary)
- all_boundary_start.extend(batch_boundary_start)
- all_align_matrix.extend(batch_align_matrix)
- all_index_decoder_y.extend(index_decoder_Y)
- all_x_save.extend(numpy_batch_x)
-
- ba_C,ba_R,ba_G = self.get_batch_micro_metric(batch_boundary,batch_y,batch_x,index2word, fukuge, nloop)
-
- all_C.extend(ba_C)
- all_R.extend(ba_R)
- all_G.extend(ba_G)
-
- ba_pre = np.sum(all_C)/ np.sum(all_R)
- ba_rec = np.sum(all_C)/ np.sum(all_G)
- ba_f1 = 2*ba_pre*ba_rec/ (ba_pre+ba_rec)
-
- return np.mean(all_ave_loss),ba_pre,ba_rec,ba_f1, (all_x_save,all_index_decoder_y,all_boundary, all_boundary_start, all_align_matrix)
-
-
- def adjust_learning_rate(self,optimizer,epoch,lr_decay=0.5, lr_decay_epoch=5):
-
- if (epoch % lr_decay_epoch == 0) and (epoch != 0):
- for param_group in optimizer.param_groups:
- param_group['lr'] *= lr_decay
-
-
- def train(self,n):
-
- self.test_train_x, self.test_train_y = self.sample_dev()
-
- optimizer = optim.Adam(filter(lambda p: p.requires_grad, self.model.parameters()), lr=self.lr, weight_decay=self.weight_decay)
-
- num_each_batch = int(np.round(len(self.train_y) / self.batch_size))
-
- #os.mkdir(self.save_path)
-
- best_i =0
- best_f1 =0
-
- for epoch in range(self.epoch):
- print(epoch)
- self.adjust_learning_rate(optimizer, epoch, 0.8, self.lr_decay_epoch)
-
- track_epoch_loss = []
- for iter in tqdm(range(num_each_batch)):
- #print("epoch:%d,iteration:%d" % (epoch, iter))
-
- self.model.zero_grad()
-
- numpy_batch_x,batch_x, batch_y, index_decoder_X, index_decoder_Y, all_lens, maxL = sample_a_sorted_batch_from_numpy(
- self.train_x, self.train_y, self.batch_size, self.use_cuda)
-
- neg_loss = self.model.neg_log_likelihood(batch_x, index_decoder_X, index_decoder_Y,all_lens)
-
- neg_loss_v = float(neg_loss.data.item())
- #print(neg_loss_v)
- track_epoch_loss.append(neg_loss_v)
-
- neg_loss.backward()
-
- clip_grad_norm(self.model.parameters(), 5)
- optimizer.step()
-
- #TODO: after each epoch,check accuracy
-
- self.model.eval()
-
- #tr_batch_ave_loss, tr_pre, tr_rec, tr_f1 ,visdata= self.check_accuracy(self.test_train_x,self.test_train_y)
-
- dev_batch_ave_loss, dev_pre, dev_rec, dev_f1, visdata =self.check_accuracy(self.dev_x,self.dev_y,n)
- print("f1="+str(dev_f1))
- print("loss="+str(dev_batch_ave_loss))
- """
- if best_f1 < dev_f1:
- best_f1 = dev_f1
- best_rec = dev_rec
- best_pre = dev_pre
- best_i = epoch
-
- save_data = [epoch,dev_batch_ave_loss,dev_pre,dev_rec,dev_f1]
-
- save_file_name = 'bs_{}_es_{}_lr_{}_lrdc_{}_wd_{}_epoch_loss_acc_pk_wd.txt'.format(self.batch_size,self.eval_size,self.lr,self.lr_decay_epoch,self.weight_decay)
- """
- #with open(os.path.join(self.save_path,save_file_name), 'a') as f:
- # f.write(','.join(map(str,save_data))+'\n')
-
- #if epoch % 1 ==0 and epoch !=0:
- # torch.save(self.model, os.path.join(self.save_path,r'model_epoch_%d.torchsave'%(epoch)))
 
- self.model.train()
 
- #return best_i,best_pre,best_rec,best_f1
- return best_i,best_f1,n
 
  import random
  from torch.nn.utils import clip_grad_norm
  import copy
 
  import os
  import pickle
 
 
  def sample_a_sorted_batch_from_numpy(numpyX,numpyY,batch_size,use_cuda):
+ select_index = np.array(range(len(numpyY)))
 
  select_index = np.array(range(len(numpyX)))
 
  batch_x = [copy.deepcopy(numpyX[i]) for i in select_index]
  batch_y = [copy.deepcopy(numpyY[i]) for i in select_index]
 
  index_decoder_X,index_decoder_Y = get_decoder_index_XY(batch_y)
  all_lens = np.array([len(x) for x in batch_y])
 
  maxL = np.max(all_lens)
 
  idx = np.argsort(all_lens)
  idx = np.sort(idx)
  batch_x = [batch_x[i] for i in idx]
  batch_y = [batch_y[i] for i in idx]
  all_lens = all_lens[idx]
 
  index_decoder_X = np.array([index_decoder_X[i] for i in idx])
  index_decoder_Y = np.array([index_decoder_Y[i] for i in idx])
 
  numpy_batch_x = batch_x
 
  batch_x = align_variable_numpy(batch_x,maxL,2000001)
  batch_y = align_variable_numpy(batch_y,maxL,2)
 
  batch_x = Variable(torch.from_numpy(np.array(batch_x, dtype="int64")))
 
  if use_cuda:
  batch_x = batch_x.cuda()
 
  return numpy_batch_x,batch_x,batch_y,index_decoder_X,index_decoder_Y,all_lens,maxL
 
90
 
 
103
  self.lr_decay_epoch = lr_decay_epoch
104
  self.eval_size = eval_size
105
 
 
106
  self.dev_x, self.dev_y = dev_x, dev_y
107
 
108
  self.model = model
 
110
  self.weight_decay =weight_decay
111
 
112
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
  def get_batch_micro_metric(self,pre_b, ground_b, x,index2word, fukugen, nloop):
114
 
115
+
116
+
117
  tokendic = {}
 
 
118
  for n,i in enumerate(index2word):
119
  tokendic[n] = i
120
+ sents = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
  for i,cur_seq_y in enumerate(ground_b):
 
122
  fuku = fukugen[i]
 
123
  index_of_1 = np.where(cur_seq_y==1)[0]
 
124
  index_pre = pre_b[i]
125
  inp = x[i]
 
126
 
127
  index_pre = np.array(index_pre)
128
  END_B = index_of_1[-1]
129
  index_pre = index_pre[index_pre != END_B]
130
  index_of_1 = index_of_1[index_of_1 != END_B]
131
 
 
 
 
 
132
 
133
  index_of_1 = list(index_of_1)
134
  index_pre = list(index_pre)
135
 
 
136
  FP = []
 
137
  sent = []
138
  ex = ""
139
+ sent = [tokendic[int(j.to('cpu').detach().numpy().copy())] for j in inp]
 
 
 
 
 
 
140
  for k in index_pre:
141
  if k not in index_of_1:
142
  FP.append(k)
143
+ #FP = [int(j.to('cpu').detach().numpy().copy()) for j in FP]
144
+
 
145
  for n,k in enumerate(zip(sent, fuku)):
146
  f = k[1]
147
  i = k[0]
148
  if k == "<pad>":
149
  continue
150
  if n in FP:
 
 
151
  ex += f
152
+ sents.append(ex)
153
+ ex = ""
 
 
 
 
 
 
154
  else:
155
+ ex += f
156
+ sents.append(ex)
157
+ return sents
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
 
159
 
  def check_accuracy(self,data2X,data2Y,index2word, fukugen2):
+ for nloop in range(1):
  dataY = data2Y[nloop]
  dataX = data2X[nloop]
  fukugen = fukugen2[nloop]
  need_loop = int(np.ceil(len(dataY) / self.batch_size))
 
  for lp in range(need_loop):
  startN = lp*self.batch_size
  endN = (lp+1)*self.batch_size
  if endN > len(dataY):
  endN = len(dataY)
  fukuge = fukugen[startN:endN]
  numpy_batch_x, batch_x, batch_y, index_decoder_X, index_decoder_Y, all_lens, maxL = sample_a_sorted_batch_from_numpy(
  dataX[startN:endN], dataY[startN:endN], None, self.use_cuda)
 
+ batch_ave_loss, batch_boundary, batch_boundary_start, batch_align_matrix = self.model.predict(batch_x,index_decoder_Y,all_lens)
+ output_texts = self.get_batch_micro_metric(batch_boundary,batch_y,batch_x,index2word, fukuge, nloop)
 
+ return output_texts
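After this commit, `get_batch_micro_metric` no longer scores boundaries; it rebuilds output text by concatenating surface tokens and cutting a new segment after each predicted boundary index. The underlying operation can be sketched standalone (the helper name `split_at_boundaries` is hypothetical):

```python
def split_at_boundaries(tokens, boundaries):
    # Concatenate tokens into segments, starting a new segment
    # immediately after each index listed in boundaries.
    segments, current = [], ""
    cut = set(boundaries)
    for i, tok in enumerate(tokens):
        current += tok
        if i in cut:
            segments.append(current)
            current = ""
    if current:                      # flush the trailing segment
        segments.append(current)
    return segments
```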