Spaces:

mikeee
/

radiobee-aligner

Build error

App Files Files Community

freemt committed on Jan 2, 2022

Commit

d5ff673

1 Parent(s): 077e1eb

Update plot_mat

Browse files

Files changed (9) hide show

data/ps-cn.txt +0 -0
data/ps-en.txt +0 -0
package.json +1 -1
radiobee/__main__.py +203 -88
radiobee/gen_pset.py +5 -0
radiobee/plot_cmat.py +145 -0
radiobee/plot_df.py +74 -47
radiobee/trim_df.py +30 -0
run-radiobee.bat +2 -1

data/ps-cn.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

data/ps-en.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

package.json CHANGED Viewed

@@ -8,7 +8,7 @@
   },
   "scripts": {
     "pyright": "pyright",
-    "flake8": "flake8.bat",
     "test": "echo \"Error: no test specified\" && exit 1"
   },
   "repository": {

   },
   "scripts": {
     "pyright": "pyright",
+    "flake8": "flake8",
     "test": "echo \"Error: no test specified\" && exit 1"
   },
   "repository": {

radiobee/__main__.py CHANGED Viewed

@@ -10,7 +10,7 @@ from textwrap import dedent
 from itertools import zip_longest
 from socket import socket, AF_INET, SOCK_STREAM
-from sklearn.cluster import DBSCAN
 import joblib
 from varname import nameof
 from logzero import logger
@@ -18,7 +18,8 @@ from logzero import logger
 # import numpy as np
 import pandas as pd
 import seaborn as sns
-import matplotlib.pyplot as plt
 # from tabulate import tabulate
 from fastlid import fastlid
@@ -34,9 +35,12 @@ from radiobee.gen_pset import gen_pset
 from radiobee.gen_aset import gen_aset
 from radiobee.align_texts import align_texts
-# from radiobee.plot_df import plot_df
 from radiobee.cmat2tset import cmat2tset
 sns.set()
 sns.set_style("darkgrid")
 fastlid.set_languages = ["en", "zh"]
@@ -91,8 +95,12 @@ if __name__ == "__main__":
     ]
     outputs = ["dataframe"]
     # """
-    # import logzero
-    # logzero.loglevel(10)
     logger.debug(" debug ")
     logger.info(" info ")
@@ -119,9 +127,15 @@ if __name__ == "__main__":
         x min_df: int | float = 1
         x max_df: int | float = 1.0
     # """
-    input_tf_type = gr.inputs.Dropdown(["linear", "sqrt", "log", "binary"], default="linear")
-    input_idf_type = gr.inputs.Radio(["None", "standard", "smooth", "bm25"], default="None")  # need to convert "None" this to None in fn
-    input_dl_type = gr.inputs.Radio(["None", "linear", "sqrt", "log"], default="None")  # ditto
     input_norm_type = gr.inputs.Radio(["None", "l1", "l2"], default="None")  # ditto
     inputs = [
@@ -147,12 +161,76 @@ if __name__ == "__main__":
     # modi
     examples = [
-        ["data/test_zh.txt", "data/test_en.txt", "linear", "None", "None", "None", 10, 6, ],
-        ["data/test_en.txt", "data/test_zh.txt", "linear", "None", "None", "None", 10, 6, ],
-        ["data/shakespeare_zh500.txt", "data/shakespeare_en500.txt", "linear", "None", "None", "None", 10, 6, ],
-        ["data/shakespeare_en500.txt", "data/shakespeare_zh500.txt", "linear", "None", "None", "None", 10, 6, ],
-        ["data/hlm-ch1-zh.txt", "data/hlm-ch1-en.txt", "linear", "None", "None", "None", 10, 6, ],
-        ["data/hlm-ch1-en.txt", "data/hlm-ch1-zh.txt", "linear", "None", "None", "None", 10, 6, ],
     ]
     outputs = ["dataframe", "plot"]
     outputs = ["plot"]
@@ -227,8 +305,8 @@ if __name__ == "__main__":
         lst1 = [elm for elm in df1.text1 if elm]
         lst2 = [elm for elm in df1.text2 if elm]
-        len1 = len(lst1)
-        len2 = len(lst2)
         cmat = lists2cmat(
             lst1,
@@ -242,66 +320,8 @@ if __name__ == "__main__":
         tset = pd.DataFrame(cmat2tset(cmat))
         tset.columns = ["x", "y", "cos"]
-        df_ = tset
-        xlabel: str = lang1
-        ylabel: str = lang2
-        sns.set()
-        sns.set_style("darkgrid")
-        # close all existing figures, necesssary for hf spaces
-        plt.close("all")
-        # if sys.platform not in ["win32", "linux"]:
-        plt.switch_backend('Agg')  # to cater for Mac, thanks to WhiteFox
-        fig = plt.figure()
-        gs = fig.add_gridspec(2, 2, wspace=0.4, hspace=0.58)
-        ax2 = fig.add_subplot(gs[0, 0])
-        ax0 = fig.add_subplot(gs[0, 1])
-        ax1 = fig.add_subplot(gs[1, 0])
-        cmap = "viridis_r"
-        sns.heatmap(cmat, cmap=cmap, ax=ax2).invert_yaxis()
-        ax2.set_xlabel(xlabel)
-        ax2.set_ylabel(ylabel)
-        ax2.set_title("cos similarity heatmap")
-        fig.suptitle("alignment projection")
-        _ = DBSCAN(min_samples=min_samples, eps=eps).fit(df_).labels_ > -1
-        # _x = DBSCAN(min_samples=min_samples, eps=eps).fit(df_).labels_ < 0
-        _x = ~_
-        df_.plot.scatter("x", "y", c="cos", cmap=cmap, ax=ax0)
-        # clustered
-        df_[_].plot.scatter("x", "y", c="cos", cmap=cmap, ax=ax1)
-        # outliers
-        df_[_x].plot.scatter("x", "y", c="r", marker="x", alpha=0.6, ax=ax0)
-        # ax0.set_xlabel("")
-        # ax0.set_ylabel("zh")
-        ax0.set_xlabel(xlabel)
-        ax0.set_ylabel(ylabel)
-        ax0.set_xlim(0, len1)
-        ax0.set_ylim(0, len2)
-        ax0.set_title("max along columns ('x': outliers)")
-        # ax1.set_xlabel("en")
-        # ax1.set_ylabel("zh")
-        ax1.set_xlabel(xlabel)
-        ax1.set_ylabel(ylabel)
-        ax1.set_xlim(0, len1)
-        ax1.set_ylim(0, len2)
-        ax1.set_title(f"potential aligned pairs ({round(sum(_) / len1, 2):.0%})")
-        # return df, plot_df(pd.DataFrame(cmat))
-        # tset.plot.scatter("x", "y", c="cos", cmap="viridis_r")
         df_trimmed = pd.concat(
             [
                 df1.iloc[:4, :],
@@ -318,12 +338,13 @@ if __name__ == "__main__":
             ],
             ignore_index=1,
         )
         # process lst1, lst2 to obtained df_aligned
         # quick fix ValueError: not enough values to unpack (expected at least 1, got 0)
         # fixed in gen_pset, but we leave the loop here
         for min_s in range(min_samples):
-            logger.info(" min_samples, try %s", min_samples - min_s)
             try:
                 pset = gen_pset(
                     cmat,
@@ -342,6 +363,89 @@ if __name__ == "__main__":
             # break should happen above when min_samples = 2
             raise Exception("bummer, this shouldn't happen, probably another bug")
         src_len, tgt_len = cmat.shape
         aset = gen_aset(pset, src_len, tgt_len)
         final_list = align_texts(aset, lst2, lst1)  # note the order
@@ -359,7 +463,9 @@ if __name__ == "__main__":
         # file_dl.write_text(_, encoding="gb2312")  # no go
-        file_dl_xlsx = Path(f"{Path(file1.name).stem[:-8]}-{Path(file2.name).stem[:-8]}.xlsx")
         df_aligned.to_excel(file_dl_xlsx)
         # return df_trimmed, plt
@@ -395,10 +501,13 @@ if __name__ == "__main__":
         *   `Flag`: Should something go wrong, you can click Flag to save the output and inform the developer.
     """
     )
-    css = ".output_image, .input_image {height: 40rem !important; width: 100% !important;}"
-    css = ".output_image, .input_image {height: 20rem !important; width: 100% !important;}"
-    css_file = (
-        ".input_file, .output_file {height: 9rem !important; width: 100% !important;}"
     )
     logger.info("running at port %s", server_port)
@@ -424,15 +533,21 @@ if __name__ == "__main__":
         # height=150,  # 500
         width=900,  # 900
         allow_flagging=True,
-        flagging_options=["fatal", "bug", "brainstorm", "excelsior", ],  # "paragon"],
-        css=f"{css} {css_file}",
     )
     iface.launch(
-        # share=False,
-        share=True,
-        debug=True,
-        server_name="0.0.0.0",
         server_port=server_port,
         # show_tips=True,
         enable_queue=True,

 from itertools import zip_longest
 from socket import socket, AF_INET, SOCK_STREAM
+from sklearn.cluster import DBSCAN  # noqa
 import joblib
 from varname import nameof
 from logzero import logger
 # import numpy as np
 import pandas as pd
 import seaborn as sns
+import matplotlib.pyplot as plt  # noqa
 # from tabulate import tabulate
 from fastlid import fastlid
 from radiobee.gen_aset import gen_aset
 from radiobee.align_texts import align_texts
 from radiobee.cmat2tset import cmat2tset
+# from radiobee.plot_df import plot_df
+# from radiobee.plot_cmat import plot_cmat
+from radiobee.trim_df import trim_df
 sns.set()
 sns.set_style("darkgrid")
 fastlid.set_languages = ["en", "zh"]
     ]
     outputs = ["dataframe"]
     # """
+    import logzero
+    # debug = True
+    debug = False
+    if debug:
+        logzero.loglevel(10)
     logger.debug(" debug ")
     logger.info(" info ")
         x min_df: int | float = 1
         x max_df: int | float = 1.0
     # """
+    input_tf_type = gr.inputs.Dropdown(
+        ["linear", "sqrt", "log", "binary"], default="linear"
+    )
+    input_idf_type = gr.inputs.Radio(
+        ["None", "standard", "smooth", "bm25"], default="None"
+    )  # need to convert "None" this to None in fn
+    input_dl_type = gr.inputs.Radio(
+        ["None", "linear", "sqrt", "log"], default="None"
+    )  # ditto
     input_norm_type = gr.inputs.Radio(["None", "l1", "l2"], default="None")  # ditto
     inputs = [
     # modi
     examples = [
+        [
+            "data/test_zh.txt",
+            "data/test_en.txt",
+            "linear",
+            "None",
+            "None",
+            "None",
+            10,
+            6,
+        ],
+        [
+            "data/test_en.txt",
+            "data/test_zh.txt",
+            "linear",
+            "None",
+            "None",
+            "None",
+            10,
+            6,
+        ],
+        [
+            "data/shakespeare_zh500.txt",
+            "data/shakespeare_en500.txt",
+            "linear",
+            "None",
+            "None",
+            "None",
+            10,
+            6,
+        ],
+        [
+            "data/shakespeare_en500.txt",
+            "data/shakespeare_zh500.txt",
+            "linear",
+            "None",
+            "None",
+            "None",
+            10,
+            6,
+        ],
+        [
+            "data/hlm-ch1-zh.txt",
+            "data/hlm-ch1-en.txt",
+            "linear",
+            "None",
+            "None",
+            "None",
+            10,
+            6,
+        ],
+        [
+            "data/hlm-ch1-en.txt",
+            "data/hlm-ch1-zh.txt",
+            "linear",
+            "None",
+            "None",
+            "None",
+            10,
+            6,
+        ],
+        [
+            "data/ps-cn.txt",
+            "data/ps-en.txt",
+            "linear",
+            "None",
+            "None",
+            "None",
+            10,
+            4,
+        ],
     ]
     outputs = ["dataframe", "plot"]
     outputs = ["plot"]
         lst1 = [elm for elm in df1.text1 if elm]
         lst2 = [elm for elm in df1.text2 if elm]
+        # len1 = len(lst1)  # noqa
+        # len2 = len(lst2)  # noqa
         cmat = lists2cmat(
             lst1,
         tset = pd.DataFrame(cmat2tset(cmat))
         tset.columns = ["x", "y", "cos"]
+        df_trimmed = trim_df(df1)
+        _ = """
         df_trimmed = pd.concat(
             [
                 df1.iloc[:4, :],
             ],
             ignore_index=1,
         )
+        # """
         # process lst1, lst2 to obtained df_aligned
         # quick fix ValueError: not enough values to unpack (expected at least 1, got 0)
         # fixed in gen_pset, but we leave the loop here
         for min_s in range(min_samples):
+            logger.info(" min_samples, using %s", min_samples - min_s)
             try:
                 pset = gen_pset(
                     cmat,
             # break should happen above when min_samples = 2
             raise Exception("bummer, this shouldn't happen, probably another bug")
+        min_samples = gen_pset.min_samples
+        # will result in error message:
+        # UserWarning: Starting a Matplotlib GUI outside of
+        # the main thread will likely fail."
+        _ = """
+        plot_cmat(
+            cmat,
+            eps=eps,
+            min_samples=min_samples,
+            xlabel=lang1,
+            ylabel=lang2,
+        )
+        # """
+        # move plot_cmat's code to the main thread here
+        # to make it work
+        xlabel = lang1
+        ylabel = lang2
+        len1, len2 = cmat.shape
+        ylim, xlim = len1, len2
+        # does not seem to show up
+        logger.debug(" len1 (ylim): %s, len2 (xlim): %s", len1, len2)
+        if debug:
+            print(f" len1 (ylim): {len1}, len2 (xlim): {len2}")
+        df_ = pd.DataFrame(cmat2tset(cmat))
+        df_.columns = ["x", "y", "cos"]
+        sns.set()
+        sns.set_style("darkgrid")
+        # close all existing figures, necessary for hf spaces
+        plt.close("all")
+        # if sys.platform not in ["win32", "linux"]:
+        plt.switch_backend('Agg')  # to cater for Mac, thanks to WhiteFox
+        # figsize=(13, 8), (339, 212) mm on '1280x800+0+0'
+        fig = plt.figure(figsize=(13, 8))
+        # gs = fig.add_gridspec(2, 2, wspace=0.4, hspace=0.58)
+        gs = fig.add_gridspec(1, 2, wspace=0.4, hspace=0.58)
+        ax_heatmap = fig.add_subplot(gs[0, 0])  # ax2
+        ax0 = fig.add_subplot(gs[0, 1])
+        # ax1 = fig.add_subplot(gs[1, 0])
+        cmap = "viridis_r"
+        sns.heatmap(cmat, cmap=cmap, ax=ax_heatmap).invert_yaxis()
+        ax_heatmap.set_xlabel(xlabel)
+        ax_heatmap.set_ylabel(ylabel)
+        ax_heatmap.set_title("cos similarity heatmap")
+        fig.suptitle(f"alignment projection\n(eps={eps}, min_samples={min_samples})")
+        _ = DBSCAN(min_samples=min_samples, eps=eps).fit(df_).labels_ > -1
+        # _x = DBSCAN(min_samples=min_samples, eps=eps).fit(df_).labels_ < 0
+        _x = ~_
+        # max cos along columns
+        df_.plot.scatter("x", "y", c="cos", cmap=cmap, ax=ax0)
+        # outliers
+        df_[_x].plot.scatter("x", "y", c="r", marker="x", alpha=0.6, ax=ax0)
+        ax0.set_xlabel(xlabel)
+        ax0.set_ylabel(ylabel)
+        ax0.set_xlim(xmin=0, xmax=xlim)
+        ax0.set_ylim(ymin=0, ymax=ylim)
+        ax0.set_title(
+            "max along columns ('x': outliers)\n"
+            "potential aligned pairs (green line)\n"
+            f"({round(sum(_) / xlim, 2):.0%})"
+        )
+        # clustered
+        # df_[_].plot.scatter("x", "y", c="cos", cmap=cmap, ax=ax1)
+        # ax1.set_xlabel(xlabel)
+        # ax1.set_ylabel(ylabel)
+        # ax1.set_xlim(0, len1)
+        # ax1.set_title(f"potential aligned pairs ({round(sum(_) / len1, 2):.0%})")
+        # end of plot_cmat
         src_len, tgt_len = cmat.shape
         aset = gen_aset(pset, src_len, tgt_len)
         final_list = align_texts(aset, lst2, lst1)  # note the order
         # file_dl.write_text(_, encoding="gb2312")  # no go
+        file_dl_xlsx = Path(
+            f"{Path(file1.name).stem[:-8]}-{Path(file2.name).stem[:-8]}.xlsx"
+        )
         df_aligned.to_excel(file_dl_xlsx)
         # return df_trimmed, plt
         *   `Flag`: Should something go wrong, you can click Flag to save the output and inform the developer.
     """
     )
+    css_image = ".output_image, .input_image {height: 40rem !important; width: 100% !important;}"
+    # css = ".output_image, .input_image {height: 20rem !important; width: 100% !important;}"
+    css_input_file = (
+        ".input_file, {height: 9rem !important; width: 100% !important;}"
+    )
+    css_output_file = (
+        ".output_file , {height: 4rem !important; width: 100% !important;}"
     )
     logger.info("running at port %s", server_port)
         # height=150,  # 500
         width=900,  # 900
         allow_flagging=True,
+        flagging_options=[
+            "fatal",
+            "bug",
+            "brainstorm",
+            "excelsior",
+        ],  # "paragon"],
+        css=f"{css_image} {css_input_file} {css_output_file}",
     )
     iface.launch(
+        share=False,
+        # share=True,
+        debug=debug,
+        # server_name="0.0.0.0",
+        server_name="127.0.0.1",
         server_port=server_port,
         # show_tips=True,
         enable_queue=True,

radiobee/gen_pset.py CHANGED Viewed

@@ -152,6 +152,7 @@ def gen_pset(
     Refer to _gen_pset.
     """
     for min_s in range(min_samples):
         logger.debug(" min_samples, try %s", min_samples - min_s)
         try:
@@ -171,4 +172,8 @@ def gen_pset(
     else:
         # break should happen above when min_samples = 2
         raise Exception("bummer, this shouldn't happen, probably another bug")
     return pset

     Refer to _gen_pset.
     """
+    gen_pset.min_samples = min_samples
     for min_s in range(min_samples):
         logger.debug(" min_samples, try %s", min_samples - min_s)
         try:
     else:
         # break should happen above when min_samples = 2
         raise Exception("bummer, this shouldn't happen, probably another bug")
+    # store new min_samples
+    gen_pset.min_samples = min_samples - min_s
     return pset

radiobee/plot_cmat.py ADDED Viewed

	@@ -0,0 +1,145 @@

+"""Plot pandas.DataFrame with DBSCAN clustering."""
+# pylint: disable=invalid-name, too-many-arguments
+import numpy as np
+import pandas as pd
+import matplotlib
+import matplotlib.pyplot as plt
+import seaborn as sns
+from sklearn.cluster import DBSCAN
+from fastlid import fastlid
+import logzero
+from logzero import logger
+from radiobee.cmat2tset import cmat2tset
+# turn interactive when in ipython session
+_ = """
+if "get_ipython" in globals():
+    plt.ion()
+else:
+    plt.switch_backend("Agg")
+# """
+logzero.loglevel(20)  # 10: debug on
+fastlid.set_languages = ["en", "zh"]
+# fmt: off
def plot_cmat(
        cmat: np.ndarray,
        eps: float = 10,
        min_samples: int = 6,
        xlabel: str = "zh",
        ylabel: str = "en",
        backend: str = "Agg",
        showfig: bool = False,
) -> None:
    # fmt: on
    """Plot a similarity matrix together with its DBSCAN-clustered maxima.

    Three panels are drawn on one figure: a heatmap of ``cmat``, a scatter
    of every (x, y, cos) triple produced by ``cmat2tset(cmat)`` with the
    DBSCAN outliers overlaid as red crosses, and a scatter of the clustered
    triples only.

    Args:
        cmat: 2-d similarity matrix.
        eps: ``eps`` passed to ``sklearn.cluster.DBSCAN``.
        min_samples: ``min_samples`` passed to ``sklearn.cluster.DBSCAN``.
        xlabel: label of the x axis of every panel.
        ylabel: label of the y axis of every panel.
        backend: Matplotlib backend used while plotting; the backend that
            was active on entry is restored before returning.
        showfig: when true, block on ``plt.show`` before returning.

    Returns:
        None. The figure is not handed back to the caller.
        NOTE(review): on Matplotlib releases where ``switch_backend`` closes
        open figures, restoring the backend also discards the figure --
        confirm before relying on ``plt.gcf()`` afterwards.

    Author's note kept from the original docstring: sorting the triples by
    "x" did not seem to change the DBSCAN labels.
    """
    logger.debug(
        '"get_ipython" in globals(): %s', "get_ipython" in globals()
    )

    # seaborn.heatmap lays matrix rows along the y axis and columns along
    # the x axis, so the scatter panels take their limits in that order.
    # (assumes cmat2tset's "x" indexes cmat columns -- TODO confirm)
    ylim, xlim = cmat.shape

    df_ = pd.DataFrame(cmat2tset(cmat))
    df_.columns = ["x", "y", "cos"]

    backend_saved = matplotlib.get_backend()
    # backend names are case-insensitive for Matplotlib
    switched = backend_saved.lower() != backend.lower()
    if switched:
        plt.switch_backend(backend)

    try:
        sns.set()
        sns.set_style("darkgrid")

        # close all existing figures, necessary for hf spaces
        plt.close("all")

        fig = plt.figure()
        gs = fig.add_gridspec(2, 2, wspace=0.4, hspace=0.58)
        ax_heatmap = fig.add_subplot(gs[0, 0])
        ax_all = fig.add_subplot(gs[0, 1])
        ax_clustered = fig.add_subplot(gs[1, 0])

        cmap = "viridis_r"
        sns.heatmap(cmat, cmap=cmap, ax=ax_heatmap).invert_yaxis()
        ax_heatmap.set_xlabel(xlabel)
        ax_heatmap.set_ylabel(ylabel)
        ax_heatmap.set_title("cos similarity heatmap")

        fig.suptitle("alignment projection")

        # DBSCAN labels noise points -1; everything else sits in a cluster
        labels = DBSCAN(min_samples=min_samples, eps=eps).fit(df_).labels_
        clustered = labels > -1
        outliers = ~clustered

        df_.plot.scatter("x", "y", c="cos", cmap=cmap, ax=ax_all)
        df_[clustered].plot.scatter(
            "x", "y", c="cos", cmap=cmap, ax=ax_clustered
        )
        df_[outliers].plot.scatter(
            "x", "y", c="r", marker="x", alpha=0.6, ax=ax_all
        )

        ax_all.set_xlabel(xlabel)
        ax_all.set_ylabel(ylabel)
        ax_all.set_xlim(0, xlim)
        ax_all.set_ylim(0, ylim)
        ax_all.set_title("max along columns ('x': outliers)")

        ax_clustered.set_xlabel(xlabel)
        ax_clustered.set_ylabel(ylabel)
        ax_clustered.set_xlim(0, xlim)
        ax_clustered.set_ylim(0, ylim)
        # share of plotted points that DBSCAN kept in a cluster
        ratio = clustered.sum() / len(df_)
        ax_clustered.set_title(
            f"potential aligned pairs ({round(ratio, 2):.0%})"
        )

        logger.debug(
            " matplotlib.get_backend(): %s", matplotlib.get_backend()
        )

        if showfig:
            # blocking show, or the window appears and vanishes at once
            plt.show(block=True)
    finally:
        # restore the caller's backend even when plotting failed
        if switched:
            plt.switch_backend(backend_saved)

radiobee/plot_df.py CHANGED Viewed

@@ -1,26 +1,37 @@
 """Plot pandas.DataFrame with DBSCAN clustering."""
 # pylint: disable=invalid-name, too-many-arguments
-# import numpy as np
 import pandas as pd
 import matplotlib.pyplot as plt
 import seaborn as sns
 from sklearn.cluster import DBSCAN
-from logzero import logger
 # turn interactive when in ipython session
 if "get_ipython" in globals():
     plt.ion()
 # fmt: off
 def plot_df(
         df_: pd.DataFrame,
-        min_samples: int = 6,
         eps: float = 10,
-        ylim: int = None,
-        xlabel: str = "en",
-        ylabel: str = "zh",
 ) -> plt:
     # fmt: on
     """Plot df with DBSCAN clustering.
@@ -41,60 +52,76 @@ def plot_df(
     DBSCAN(1.5, min_samples=3).fit(df_s).labels_
     """
-    df_ = pd.DataFrame(df_)
-    if df_.columns.__len__() < 3:
-        logger.error(
-            "expected 3 columns DataFram, got: %s, cant proceed, returninng None",
-            df_.columns.tolist(),
-        )
-        return None
-    # take first three columns
-    columns = df_.columns[:3]
-    df_ = df_[columns]
-    # rename columns to "x", "y", "cos"
-    df_.columns = ["x", "y", "cos"]
     sns.set()
     sns.set_style("darkgrid")
-    # fig, (ax0, ax1) = plt.subplots(2, figsize=(11.69, 8.27))
-    fig, (ax0, ax1) = plt.subplots(1, 2, figsize=(11.69, 8.27))
-    fig.suptitle("alignment projection")
-    _ = DBSCAN(min_samples=min_samples, eps=eps).fit(df_).labels_ > -1
-    _x = DBSCAN(min_samples=min_samples, eps=eps).fit(df_).labels_ < 0
-    # ax0.scatter(df_[_].x, df_[_].y, marker='o', c='g', alpha=0.5)
-    # ax0.grid()
-    # print("ratio: %.2f%%" % (100 * sum(_)/len(df_)))
-    df_.plot.scatter("x", "y", c="cos", cmap="viridis_r", ax=ax0)
-    # clustered
-    df_[_].plot.scatter("x", "y", c="cos", cmap="viridis_r", ax=ax1)
     # outliers
     df_[_x].plot.scatter("x", "y", c="r", marker="x", alpha=0.6, ax=ax0)
-    # ax0.set_xlabel("")
-    # ax0.set_ylabel("zh")
-    ax0.set_xlabel("")
-    ax0.set_ylabel(ylabel)
-    xlim = len(df_)
-    ax0.set_xlim(0, xlim)
-    if ylim:
-        ax0.set_ylim(0, ylim)
-    ax0.set_title("max similarity along columns (outliers denoted by 'x')")
     # ax1.set_xlabel("en")
     # ax1.set_ylabel("zh")
-    ax1.set_xlabel(xlabel)
-    ax1.set_ylabel(ylabel)
-    ax1.set_xlim(0, xlim)
-    if ylim:
-        ax1.set_ylim(0, ylim)
-    ax1.set_title(f"potential aligned pairs ({round(sum(_) / len(df_), 2):.0%})")
     return plt

 """Plot pandas.DataFrame with DBSCAN clustering."""
 # pylint: disable=invalid-name, too-many-arguments
+import numpy as np  # noqa
 import pandas as pd
+import matplotlib
 import matplotlib.pyplot as plt
 import seaborn as sns
 from sklearn.cluster import DBSCAN
+from logzero import logger  # noqa
+# from radiobee.cmat2tset import cmat2tset
 # turn interactive when in ipython session
+_ = """
 if "get_ipython" in globals():
     plt.ion()
+else:
+    plt.switch_backend('Agg')
+# """
+# fastlid.set_languages = ["en", "zh"]
 # fmt: off
 def plot_df(
         df_: pd.DataFrame,
+        # cmat: np.ndarray,
         eps: float = 10,
+        min_samples: int = 6,
+        xlabel: str = "",
+        ylabel: str = "",
+        xlim: int = 0,
+        ylim: int = 0,
+        backend: str = "TkAgg",
 ) -> plt:
     # fmt: on
     """Plot df with DBSCAN clustering.
     DBSCAN(1.5, min_samples=3).fit(df_s).labels_
     """
+    # df_ = pd.DataFrame(cmat2tset(cmat))
+    if df_.shape[1] == 3:
+        df_.columns = ["x", "y", "cos"]
+    else:
+        logger.error(" shape mismatch: %s, expected (x, 3)", df_.shape)
+        # return None
+        raise Exception(" df_.shape[1] not equal to 3 ")
+    if not xlim:
+        xlim = len(df_)
+    if not ylim:
+        ylim = df_.y.max()
+    if not xlabel:
+        xlabel = str(xlim)
+    if not ylabel:
+        ylabel = str(ylim)
+    backend_saved = matplotlib.get_backend()
+    # switch if necessary
+    if backend_saved != backend:
+        plt.switch_backend(backend)
     sns.set()
     sns.set_style("darkgrid")
+    fig = plt.figure(figsize=(13, 8))
+    # gs = fig.add_gridspec(2, 2, wspace=0.4, hspace=0.58)
+    # ax2 = fig.add_subplot(gs[0, 0])
+    # ax0 = fig.add_subplot(gs[0, 1])
+    # ax1 = fig.add_subplot(gs[1, 0])
+    gs = fig.add_gridspec(1, 1, wspace=0.4, hspace=0.58)
+    ax0 = fig.add_subplot(gs[0, 0])
+    cmap = "viridis_r"
+    _ = DBSCAN(min_samples=min_samples, eps=eps).fit(df_).labels_ > -1
+    _x = ~_
+    # clustered
+    df_[_].plot.scatter("x", "y", c="cos", cmap=cmap, ax=ax0)
     # outliers
     df_[_x].plot.scatter("x", "y", c="r", marker="x", alpha=0.6, ax=ax0)
     # ax1.set_xlabel("en")
     # ax1.set_ylabel("zh")
+    ax0.set_xlabel(xlabel)
+    ax0.set_ylabel(ylabel)
+    # ax0.set_xlim(0, xlim)
+    # ax0.set_ylim(0, ylim)
+    ax0.set_title("max cos ('x': outliers)")
+    # ax1.set_title(f"potential aligned pairs ({round(sum(_) / xlim, 2):.0%})")
+    # restore if necessary
+    if backend_saved != backend:
+        plt.switch_backend(backend_saved)
     return plt
+_ = """
+        eps: float = 10
+        min_samples: int = 6
+        xlabel: str = ""
+        ylabel: str = ""
+        xlim: int = 0
+        ylim: int = 0
+"""

radiobee/trim_df.py ADDED Viewed

	@@ -0,0 +1,30 @@

+"""Trim df."""
+import pandas as pd
+# fmt: off
def trim_df(
        df1: pd.DataFrame,
        len_: int = 4,
) -> pd.DataFrame:
    # fmt: on
    """Shorten a long frame to its first and last ``len_`` rows.

    Args:
        df1: frame to shorten; it is never modified.
        len_: number of rows kept from the top and from the bottom.

    Returns:
        ``df1`` itself when it has at most ``2 * len_`` rows. Otherwise a
        new frame made of the first ``len_`` rows, one row filled with
        ``"..."`` in every column, and the last ``len_`` rows, with a fresh
        0..n-1 index.

    Raises:
        ValueError: if ``len_`` is negative.
    """
    if len_ < 0:
        raise ValueError(f"len_ must be non-negative, got {len_}")

    n_rows = len(df1)
    if n_rows <= 2 * len_:
        return df1

    # one "..." cell per column, so frames of any width can be trimmed
    ellipsis_row = pd.DataFrame(
        [["..."] * df1.shape[1]],
        columns=df1.columns,
    )
    return pd.concat(
        [
            df1.iloc[:len_, :],
            ellipsis_row,
            # explicit start position: ``iloc[-0:]`` would select every row
            df1.iloc[n_rows - len_:, :],
        ],
        ignore_index=True,
    )

run-radiobee.bat CHANGED Viewed

@@ -1,3 +1,4 @@
 REM nodemon -V -w radiobee -x "sleep 3 && python -m radiobee"
 REM nodemon -V -w radiobee -x python -m radiobee
-nodemon -V -w radiobee -x py -3.8 -m radiobee

 REM nodemon -V -w radiobee -x "sleep 3 && python -m radiobee"
 REM nodemon -V -w radiobee -x python -m radiobee
+REM nodemon -V -w radiobee -x py -3.8 -m radiobee
+nodemon -V -w radiobee -x "run-p pyright flake8 && py -3.8 -m radiobee"