freemt
Update aligned pairs plot
dab2de2
raw
history blame
11 kB
"""Run interactively."""
from typing import Tuple, Optional
import joblib
from random import randint
from textwrap import dedent
from itertools import zip_longest
from sklearn.cluster import DBSCAN
from socket import socket, AF_INET, SOCK_STREAM
import signal
from varname import nameof
from logzero import logger
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# from tabulate import tabulate
from fastlid import fastlid
import gradio as gr
from radiobee.process_upload import process_upload
from radiobee.files2df import files2df
from radiobee.file2text import file2text
from radiobee.lists2cmat import lists2cmat
from radiobee.plot_df import plot_df
from radiobee.cmat2tset import cmat2tset
# Initialise seaborn styling for every plot made by this script, then select
# the "darkgrid" style.
sns.set()
sns.set_style("darkgrid")
# Configure fastlid with English and Chinese (presumably this restricts the
# candidate languages for detection -- confirm against the fastlid docs).
fastlid.set_languages = ["en", "zh"]
# Reinstall the OS default SIGINT action so Ctrl+C ends the process instead of
# being handled by whatever handler is currently installed.
signal.signal(signal.SIGINT, signal.SIG_DFL)
print("Press Ctrl+C to quit\n")
def savelzma(obj, fileloc: Optional[str] = None):
    """Dump *obj* to ``data/<fileloc>.lzma`` with joblib.

    Args:
        obj: The object to serialize.
        fileloc: Base name of the target file (no directory, no extension).
            When omitted, the name is derived with ``nameof(obj)``. That call
            sees this function's own parameter, so the derived name is always
            ``"obj"`` rather than the caller's variable name; pass ``fileloc``
            explicitly to get a meaningful file name.
    """
    import os

    if fileloc is None:
        # Resolves to "obj" (this function's parameter), not the caller's name.
        fileloc = nameof(obj)

    path = f"data/{fileloc}.lzma"
    # The dump would otherwise fail when the target directory is missing.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    joblib.dump(obj, path)
def greet(input):
    """Return a canned greeting that echoes *input* back to the caller."""
    reply = f"'Sup yo! (your input: {input})"
    return reply
def upfile1(file1, file2=None) -> Tuple[str, str]:
    """Return the uploaded file's name and a greeting that echoes that name.

    Args:
        file1: Uploaded file object; only its ``name`` attribute is read.
        file2: Accepted for interface compatibility; not used.

    Returns:
        A ``(name, greeting)`` tuple.
    """
    name = file1.name
    # Echo the file name: this function has no ``input`` parameter, so a bare
    # ``input`` here would refer to the builtin function.
    return name, f"'Sup yo! (your input: {name})"
def _stripped_nonblank_lines(text):
    """Return the lines of *text* with whitespace stripped, dropping blank ones."""
    stripped = (line.strip() for line in text.splitlines())
    return [line for line in stripped if line]


def process_2upoads(file1, file2):
    """Load two uploads and pair their non-blank lines side by side.

    Each upload is converted to text with ``process_upload``; its lines are
    stripped and blank lines are dropped. Lines are then paired by position,
    and the shorter side is padded with empty strings.

    Args:
        file1: First upload, passed to ``process_upload``.
        file2: Second upload, passed to ``process_upload``.

    Returns:
        A DataFrame with columns ``text1`` and ``text2``. When neither upload
        contains a non-blank line the frame is empty but still has both
        columns.
    """
    lines1 = _stripped_nonblank_lines(process_upload(file1))
    lines2 = _stripped_nonblank_lines(process_upload(file2))

    # Build the frame from rows so two empty inputs produce an empty frame
    # rather than failing while unpacking an empty transpose.
    rows = list(zip_longest(lines1, lines2, fillvalue=""))
    return pd.DataFrame(rows, columns=["text1", "text2"])
# Script entry point: the statements from here on configure the gradio demo.
if __name__ == "__main__":
# An earlier configuration, parked inside a throwaway string literal so that
# it is not executed.
_ = """
fn = process_2upoads
inputs = ["file", "file"]
examples = [
["data/test_zh.txt", "data/test_en.txt"],
["data/test_en.txt", "data/test_zh.txt"],
]
outputs = ["dataframe"]
# """
# Set the logzero log level to 10 (the numeric value of logging.DEBUG) and
# emit one message at debug and one at info level.
import logzero
logzero.loglevel(10)
logger.debug(" debug ")
logger.info(" info ")
# _ = """
# `inputs` is assigned three times in a row; only the last assignment (the two
# File components) is still bound afterwards. The Textbox component is
# nevertheless constructed.
inputs = [
gr.inputs.Textbox(
# placeholder="Input something here",
default="test text")
]
inputs = ["file", "file"]
inputs = [
gr.inputs.File(label="file 1"),
# gr.inputs.File(file_count="multiple", label="file 2", optional=True),
gr.inputs.File(label="file 2", optional=True),
]
# Example file pairs; each pair is listed in both orders.
examples = [
["data/test_zh.txt", "data/test_en.txt"],
["data/test_en.txt", "data/test_zh.txt"],
["data/shakespeare_zh500.txt", "data/shakespeare_en500.txt"],
["data/shakespeare_en500.txt", "data/shakespeare_zh500.txt"],
]
# `outputs` is reassigned several times; the value that survives is the list
# built after `out1` below.
outputs = ["dataframe", "plot"]
outputs = ["plot"]
outputs = ["dataframe", "plot"]
# Dataframe output component labelled "To be aligned", configured with
# max_rows=12 and paginated row overflow.
out1 = gr.outputs.Dataframe(
headers=None,
max_rows=12, # 20
max_cols=None,
overflow_row_behaviour='paginate',
type='auto',
label="To be aligned",
)
# Final outputs: the dataframe component above followed by a plot.
outputs = [
out1,
"plot",
]
# outputs = ["dataframe", "plot", "plot"] # wont work
# outputs = ["dataframe"]
# outputs = ["dataframe", "dataframe", ]
def fn(file1, file2):
    """Align two uploaded texts and plot the similarity-based projection.

    When either argument is a plain ``str`` the aligner is bypassed and a
    small canned data set is plotted instead.

    Args:
        file1: First upload: an object exposing ``name`` that ``file2text``
            and ``files2df`` accept, or a ``str``.
        file2: Second upload, same conventions as ``file1``.

    Returns:
        A ``(preview, plt)`` tuple. ``preview`` is a DataFrame made of the
        first four rows and the last four rows of the working frame,
        separated by one row of ``"..."`` markers. ``plt`` is the
        ``matplotlib.pyplot`` module holding the figure that was drawn.
    """
    logger.debug(" *debug* ")
    # str inputs have no ``name`` attribute; log the value itself in that case
    # instead of failing before the bypass check below is reached.
    logger.info(
        "file1.name: *%s*, file2.name: *%s*",
        getattr(file1, "name", file1),
        getattr(file2, "name", file2),
    )

    if isinstance(file1, str) or isinstance(file2, str):
        # bypass if file1 or file2 is str input: plot canned data
        fig, ax1 = plt.subplots()
        df1 = pd.DataFrame(
            [
                [5.1, 3.5, 0],
                [4.9, 3.0, 0],
                [7.0, 3.2, 1],
                [6.4, 3.2, 1],
                [5.9, 3.0, 2],
            ],
            columns=['length', 'width', 'species'],
        )
        df1.plot.scatter(x='length', y='width', c='DarkBlue', ax=ax1)
    else:
        text1 = file2text(file1)
        text2 = file2text(file2)
        lang1, _ = fastlid(text1)
        lang2, _ = fastlid(text2)

        df1 = files2df(file1, file2)
        lst1 = [elm for elm in df1.text1 if elm]
        lst2 = [elm for elm in df1.text2 if elm]
        len1 = len(lst1)
        len2 = len(lst2)

        # Snapshot the intermediates under data/ for offline inspection.
        # Explicit names are used because a name lookup on a loop variable
        # would not yield the original variable names.
        snapshots = {
            "text1": text1,
            "text2": text2,
            "df1": df1,
            "lst1": lst1,
            "lst2": lst2,
        }
        for stem, obj in snapshots.items():
            joblib.dump(obj, f"data/{stem}.lzma")

        cmat = lists2cmat(lst1, lst2)
        tset = pd.DataFrame(cmat2tset(cmat))
        tset.columns = ["x", "y", "cos"]

        stages = (("lst1", lst1), ("lst2", lst2), ("cmat", cmat), ("tset", tset))
        for label, value in stages:
            print(f"{label}: {value}")
        for label, value in stages:
            logger.debug("%s: %s", label, value)

        # DBSCAN parameters used to separate clustered points from outliers
        min_samples = 6
        eps = 10

        sns.set()
        sns.set_style("darkgrid")

        # 2x2 grid: heatmap (top-left), all points with outliers marked
        # (top-right), clustered points only (bottom-left); the bottom-right
        # axes is left empty.
        fig, ([ax2, ax0], [ax1, _spare_ax]) = plt.subplots(
            2, 2, figsize=(11.69, 8.27)
        )
        fig.subplots_adjust(hspace=.4)

        sns.heatmap(cmat, cmap="viridis_r", ax=ax2).invert_yaxis()
        ax2.set_xlabel(lang1)
        ax2.set_ylabel(lang2)
        ax2.set_title("cos similarity heatmap")

        fig.suptitle("alignment projection")

        # Fit once on all three columns (x, y, cos); a label of -1 marks a
        # point that belongs to no cluster, so the two masks are complements.
        labels = DBSCAN(min_samples=min_samples, eps=eps).fit(tset).labels_
        clustered = labels > -1
        outliers = ~clustered

        tset.plot.scatter("x", "y", c="cos", cmap="viridis_r", ax=ax0)
        # clustered
        tset[clustered].plot.scatter("x", "y", c="cos", cmap="viridis_r", ax=ax1)
        # outliers
        tset[outliers].plot.scatter(
            "x", "y", c="r", marker="x", alpha=0.6, ax=ax0
        )

        ax0.set_xlabel(lang1)
        ax0.set_ylabel(lang2)
        ax0.set_xlim(0, len1)
        ax0.set_ylim(0, len2)
        ax0.set_title("max similarity along columns ('x': outliers)")

        ax1.set_xlabel(lang1)
        ax1.set_ylabel(lang2)
        ax1.set_xlim(0, len1)
        ax1.set_ylim(0, len2)
        # Share of clustered points relative to the non-empty lines of text 1;
        # guard against an empty first text.
        ratio = int(clustered.sum()) / len1 if len1 else 0.0
        ax1.set_title(f"potential aligned pairs ({round(ratio, 2):.0%})")

    # Head, a row of "..." markers, and tail. The marker row is sized to the
    # frame so it fits whatever number of columns the frame has.
    ellipsis_row = pd.DataFrame(
        [["..."] * len(df1.columns)], columns=df1.columns
    )
    preview = pd.concat(
        [df1.iloc[:4, :], ellipsis_row, df1.iloc[-4:, :]],
        ignore_index=True,
    )
    return preview, plt
# return _, plt
# """
# Pick a port for the server: start at 7860 and, while something is already
# listening on the candidate port, jump ahead by a random offset.
server_port = 7860
# try numb times before giving up
numb = 5
for _ in range(numb):
    # Use a fresh socket for every probe: once a socket has connected (port
    # busy), a further connect attempt on it reports "already connected"
    # rather than the state of the new port.
    with socket(AF_INET, SOCK_STREAM) as sock:
        sock.settimeout(0.01)  # 10ms
        port_busy = sock.connect_ex(("127.0.0.1", server_port)) == 0
    if not port_busy:  # port idle
        break
    # Offset of at least 1 so the next probe never repeats the busy port.
    server_port = server_port + randint(1, 50)
else:
    raise SystemExit(
        f"Tried {numb} times to no avail, giving up..."
    )
# Text passed as the interface's `article`; dedent removes the common leading
# whitespace of the literal.
article = dedent("""
Click "Clear" first for subsequent submits
""")
# CSS overrides for the image and file widgets; both rules are joined into a
# single string when passed to gr.Interface below.
css = ".output_image, .input_image {height: 40rem !important; width: 100% !important;}"
css_file = ".input_file, .output_file {height: 9rem !important; width: 100% !important;}"
logger.info("running at port %s", server_port)
# Build the interface around `fn` with the inputs, outputs and examples
# configured earlier in this script.
# NOTE(review): allow_flagging is False while flagging_options is supplied;
# the options presumably have no effect -- confirm.
iface = gr.Interface(
# fn=greet,
# inputs="text",
# fn=process_upload,
# fn=process_2upoads,
# inputs=["file", "file"],
# outputs="text",
# outputs="html",
fn=fn,
inputs=inputs,
outputs=outputs,
title="radiobee-aligner🔠",
description="showcasing a blazing fast dualtext aligner, currrently supported language pairs: en-zh/zh-en",
article=article,
examples=examples,
# theme="darkgrass",
layout="vertical", # horizontal unaligned
height=150, # 500
width=900, # 900
allow_flagging=False,
flagging_options=["fatal", "bug", "brainstorm", "excelsior", "paragon"],
css=f"{css} {css_file}",
)
# Start the server on the port selected earlier, listening on all interfaces
# ("0.0.0.0"), with sharing, debug mode and the request queue switched on.
iface.launch(
# share=False,
share=True,
debug=True,
server_name="0.0.0.0",
server_port=server_port,
# show_tips=True,
enable_queue=True,
)
# Scratch notes (plot snippets, a port-in-use helper, CSS variants, a dynamic
# import recipe) kept in a throwaway string literal; none of it is executed.
_ = """
ax = sns.heatmap(cmat, cmap="viridis_r")
ax.invert_yaxis()
ax.set_xlabel(fastlid(df.text1)[0])
ax.set_xlabel(fastlid(df.text2)[0])
# return df, plt
return plt.gca()
https://colab.research.google.com/drive/1Gz9624VeAQLT7wlETgjOjPVURzQckXI0#scrollTo=qibtTvwecgsL colab gradio-file-inputs-upload.ipynb
iface = gr.Interface(plot_text, "file", "image")
def is_port_in_use(port):
import socket
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
return s.connect_ex(('localhost', port)) == 0
socket.socket(socket.AF_INET, socket.SOCK_STREAM).connect_ex(('127.0.0.1', 7911))
---
css https://huggingface.co/spaces/nielsr/LayoutLMv2-FUNSD/blob/main/app.py#L83
css = ".output_image, .input_image {height: 40rem !important; width: 100% !important;}"
#css = "@media screen and (max-width: 600px) { .output_image, .input_image {height:20rem !important; width: 100% !important;} }"
# css = ".output_image, .input_image {height: 600px !important}"
mod = 'en2zh'
packname = packx.__name__
globals()[mod] = getattr(importlib.import_module(f"{packname}.{mod}"), mod)
"""