Spaces:

mikeee
/

radiobee-aligner

Build error

radiobee-aligner / radiobee /__main__.py

freemt

Update aligned pairs plot

dab2de2 about 3 years ago

11 kB

	"""Run interactively."""
	from typing import Tuple, Optional

	import joblib
	from random import randint
	from textwrap import dedent
	from itertools import zip_longest
	from sklearn.cluster import DBSCAN

	from socket import socket, AF_INET, SOCK_STREAM
	import signal
	from varname import nameof
	from logzero import logger

	import numpy as np
	import pandas as pd
	import seaborn as sns
	import matplotlib.pyplot as plt

	# from tabulate import tabulate
	from fastlid import fastlid

	import gradio as gr
	from radiobee.process_upload import process_upload
	from radiobee.files2df import files2df
	from radiobee.file2text import file2text
	from radiobee.lists2cmat import lists2cmat
	from radiobee.plot_df import plot_df
	from radiobee.cmat2tset import cmat2tset

	# Apply seaborn's default theme, then select the "darkgrid" style for plots.
	sns.set()
	sns.set_style("darkgrid")
	# NOTE(review): presumably limits fastlid's candidate languages to English
	# and Chinese -- confirm against the fastlid documentation.
	fastlid.set_languages = ["en", "zh"]

	# Install the default SIGINT action so that Ctrl+C terminates the process.
	signal.signal(signal.SIGINT, signal.SIG_DFL)
	print("Press Ctrl+C to quit\n")

	def savelzma(obj, fileloc: str = None):
	    """Dump ``obj`` with joblib to ``data/<fileloc>.lzma``.

	    Args:
	        obj: object handed to ``joblib.dump``.
	        fileloc: file stem to use under ``data/``. When omitted, the stem
	            comes from ``nameof(obj)``, which is evaluated here on the
	            local parameter rather than on the caller's variable (the
	            original author's note: "this wont work").
	    """
	    stem = nameof(obj) if fileloc is None else fileloc
	    joblib.dump(obj, f"data/{stem}.lzma")


	def greet(input):
	    """Return a greeting that echoes ``input`` back to the user."""
	    template = "'Sup yo! (your input: {})"
	    return template.format(input)


	def upfile1(file1, file2=None) -> Tuple[str, str]:
	    """Report the uploaded file1 name together with a greeting about file2.

	    The original message interpolated the builtin ``input`` function
	    object (a leftover from ``greet``) and ignored ``file2``; the message
	    now reports ``file2`` instead.

	    Args:
	        file1: uploaded file object exposing a ``name`` attribute.
	        file2: optional second upload; its ``name`` is reported when it
	            has one, otherwise the value itself (e.g. ``None``).

	    Returns:
	        ``(file1.name, greeting)``.
	    """
	    second = getattr(file2, "name", file2)
	    return file1.name, f"'Sup yo! (your input: {second})"

	def process_2upoads(file1, file2):
	    """Put the non-blank lines of two uploads side by side in a dataframe.

	    Each upload is converted to text with ``process_upload``, split into
	    lines, stripped, and blank lines are dropped. The shorter list is
	    padded with empty strings so both columns have equal length.

	    Args:
	        file1: first upload, passed to ``process_upload``.
	        file2: second upload, passed to ``process_upload``.

	    Returns:
	        A ``pandas.DataFrame`` with columns ``text1`` and ``text2``; it is
	        empty (but still has both columns) when neither upload contains a
	        non-blank line.
	    """
	    lines1 = [s for s in map(str.strip, process_upload(file1).splitlines()) if s]
	    lines2 = [s for s in map(str.strip, process_upload(file2).splitlines()) if s]

	    # Build from row pairs rather than ``zip(*zip_longest(...))``: the
	    # latter yields nothing to unpack when both lists are empty.
	    rows = list(zip_longest(lines1, lines2, fillvalue=""))

	    return pd.DataFrame(rows, columns=["text1", "text2"])


	if __name__ == "__main__":
	    # Interactive entry point: build the gradio interface around ``fn``,
	    # pick an idle port, and launch the server.
	    import logzero

	    logzero.loglevel(10)  # 10 is the numeric DEBUG level
	    logger.debug(" debug ")
	    logger.info(" info ")

	    inputs = [
	        gr.inputs.File(label="file 1"),
	        gr.inputs.File(label="file 2", optional=True),
	    ]
	    examples = [
	        ["data/test_zh.txt", "data/test_en.txt"],
	        ["data/test_en.txt", "data/test_zh.txt"],
	        ["data/shakespeare_zh500.txt", "data/shakespeare_en500.txt"],
	        ["data/shakespeare_en500.txt", "data/shakespeare_zh500.txt"],
	    ]
	    out1 = gr.outputs.Dataframe(
	        headers=None,
	        max_rows=12,  # 20
	        max_cols=None,
	        overflow_row_behaviour='paginate',
	        type='auto',
	        label="To be aligned",
	    )
	    # NOTE: per the original author's remark, ["dataframe", "plot", "plot"]
	    # "wont work", hence a single plot output.
	    outputs = [
	        out1,
	        "plot",
	    ]

	    def fn(file1, file2):
	        """Align two uploads and plot the alignment projection.

	        Args:
	            file1: first upload (an object exposing ``.name``) or a str.
	            file2: second upload, same kinds as ``file1``.
	                NOTE(review): the input is declared ``optional=True``;
	                whether ``file2text``/``files2df`` accept ``None`` is not
	                visible from here -- confirm.

	        Returns:
	            ``(preview, plt)``: an abbreviated copy of the paired-text
	            dataframe (first and last four rows around an ellipsis row
	            when it has more than eight rows) and the ``pyplot`` module
	            whose current figure holds the plots.
	        """
	        logger.debug(" debug ")

	        # A str (or None) input has no ``.name``; fall back to the value
	        # itself so logging cannot raise before the str bypass is reached.
	        logger.info(
	            "file1.name: %s, file2.name: %s",
	            getattr(file1, "name", file1),
	            getattr(file2, "name", file2),
	        )

	        if isinstance(file1, str) or isinstance(file2, str):
	            # bypass if file1 or file2 is str input: show a canned scatter
	            _fig, ax = plt.subplots()
	            df1 = pd.DataFrame(
	                [
	                    [5.1, 3.5, 0],
	                    [4.9, 3.0, 0],
	                    [7.0, 3.2, 1],
	                    [6.4, 3.2, 1],
	                    [5.9, 3.0, 2],
	                ],
	                columns=['length', 'width', 'species'],
	            )
	            df1.plot.scatter(x='length', y='width', c='DarkBlue', ax=ax)
	        else:
	            text1 = file2text(file1)
	            text2 = file2text(file2)
	            xlabel, _ = fastlid(text1)
	            ylabel, _ = fastlid(text2)

	            df1 = files2df(file1, file2)

	            lst1 = [elm for elm in df1.text1 if elm]
	            lst2 = [elm for elm in df1.text2 if elm]
	            len1 = len(lst1)
	            len2 = len(lst2)

	            # Snapshot intermediate objects for offline debugging; file
	            # stems are spelled out because ``nameof`` cannot recover
	            # them inside a loop.
	            snapshots = {
	                "text1": text1,
	                "text2": text2,
	                "df1": df1,
	                "lst1": lst1,
	                "lst2": lst2,
	            }
	            for stem, obj in snapshots.items():
	                joblib.dump(obj, f"data/{stem}.lzma")

	            cmat = lists2cmat(lst1, lst2)

	            tset = pd.DataFrame(cmat2tset(cmat))
	            tset.columns = ["x", "y", "cos"]

	            logger.debug("lst1: %s", lst1)
	            logger.debug("lst2: %s", lst2)
	            logger.debug("cmat: %s", cmat)
	            logger.debug("tset: %s", tset)

	            min_samples = 6
	            eps = 10

	            sns.set()
	            sns.set_style("darkgrid")

	            # 2x2 grid: heatmap (top-left), all points plus outliers
	            # (top-right), clustered points (bottom-left); the
	            # bottom-right axes is left unused.
	            fig, ([ax2, ax0], [ax1, _ax3]) = plt.subplots(
	                2, 2, figsize=(11.69, 8.27)
	            )
	            fig.subplots_adjust(hspace=.4)

	            sns.heatmap(cmat, cmap="viridis_r", ax=ax2).invert_yaxis()
	            ax2.set_xlabel(xlabel)
	            ax2.set_ylabel(ylabel)
	            ax2.set_title("cos similarity heatmap")

	            fig.suptitle("alignment projection")

	            # Fit once; DBSCAN labels noise points -1.
	            labels = DBSCAN(min_samples=min_samples, eps=eps).fit(tset).labels_
	            clustered = labels > -1
	            outliers = labels < 0

	            tset.plot.scatter("x", "y", c="cos", cmap="viridis_r", ax=ax0)

	            # clustered
	            tset[clustered].plot.scatter(
	                "x", "y", c="cos", cmap="viridis_r", ax=ax1
	            )

	            # outliers
	            tset[outliers].plot.scatter(
	                "x", "y", c="r", marker="x", alpha=0.6, ax=ax0
	            )

	            ax0.set_xlabel(xlabel)
	            ax0.set_ylabel(ylabel)
	            ax0.set_xlim(0, len1)
	            ax0.set_ylim(0, len2)
	            ax0.set_title("max similarity along columns ('x': outliers)")

	            ax1.set_xlabel(xlabel)
	            ax1.set_ylabel(ylabel)
	            ax1.set_xlim(0, len1)
	            ax1.set_ylim(0, len2)
	            # Guard against an empty first text to avoid dividing by zero.
	            ratio = clustered.sum() / len1 if len1 else 0
	            ax1.set_title(f"potential aligned pairs ({round(ratio, 2):.0%})")

	        # Abbreviate long frames to head + "..." + tail. The filler row is
	        # sized from the frame so it fits any column count, and short
	        # frames are returned whole so no row is shown twice.
	        head = tail = 4
	        if len(df1) > head + tail:
	            filler = pd.DataFrame(
	                [["..."] * df1.shape[1]], columns=df1.columns
	            )
	            preview = pd.concat(
	                [df1.iloc[:head, :], filler, df1.iloc[-tail:, :]],
	                ignore_index=True,
	            )
	        else:
	            preview = df1

	        return preview, plt

	    def _port_in_use(port: int) -> bool:
	        """Return True when something accepts connections on 127.0.0.1:port."""
	        # A fresh socket per probe: a socket that has already connected
	        # cannot be reused for another connect_ex.
	        with socket(AF_INET, SOCK_STREAM) as sock:
	            sock.settimeout(0.01)  # 10ms
	            return sock.connect_ex(("127.0.0.1", port)) == 0

	    server_port = 7860

	    # try numb times before giving up
	    numb = 5
	    for _ in range(numb):
	        if not _port_in_use(server_port):  # port idle
	            break
	        # step by at least 1 so a retry never probes the same port again
	        server_port = server_port + randint(1, 50)
	    else:
	        raise SystemExit(
	            f"Tried {numb} times to no avail, giving up..."
	        )

	    article = dedent("""
	        Click "Clear" first for subsequent submits
	    """)
	    css = ".output_image, .input_image {height: 40rem !important; width: 100% !important;}"
	    css_file = ".input_file, .output_file {height: 9rem !important; width: 100% !important;}"

	    logger.info("running at port %s", server_port)

	    iface = gr.Interface(
	        fn=fn,
	        inputs=inputs,
	        outputs=outputs,
	        title="radiobee-aligner🔠",
	        description="showcasing a blazing fast dualtext aligner, currrently supported language pairs: en-zh/zh-en",
	        article=article,
	        examples=examples,
	        # theme="darkgrass",
	        layout="vertical",  # horizontal unaligned
	        height=150,  # 500
	        width=900,  # 900
	        allow_flagging=False,
	        flagging_options=["fatal", "bug", "brainstorm", "excelsior", "paragon"],
	        css=f"{css} {css_file}",
	    )

	    iface.launch(
	        # share=False,
	        share=True,
	        debug=True,
	        server_name="0.0.0.0",
	        server_port=server_port,
	        # show_tips=True,
	        enable_queue=True,
	    )

	# Scratch notes and disabled snippets kept in a throwaway string bound to
	# ``_``; nothing inside the literal is executed.
	_ = """

	ax = sns.heatmap(cmat, cmap="viridis_r")

	ax.invert_yaxis()
	ax.set_xlabel(fastlid(df.text1)[0])
	ax.set_xlabel(fastlid(df.text2)[0])

	# return df, plt
	return plt.gca()

	https://colab.research.google.com/drive/1Gz9624VeAQLT7wlETgjOjPVURzQckXI0#scrollTo=qibtTvwecgsL colab gradio-file-inputs-upload.ipynb
	iface = gr.Interface(plot_text, "file", "image")

	def is_port_in_use(port):
	import socket
	with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
	return s.connect_ex(('localhost', port)) == 0

	socket.socket(socket.AF_INET, socket.SOCK_STREAM).connect_ex(('127.0.0.1', 7911))

	---
	css https://huggingface.co/spaces/nielsr/LayoutLMv2-FUNSD/blob/main/app.py#L83

	css = ".output_image, .input_image {height: 40rem !important; width: 100% !important;}"
	#css = "@media screen and (max-width: 600px) { .output_image, .input_image {height:20rem !important; width: 100% !important;} }"
	# css = ".output_image, .input_image {height: 600px !important}"

	mod = 'en2zh'
	packname = packx.__name__

	globals()[mod] = getattr(importlib.import_module(f"{packname}.{mod}"), mod)

	"""