Spaces:

raynardj
/

duguwen-classical-chinese-to-morden-translate

Runtime error

duguwen-classical-chinese-to-morden-translate / app.py

raynardj

🪁 a decent version

e0a6c5a over 2 years ago

No virus

6.89 kB

	import streamlit as st
	import pandas as pd
	from pathlib import Path
	import requests
	import base64
	from requests.auth import HTTPBasicAuth
	import torch

	# Use Streamlit's "wide" page layout for the whole app.
	st.set_page_config(layout="wide")

	@st.cache(allow_output_mutation=True)
	def load_model():
	    """Load the pretrained tokenizer and encoder-decoder model.

	    Both are fetched with ``from_pretrained`` from the
	    ``raynardj/wenyanwen-ancient-translate-to-modern`` checkpoint. The
	    function is wrapped in ``st.cache`` with ``allow_output_mutation=True``.

	    Returns:
	        A ``(tokenizer, model)`` tuple.
	    """
	    # transformers is imported inside the function rather than at module level.
	    from transformers import AutoTokenizer, EncoderDecoderModel

	    checkpoint = "raynardj/wenyanwen-ancient-translate-to-modern"
	    loaded_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
	    loaded_model = EncoderDecoderModel.from_pretrained(checkpoint)
	    return loaded_tokenizer, loaded_model


	# Module-level tokenizer and model; inference() reads both by name.
	tokenizer, model = load_model()


	def inference(text):
	    """Translate one input string with the module-level tokenizer and model.

	    The text is tokenized as a single-item batch, truncated and padded to
	    168 tokens, then decoded with 3-beam search capped at 256 tokens while
	    gradient tracking is disabled.

	    Args:
	        text: The sentence to translate.

	    Returns:
	        The first decoded sequence, with special tokens skipped and every
	        space character removed.
	    """
	    encoded = tokenizer(
	        [text],
	        truncation=True,
	        max_length=168,
	        padding="max_length",
	        return_tensors='pt',
	    )

	    with torch.no_grad():
	        generated = model.generate(
	            encoded.input_ids,
	            attention_mask=encoded.attention_mask,
	            num_beams=3,
	            max_length=256,
	            # NOTE(review): 101 is presumably the [CLS] id of the
	            # tokenizer's vocabulary -- confirm against the checkpoint.
	            bos_token_id=101,
	            eos_token_id=tokenizer.sep_token_id,
	            pad_token_id=tokenizer.pad_token_id,
	        )
	        decoded = tokenizer.batch_decode(generated, skip_special_tokens=True)

	    # The decoder output has spaces between tokens; strip them all.
	    return decoded[0].replace(" ", "")


	@st.cache
	def get_file_df():
	    """Read the file catalogue ``meta.csv`` into a DataFrame.

	    Other code in this file reads its ``filepath``, ``obj_hash``, ``fsize``
	    and ``col_<n>`` columns. The function is wrapped in ``st.cache``.

	    Returns:
	        The DataFrame produced by ``pd.read_csv("meta.csv")``.
	    """
	    return pd.read_csv("meta.csv")


	# File catalogue read from meta.csv by get_file_df().
	file_df = get_file_df()

	# Sidebar header: app title plus links to the model, the training code and
	# the text source.
	st.sidebar.title("【隨無涯】")
	st.sidebar.markdown("""
	* 朕自庖[🤗 模型](https://huggingface.co/raynardj/wenyanwen-ancient-translate-to-modern), [⭐️ 訓習處](https://github.com/raynardj/yuan)
	* 📚 充棟汗牛，取自[殆知閣](http://www.daizhige.org/)，[github api](https://github.com/garychowcmu/daizhigev20)
	""")

	# c2 hosts the translation widgets (text area, button, result); c, placed
	# below the horizontal rule, is where file text is rendered.
	c2 = st.container()
	c2.write("The entirety of ancient Chinese literature, with a modern translator at your side.")
	st.markdown("""---""")
	c = st.container()

	# Credentials from Streamlit secrets; fetch_file() passes them as HTTP basic
	# auth to the GitHub API.
	USER_ID = st.secrets["USER_ID"]
	SECRET = st.secrets["SECRET"]


	@st.cache
	def get_maps():
	    """Build lookup tables from the module-level ``file_df`` catalogue.

	    Returns:
	        A tuple ``(file_obj_hash_map, file_size_map)``: two dicts keyed by
	        the ``filepath`` column, holding the matching ``obj_hash`` and
	        ``fsize`` values respectively.
	    """
	    def keyed_by_filepath(value_column):
	        # Each row of the two-column array becomes one (key, value) pair.
	        pairs = file_df[["filepath", value_column]].values
	        return dict(pairs)

	    hash_by_path = keyed_by_filepath("obj_hash")
	    size_by_path = keyed_by_filepath("fsize")
	    return hash_by_path, size_by_path


	# filepath -> obj_hash (read by fetch_file) and filepath -> fsize lookups.
	file_obj_hash_map, file_size_map = get_maps()


	def show_file_size(size: int) -> str:
	    """Format a byte count as a short human-readable string.

	    Args:
	        size: File size in bytes.

	    Returns:
	        ``"<n> B"`` for sizes below 1024, ``"<n> KB"`` for sizes below
	        1024 * 1024, and ``"<n> MB"`` otherwise. KB and MB values are
	        rounded down to whole units.
	    """
	    kib = 1024
	    mib = kib * kib
	    if size < kib:
	        return f"{size} B"
	    if size < mib:
	        return f"{size // kib} KB"
	    # Floor-divide by the whole MiB factor. The earlier ``size/1024//1024``
	    # passed through a float, so integer sizes rendered as e.g. "3.0 MB"
	    # while the KB branch rendered plain integers.
	    return f"{size // mib} MB"

	@st.cache(max_entries=100, allow_output_mutation=True)
	def fetch_file(path):
	    """Return the text of a catalogue file, preferring a local copy.

	    If ``data/<path>`` exists it is read from disk; otherwise the blob whose
	    hash is stored in ``file_obj_hash_map`` is downloaded from the GitHub
	    API and base64-decoded.

	    Args:
	        path: File path relative to ``data/``; also the key used to look up
	            the blob hash in ``file_obj_hash_map``.

	    Returns:
	        The file content as a string, or ``None`` when GitHub answers with
	        a non-200 status that ``raise_for_status`` does not treat as an
	        error.

	    Raises:
	        KeyError: If ``path`` has no local copy and is not in
	            ``file_obj_hash_map``.
	        requests.HTTPError: If GitHub answers with a 4xx/5xx status.
	        requests.Timeout: If GitHub does not respond within the timeout.
	    """
	    # Reading from the local path first.
	    local_path = Path("data") / path
	    if local_path.exists():
	        # Decode explicitly as UTF-8, matching the remote branch below,
	        # instead of relying on the platform's default encoding.
	        with open(local_path, "r", encoding="utf-8") as f:
	            return f.read()

	    # Read from the GitHub API.
	    obj_hash = file_obj_hash_map[path]
	    auth = HTTPBasicAuth(USER_ID, SECRET)
	    url = f"https://api.github.com/repos/garychowcmu/daizhigev20/git/blobs/{obj_hash}"
	    # Without a timeout an unresponsive server would block this call forever.
	    r = requests.get(url, auth=auth, timeout=30)
	    if r.status_code == 200:
	        data = r.json()
	        return base64.b64decode(data['content']).decode('utf-8')
	    r.raise_for_status()
	    return None


	@st.cache(allow_output_mutation=True, max_entries=100)
	def fetch_from_df(sub_paths: str = ""):
	    """List the distinct entries one level below a position in the catalogue.

	    Rows of the module-level ``file_df`` are narrowed level by level: the
	    ``idx``-th element of ``sub_paths`` must equal column ``col_<idx>``.

	    Args:
	        sub_paths: The path components chosen so far, iterated in order.
	            The signature annotates it as ``str`` with an empty-string
	            default, but the caller in this file passes a list of strings.

	    Returns:
	        A list of the unique values of column ``col_<len(sub_paths)>`` among
	        the matching rows, or ``None`` as soon as a component matches no
	        row.
	    """
	    sub_df = file_df
	    for idx, step in enumerate(sub_paths):
	        # Filter with a boolean mask instead of an interpolated
	        # ``DataFrame.query`` string: a component containing a quote would
	        # break the query expression. Masking also yields a new frame, so
	        # ``file_df`` is never modified and needs no defensive copy.
	        sub_df = sub_df[sub_df[f"col_{idx}"] == step]
	        if sub_df.empty:
	            return None
	    return list(sub_df[f"col_{len(sub_paths)}"].unique())

	def show_filepath(filepath: str):
	    """Fetch a file's text and render it in a scrollable ``<pre>`` box.

	    The text comes from ``fetch_file`` and is written into container ``c``
	    as raw HTML, capped at 300px height with vertical scrolling.

	    Args:
	        filepath: Catalogue path of the file to display.
	    """
	    import html

	    text = fetch_file(filepath)
	    # The markup is rendered with unsafe_allow_html=True, so the fetched
	    # text must be escaped: a literal "<" or "&" in the file would otherwise
	    # be parsed as HTML. fetch_file may return None; show that as empty.
	    safe_text = html.escape("" if text is None else text, quote=False)
	    c.markdown(
	        f"""<pre style='max-height:300px;overflow-y:auto'>{safe_text}</pre>""",
	        unsafe_allow_html=True)

	# The sidebar selectbox picks the mode: walk the category tree, or search
	# for a file by its path.
	if st.sidebar.selectbox(
	    label="何以尋跡 How to search",
	    options=["以類尋書 category", "書名求書 search"],
	) == "以類尋書 category":

	    st.session_state.translating = False

	    # The chosen path components are kept in session state across reruns.
	    if 'pathway' not in st.session_state:
	        st.session_state.pathway = []

	    path_text = st.sidebar.text("/".join(st.session_state.pathway))

	    def reset_path():
	        """Clear the chosen path and refresh the sidebar path display."""
	        st.session_state.pathway = []
	        # Join like every other update of path_text; passing the bare list
	        # would display its repr instead of a path string.
	        path_text.text("/".join(st.session_state.pathway))

	    if st.sidebar.button("還至初錄(back to root)"):
	        reset_path()

	    def display_tree():
	        """Add one sidebar selectbox per tree level until a file is shown.

	        Each call lists the entries below the current path, appends the
	        selected one to ``st.session_state.pathway`` and either recurses
	        (entry is not a ``.txt`` file) or displays the file and resets the
	        path. Files above the size limit are not loaded; a link to
	        daizhige.org is shown instead.
	        """
	        # NOTE(review): SOURCE had the literal 310241024 (~296 MiB) here,
	        # which contradicts the "too large" intent and reads like
	        # ``3*1024*1024`` with the asterisks lost in extraction. Restored
	        # to 3 MiB -- confirm against the repository.
	        max_file_size = 3 * 1024 * 1024

	        sublist = fetch_from_df(st.session_state.pathway)
	        dropdown = st.sidebar.selectbox("【擇書 choose】", options=sublist)
	        with st.spinner("書非借不能讀也..."):
	            st.session_state.pathway.append(dropdown)
	            if dropdown.endswith('.txt'):
	                filepath = "/".join(st.session_state.pathway)
	                file_size = file_size_map[filepath]
	                with st.spinner(f"Load 載文:{filepath},({show_file_size(file_size)})"):
	                    # If the file is too large, do not load it.
	                    if file_size > max_file_size:
	                        urlpath = filepath.replace(".txt", ".html")
	                        dzg = f"http://www.daizhige.org/{urlpath}"
	                        st.markdown(f"File too big 其文碩而難載，不能為之，[往殆知閣]({dzg}), 或擇他書")
	                        reset_path()
	                        return None
	                    path_text.text(filepath)
	                    # Same fetch-and-render step the search mode uses.
	                    show_filepath(filepath)
	                    reset_path()
	            else:
	                # Not a file yet: show the path so far and go one level deeper.
	                path_text.text("/".join(st.session_state.pathway))
	                display_tree()

	    if not st.session_state.translating:
	        display_tree()
	else:
	    def search_kw():
	        """Show the first 15 files whose path contains the stored keyword."""
	        # Match the keyword literally (regex=False) so characters such as
	        # "(" or "[" typed by the user cannot raise a regex error, and
	        # treat missing paths as non-matches (na=False).
	        matches = file_df.filepath.str.contains(
	            st.session_state.kw, regex=False, na=False)
	        result = file_df[matches].reset_index(drop=True)
	        if result.empty:
	            st.sidebar.write(f"尋之不得:{st.session_state.kw}")
	        else:
	            filepath = st.sidebar.selectbox("選一書名", options=list(result.head(15).filepath))
	            show_filepath(filepath)

	    def loading_with_search():
	        """Read the keyword from the sidebar, store it and run the search."""
	        kw = st.sidebar.text_input("書名求書 Search", value="楞伽经")
	        st.session_state.kw = kw
	        search_kw()

	    if not st.session_state.translating:
	        loading_with_search()


	def translate_text():
	    """Render the translate button and, when pressed, show the result.

	    Reads the module-level ``cc`` text. On a button press it writes into
	    container ``c2`` either a prompt (empty input), a length warning (more
	    than 168 characters) or the output of ``inference``.
	    ``st.session_state.translating`` is set to ``True`` on entry and back
	    to ``False`` before returning.
	    """
	    st.session_state.translating = True

	    pressed = c2.button("【曉文達義 Translate】")
	    if pressed and not cc:
	        c2.write("【入難曉之文字 Please input sentence for translating】")
	    elif pressed and len(cc) > 168:
	        c2.write("句甚長不得過百又六十八字 Sentence too long, should be less than 168 characters")
	    elif pressed:
	        translated = inference(cc)
	        c2.markdown(f"""```{translated}```""")

	    st.session_state.translating = False

	# Input box for the sentence to translate; translate_text() reads `cc` as a
	# module-level name, so it must be assigned before the call.
	cc = c2.text_area("【入難曉之文字 Input sentence】", height=150)
	translate_text()