import streamlit as st
import pandas as pd
from pathlib import Path
import requests
import base64
from requests.auth import HTTPBasicAuth
import torch
st.set_page_config(layout="wide")
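# Load the transformers EncoderDecoderModel translator (classical -> modern Chinese)
# once and keep it cached across Streamlit reruns.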
@st.cache(allow_output_mutation=True)
def load_model():
    from transformers import (
        EncoderDecoderModel,
        AutoTokenizer
    )
    PRETRAINED = "raynardj/wenyanwen-ancient-translate-to-modern"
    tokenizer = AutoTokenizer.from_pretrained(PRETRAINED)
    model = EncoderDecoderModel.from_pretrained(PRETRAINED)
    return tokenizer, model
tokenizer, model = load_model()
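# inference(): translate one classical Chinese passage into modern Chinese.
# The input is truncated/padded to 168 tokens, decoding uses 3-beam search up to
# 256 tokens, and the spaces the tokenizer inserts between characters are stripped.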
def inference(text):
print(f"from: {text}")
tk_kwargs = dict(
truncation=True,
max_length=168,
padding="max_length",
return_tensors='pt')
inputs = tokenizer([text, ], **tk_kwargs)
with torch.no_grad():
new = tokenizer.batch_decode(
model.generate(
inputs.input_ids,
attention_mask=inputs.attention_mask,
num_beams=3,
max_length=256,
bos_token_id=101,
eos_token_id=tokenizer.sep_token_id,
pad_token_id=tokenizer.pad_token_id,
), skip_special_tokens=True)[0].replace(" ", "")
print(f"to: {new}")
return new
@st.cache
def get_file_df():
    file_df = pd.read_csv("meta.csv")
    return file_df
file_df = get_file_df()
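# Sidebar intro (classical Chinese): the translator model was trained by the author
# (links to the 🤗 model card and the training repo), and the vast text collection is
# taken from Daizhige (殆知閣) via its github mirror / the github api.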
st.sidebar.title("【隨無涯】")
st.sidebar.markdown("""
* 朕自庖[🤗 模型](https://huggingface.co/raynardj/wenyanwen-ancient-translate-to-modern), [⭐️ 訓習處](https://github.com/raynardj/yuan)
* 📚 充棟汗牛,取自[殆知閣](http://www.daizhige.org/),[github api](https://github.com/garychowcmu/daizhigev20)
""")
c2 = st.container()
c2.write("The entirety of ancient Chinese literature, with a modern translator at your side.")
st.markdown("""---""")
c = st.container()
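# GitHub credentials come from Streamlit secrets (e.g. .streamlit/secrets.toml or the
# hosting platform's secrets settings) and authenticate the API calls in fetch_file().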
USER_ID = st.secrets["USER_ID"]
SECRET = st.secrets["SECRET"]
@st.cache
def get_maps():
    file_obj_hash_map = dict(file_df[["filepath", "obj_hash"]].values)
    file_size_map = dict(file_df[["filepath", "fsize"]].values)
    return file_obj_hash_map, file_size_map
file_obj_hash_map, file_size_map = get_maps()
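# show_file_size(): format a byte count as a human-readable B / KB / MB string
# for the loading spinner.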
def show_file_size(size: int):
    if size < 1024:
        return f"{size} B"
    elif size < 1024*1024:
        return f"{size//1024} KB"
    else:
        return f"{size//(1024*1024)} MB"
@st.cache(max_entries=100, allow_output_mutation=True)
def fetch_file(path):
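    # Return the full text of one file, keyed by its repo-relative path.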
    # read from the local data/ copy first, if present
    if (Path("data")/path).exists():
        with open(Path("data")/path, "r", encoding="utf-8") as f:
            return f.read()
    # otherwise fetch the git blob by its object hash from the github api
    obj_hash = file_obj_hash_map[path]
    auth = HTTPBasicAuth(USER_ID, SECRET)
    url = f"https://api.github.com/repos/garychowcmu/daizhigev20/git/blobs/{obj_hash}"
    print(f"requesting {url}")
    r = requests.get(url, auth=auth)
    if r.status_code == 200:
        data = r.json()
        content = base64.b64decode(data['content']).decode('utf-8')
        return content
    else:
        r.raise_for_status()
@st.cache(allow_output_mutation=True, max_entries=100)
def fetch_from_df(sub_paths: list):
    # meta.csv stores each path component in columns col_0, col_1, ...;
    # filter the rows matching the current pathway and return the unique
    # entries at the next level (sub-categories or .txt files).
    sub_df = file_df.copy()
    for idx, step in enumerate(sub_paths):
        sub_df.query(f"col_{idx} == '{step}'", inplace=True)
    if len(sub_df) == 0:
        return None
    return list(sub_df[f"col_{len(sub_paths)}"].unique())
def show_filepath(filepath: str):
    text = fetch_file(filepath)
    c.markdown(
        f"""<pre style='white-space:pre-wrap;max-height:300px;overflow-y:auto'>{text}</pre>""",
        unsafe_allow_html=True)
if st.sidebar.selectbox(
        label="何以尋跡 How to search",
        options=["以類尋書 category", "書名求書 search"]) == "以類尋書 category":
    # browse by category: keep the current path (a list of folder names) in session_state
    if 'pathway' not in st.session_state:
        st.session_state.pathway = []

    path_text = st.sidebar.text("/".join(st.session_state.pathway))

    def reset_path():
        st.session_state.pathway = []
        path_text.text(st.session_state.pathway)

    if st.sidebar.button("還至初錄(back to root)"):
        reset_path()
    def display_tree():
        # show a selectbox for the current level; recurse into folders,
        # and load and display the text once a .txt file is chosen
        sublist = fetch_from_df(st.session_state.pathway)
        dropdown = st.sidebar.selectbox("【擇書 choose】", options=sublist)
        # spinner text: 書非借不能讀也 ("a book one does not borrow will never be read")
        with st.spinner("書非借不能讀也..."):
            st.session_state.pathway.append(dropdown)
            if dropdown.endswith('.txt'):
                filepath = "/".join(st.session_state.pathway)
                file_size = file_size_map[filepath]
                with st.spinner(f"Load 載文:{filepath},({show_file_size(file_size)})"):
                    # skip files larger than 3 MB and point the reader to daizhige.org instead
                    if file_size > 3*1024*1024:
                        print(f"skip {filepath}")
                        urlpath = filepath.replace(".txt", ".html")
                        dzg = f"http://www.daizhige.org/{urlpath}"
                        st.markdown(f"File too big 其文碩而難載,不能為之,[往 殆知閣]({dzg}), 或擇他書")
                        reset_path()
                        return None
                    path_text.text(filepath)
                    print(f"read {filepath}")
                    text = fetch_file(filepath)
                    # render the text in a scrollable <pre> block with a capped height
                    c.markdown(
                        f"""<pre style='white-space:pre-wrap;max-height:300px;overflow-y:auto'>{text}</pre>""",
                        unsafe_allow_html=True
                    )
                reset_path()
            else:
                # a folder was chosen: update the breadcrumb and descend one level
                path_text.text("/".join(st.session_state.pathway))
                display_tree()

    display_tree()
else:
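    # Search mode (書名求書): substring-match the keyword against the file paths in
    # meta.csv and let the reader pick one of the first 15 hits to display.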
    def search_kw():
        result = file_df[file_df.filepath.str.contains(st.session_state.kw)].reset_index(drop=True)
        if len(result) == 0:
            st.sidebar.write(f"尋之不得:{st.session_state.kw}")
        else:
            filepath = st.sidebar.selectbox(
                "選一書名", options=list(result.head(15).filepath))
            show_filepath(filepath)

    def loading_with_search():
        kw = st.sidebar.text_input("書名求書 Search", value="楞伽经")
        st.session_state.kw = kw
        search_kw()

    loading_with_search()
def translate_text():
    # `cc` is the module-level text area created right after this function definition
    if c2.button("【曉文達義 Translate】"):
        if cc:
            if len(cc) > 168:
                c2.write("句甚長 不得過百又六十八字 Sentence too long, should be less than 168 characters")
            else:
                c2.markdown(f"""```{inference(cc)}```""")
        else:
            c2.write("【入難曉之文字 Please input sentence for translating】")
cc = c2.text_area("【入難曉之文字 Input sentence】", height=150)
translate_text()
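
# To run locally (assuming this script is saved as app.py):
#   streamlit run app.py
# It expects meta.csv next to the script, an optional data/ folder with local copies
# of the texts, and USER_ID / SECRET defined in the Streamlit secrets for the GitHub API.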