language-identification

Sleeping

App Files Files Community

language-identification / app.py

kargaranamir

Update GlotLID

715fd06 12 months ago

raw

history blame

7.67 kB

	# coding=utf-8
	# Copyright 2023 The GlotLID Authors.
	# Lint as: python3


	# This space is built based on AMR-KELEG/ALDi space.
	# GlotLID Space


	import constants
	import pandas as pd
	import streamlit as st
	from huggingface_hub import hf_hub_download
	from GlotScript import get_script_predictor
	import matplotlib.pyplot as plt
	import fasttext
	import altair as alt
	from altair import X, Y, Scale
	import base64
	import json

	@st.cache_resource
	def load_sp():
	    """Create the GlotScript script predictor (cached by Streamlit as a resource)."""
	    return get_script_predictor()


	# Module-level predictor instance used by get_script().
	sp = load_sp()

	def get_script(text):
	    """Get the writing systems of the given text.

	    Args:
	        text: The text whose scripts should be detected.

	    Returns:
	        A ``(main_script, all_scripts)`` tuple. ``main_script`` is the first
	        element of the predictor result, or ``'Zyyy'`` when that element is
	        empty. ``all_scripts`` is always a list: the keys of the predictor's
	        ``details`` mapping, or ``['Zyyy']`` when that mapping is empty.
	    """
	    res = sp(text)
	    main_script = res[0] or 'Zyyy'
	    # NOTE(review): 'details' is presumably keyed by script code - confirm
	    # against the GlotScript documentation.
	    all_scripts_dict = res[2]['details']
	    if all_scripts_dict:
	        all_scripts = list(all_scripts_dict.keys())
	    else:
	        # Return a one-element list rather than the bare string so callers
	        # always receive a list and `script in all_scripts` is an exact
	        # match instead of a substring test.
	        all_scripts = ['Zyyy']

	    return main_script, all_scripts


	@st.cache_data
	def language_names(json_path):
	    """Load a JSON file and return its decoded content.

	    Args:
	        json_path: Path of the JSON file to read.

	    Returns:
	        The object produced by ``json.load`` for that file.
	    """
	    # Decode explicitly as UTF-8 instead of relying on the platform default
	    # encoding, which can fail on non-ASCII language names.
	    with open(json_path, 'r', encoding='utf-8') as json_file:
	        return json.load(json_file)


	# Mapping used by get_name() to turn a label prefix into a language name.
	label2name = language_names("assets/language_names.json")

	def get_name(label):
	    """Look up the language name for a label via the part before its first underscore."""
	    iso_code, _, _ = label.partition('_')
	    return label2name[iso_code]


	@st.cache_data
	def render_svg(svg):
	    """Show the given SVG string as a centered, base64-embedded image."""
	    svg_bytes = svg.encode("utf-8")
	    encoded = base64.b64encode(svg_bytes).decode("utf-8")
	    markup = rf'<p align="center"> <img src="data:image/svg+xml;base64,{encoded}"/> </p>'
	    # Raw HTML is required for the inline <img> tag.
	    st.container().write(markup, unsafe_allow_html=True)


	@st.cache_data
	def convert_df(df):
	    """Convert a DataFrame to CSV (called with index=None) and encode it as UTF-8 bytes."""
	    # Cached so the conversion is not recomputed on every rerun.
	    csv_text = df.to_csv(index=None)
	    return csv_text.encode("utf-8")


	@st.cache_resource
	def load_GlotLID_v1(model_name, file_name):
	    """Download a model file from the Hugging Face Hub and load it with fastText.

	    Args:
	        model_name: Repository id passed to ``hf_hub_download``.
	        file_name: Name of the file to fetch from that repository.

	    Returns:
	        The model object returned by ``fasttext.load_model``.
	    """
	    return fasttext.load_model(
	        hf_hub_download(repo_id=model_name, filename=file_name)
	    )

	@st.cache_resource
	def load_GlotLID_v2(model_name, file_name):
	    """Fetch ``file_name`` from the Hub repository ``model_name`` and load it as a fastText model.

	    Args:
	        model_name: Repository id passed to ``hf_hub_download``.
	        file_name: Name of the file to fetch from that repository.

	    Returns:
	        The model object returned by ``fasttext.load_model``.
	    """
	    local_path = hf_hub_download(repo_id=model_name, filename=file_name)
	    return fasttext.load_model(local_path)


	# Load both model versions once at import time; compute() picks one per request.
	model_1 = load_GlotLID_v1(model_name=constants.MODEL_NAME, file_name="model_v1.bin")
	model_2 = load_GlotLID_v2(model_name=constants.MODEL_NAME, file_name="model_v2.bin")

	@st.cache_resource
	def plot(label, prob):
	    """Draw a single horizontal confidence bar for a predicted label.

	    Args:
	        label: Predicted label; shown in the title together with
	            ``get_name(label)``.
	        prob: Confidence value drawn as the bar width on a fixed 0-1 x-axis.
	    """
	    ORANGE_COLOR = "#FF8000"
	    fig, ax = plt.subplots(figsize=(8, 1))
	    try:
	        # No background fill on the figure or the axes.
	        fig.patch.set_facecolor("none")
	        ax.set_facecolor("none")

	        ax.spines["left"].set_color(ORANGE_COLOR)
	        ax.spines["bottom"].set_color(ORANGE_COLOR)
	        ax.tick_params(axis="x", colors=ORANGE_COLOR)

	        ax.spines[["right", "top"]].set_visible(False)

	        ax.barh(y=[0], width=[prob], color=ORANGE_COLOR)
	        ax.set_xlim(0, 1)
	        ax.set_ylim(-1, 1)
	        ax.set_title(f"Label: {label}, Language: {get_name(label)}", color=ORANGE_COLOR)
	        ax.get_yaxis().set_visible(False)
	        ax.set_xlabel("Confidence", color=ORANGE_COLOR)
	        st.pyplot(fig)
	    finally:
	        # pyplot keeps every figure it creates registered until it is
	        # closed; release it here so figures do not accumulate in memory
	        # across predictions (also when rendering fails).
	        plt.close(fig)

	def compute(sentences, version='v2'):
	    """Compute the language probabilities and labels for the given sentences.

	    Args:
	        sentences: A list of sentences.
	        version: ``'v2'`` selects ``model_2`` and enables the script check;
	            any other value selects ``model_1``.

	    Returns:
	        A ``(probs, labels)`` tuple of two lists, each with one entry per
	        sentence.
	    """
	    progress_text = "Computing Language..."
	    model_choice = model_2 if version == 'v2' else model_1
	    my_bar = st.progress(0, text=progress_text)
	    total = len(sentences)

	    probs = []
	    labels = []

	    for index, sent in enumerate(sentences):
	        output = model_choice.predict(sent)

	        if len(output[0]) == 0:
	            # Guard against an empty prediction (presumably what the model
	            # returns for blank text - TODO confirm) instead of failing on
	            # output[0][0]; reuse the "undetermined" label convention below.
	            output_label = "und_Zyyy"
	            output_prob = 0
	        else:
	            # Keep only the part after the last '__' of the raw model label.
	            output_label = output[0][0].split('__')[-1]
	            # Clamp the score into [0, 1].
	            output_prob = max(min(output[1][0], 1), 0)
	            output_label_language = output_label.split('_')[0]

	            # Script control (v2 only, skipped for the 'zxx' label): when the
	            # predicted script does not occur in the text, replace the
	            # prediction with an undetermined label carrying the text's
	            # main script and zero confidence.
	            if version == 'v2' and output_label_language != 'zxx':
	                main_script, all_scripts = get_script(sent)
	                output_label_script = output_label.split('_')[1]

	                if output_label_script not in all_scripts:
	                    output_label = f"und_{main_script}"
	                    output_prob = 0

	        labels.append(output_label)
	        probs.append(output_prob)

	        # index is zero-based, so use index + 1 to reach 100% on the last item.
	        my_bar.progress((index + 1) / total, text=progress_text)

	    my_bar.empty()
	    return probs, labels

	# "Duplicate Space" badge (shields.io image with an inline base64 logo) linking to the Space's duplicate URL.
	st.markdown("[![Duplicate Space](https://img.shields.io/badge/-Duplicate%20Space-blue?labelColor=white&style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAAAXNSR0IArs4c6QAAAP5JREFUOE+lk7FqAkEURY+ltunEgFXS2sZGIbXfEPdLlnxJyDdYB62sbbUKpLbVNhyYFzbrrA74YJlh9r079973psed0cvUD4A+4HoCjsA85X0Dfn/RBLBgBDxnQPfAEJgBY+A9gALA4tcbamSzS4xq4FOQAJgCDwV2CPKV8tZAJcAjMMkUe1vX+U+SMhfAJEHasQIWmXNN3abzDwHUrgcRGmYcgKe0bxrblHEB4E/pndMazNpSZGcsZdBlYJcEL9Afo75molJyM2FxmPgmgPqlWNLGfwZGG6UiyEvLzHYDmoPkDDiNm9JR9uboiONcBXrpY1qmgs21x1QwyZcpvxt9NS09PlsPAAAAAElFTkSuQmCC&logoWidth=14)](https://huggingface.co/spaces/cis-lmu/glotlid-space?duplicate=true)")

	# Read the logo inside a context manager so the file handle is closed
	# right away. Assumes the SVG file is UTF-8 encoded (render_svg re-encodes
	# the text as UTF-8).
	with open("assets/GlotLID_logo.svg", encoding="utf-8") as logo_file:
	    render_svg(logo_file.read())

	tab1, tab2 = st.tabs(["Input a Sentence", "Upload a File"])

	# Tab 1: classify a single sentence typed by the user.
	with tab1:

	    # choice = st.radio(
	    # "Set granularity level",
	    # ["default", "merge", "individual"],
	    # captions=["enable both macrolanguage and its varieties (default)", "merge macrolanguage and its varieties into one label", "remove macrolanguages - only shows individual langauges"],
	    # )

	    version = st.radio(
	        "Choose model",
	        ["v1", "v2"],
	        captions=["GlotLID version 1", "GlotLID version 2 (more data and languages)"],
	        index=1,
	        key='version_tab1',
	        horizontal=True,
	    )

	    sent = st.text_input(
	        "Sentence:", placeholder="Enter a sentence.", on_change=None
	    )

	    # TODO: Check if this is needed!
	    # The button's return value is not used below; prediction runs whenever
	    # the text input is non-empty.
	    clicked = st.button("Submit")

	    if sent:
	        sent = sent.replace('\n', '')

	        probs, labels = compute([sent], version=version)
	        prob = probs[0]
	        label = labels[0]

	        # plot
	        plot(label, prob)

	        print(sent)
	        # Input may be in any script, so write the log as UTF-8 explicitly
	        # rather than with the platform default encoding, which can raise
	        # UnicodeEncodeError.
	        with open("logs.txt", "a", encoding="utf-8") as f:
	            f.write(sent + "\n")
	# Tab 2: classify every line of an uploaded text file.
	with tab2:

	    version = st.radio(
	        "Choose model",
	        ["v1", "v2"],
	        captions=["GlotLID version 1", "GlotLID version 2 (more data and languages)"],
	        index=1,
	        key='version_tab2',
	        horizontal=True,
	    )

	    file = st.file_uploader("Upload a file", type=["txt"])
	    if file is not None:
	        try:
	            # The unlikely separator makes each line land in a single column.
	            # - engine="python": a multi-character separator needs the
	            #   python engine; naming it avoids the fallback warning.
	            # - dtype=str / keep_default_na=False: keep lines such as "NA",
	            #   "null" or "123" as text instead of NaN or numbers, which
	            #   the model cannot take as input.
	            df = pd.read_csv(
	                file,
	                sep="¦\t¦",
	                header=None,
	                engine="python",
	                dtype=str,
	                keep_default_na=False,
	            )
	        except pd.errors.EmptyDataError:
	            # An empty upload has nothing to parse; tell the user instead of
	            # crashing the app.
	            df = None
	            st.warning("The uploaded file is empty.")

	        if df is not None:
	            df.columns = ["Sentence"]
	            df.reset_index(drop=True, inplace=True)

	            # Run the selected model on every sentence.
	            df['Prob'], df["Label"] = compute(df["Sentence"].tolist(), version=version)
	            df['Language'] = df["Label"].apply(get_name)

	            # A horizontal rule
	            st.markdown("""---""")

	            # Area chart of the confidence per sentence index.
	            chart = (
	                alt.Chart(df.reset_index())
	                .mark_area(color="darkorange", opacity=0.5)
	                .encode(
	                    x=X(field="index", title="Sentence Index"),
	                    y=Y("Prob", scale=Scale(domain=[0, 1])),
	                )
	            )
	            st.altair_chart(chart.interactive(), use_container_width=True)

	            col1, col2 = st.columns([4, 1])

	            with col1:
	                # Display the output
	                st.table(
	                    df,
	                )

	            with col2:
	                # Add a download button
	                csv = convert_df(df)
	                st.download_button(
	                    label=":file_folder: Download predictions as CSV",
	                    data=csv,
	                    file_name="GlotLID.csv",
	                    mime="text/csv",
	                )