import streamlit as st
import spacy
from spacy import displacy
import pandas as pd
from io import StringIO, BytesIO
from lxml import etree
from bs4 import BeautifulSoup
import html
from streamlit_extras.stylable_container import stylable_container
import subprocess
import importlib.util
# This app was inspired by Lucas Terriel's NER4Archives Visualizer App (2022-2023), https://huggingface.co/spaces/ner4archives/ner4archives-NEL-vizualizer-app/tree/main
# Check out the NER4Archives project (INRIA-ALMAnaCH/Archives nationales) : https://github.com/NER4Archives-project
# ===== SOME SETTING UP =====
# Setting up the app's page
st.set_page_config(page_title="Arches Demo", page_icon="🏺")
# Path to the statics directory
statics = "./static"
# Making the radio widgets' titles bigger
# Source : arnaud, https://discuss.streamlit.io/t/how-to-change-font-size-of-streamlit-radio-widget-title/35945/2
st.markdown(
"""<style>
div[class*="stRadio"] > label > div[data-testid="stMarkdownContainer"] > p {
font-size: 17px;
}
</style>
""", unsafe_allow_html=True)
# Hiding the possibility to display pictures fullscreen
# Source : AvratanuBiswas, https://discuss.streamlit.io/t/hide-fullscreen-option-when-displaying-images-using-st-image/19792/2
st.markdown(
"""<style>
button[title="View fullscreen"]{
visibility: hidden;
}
</style>
""", unsafe_allow_html=True)
# Setting up the colors of the entity tags for displacy
ENTITIES_COLORS = {
"CHRONOLOGIE": "#ffb627",
"MOBILIER": "#6b7fd7",
"MATERIAU": "#d36582",
"STRUCTURE": "#00b2ca",
"TECHNIQUE_STYLE": "#ED6A5A",
"ESPECE": "#96C7FF",
"EDIFICE": "#9F86C0",
"ID": "#f65bff",
"LIEUDIT_SITE": "#d8e446",
"PERSONNE": "#D3B4B4",
"PEUPLE_CULTURE": "#d20000",
"LOC": "#81db72",
"DECOR": "#fff46a",
"ORG": "#887575",
"GPE": "#00a878"
}
OPTIONS = {
"ents":
[
"CHRONOLOGIE",
"MOBILIER",
"MATERIAU",
"STRUCTURE",
"TECHNIQUE_STYLE",
"ESPECE",
"EDIFICE",
"ID",
"LIEUDIT_SITE",
"PERSONNE",
"PEUPLE_CULTURE",
"LOC",
"DECOR",
"ORG",
"GPE"
],
"colors": ENTITIES_COLORS}
# ===== SIDEBAR =====
st.sidebar.title("ARCHES - Étude, composition et processus pour une édition structurée des rapports d’opérations archéologiques préventives")
st.sidebar.markdown("Avec ses 2200 collaborateurs, l’[Inrap](https://www.inrap.fr/) représente la plus importante structure publique de recherche archéologique française. De fait, chaque année, près de 2000 chantiers (diagnostics archéologiques et fouilles) sont réalisés en partenariat avec les aménageurs publics et privés, en France métropolitaine et dans les départements d’outre-mer. Les missions de l’Institut intégrant l’exploitation scientifique des résultats et la diffusion de la connaissance archéologique auprès du public, plus de 2000 rapports d’opération archéologique sont ainsi rédigés annuellement.")
st.sidebar.markdown("Financé avec le soutien du [Fonds National pour la Science Ouverte](https://www.ouvrirlascience.fr/accueil/) et réalisé en collaboration avec l’infrastructure de recherche [Métopes](http://www.metopes.fr/) ([Université de Caen Normandie](https://www.unicaen.fr/) - [CNRS](https://www.cnrs.fr/fr)), [ARCHES](https://www.inrap.fr/arches-etude-composition-et-processus-pour-une-edition-structuree-des-rapports-d-17145) vise à explorer l’amélioration de la diffusion et de l’exploitation des rapports d’opération à l’aide du format de balisage XML-TEI, permettant d’encoder tant la structuration formelle que le contenu sémantique d’un document. Dans cette optique, vingt-et-un rapports de fouilles de l’Inrap ont été annotés pour entraîner un modèle de reconnaissance des entités nommées (représentant plus de 80 000 entités annotées). Cette application vise à tester la manipulation du modèle, tant avec des fichiers XML que texte brut.")
st.sidebar.markdown("Le corpus a été annoté à l'aide d'[INCEpTION](https://inception-project.github.io/), tandis que les modèles de [segmentation](https://huggingface.co/a-menu/fr_arches_sentencizer) et de reconnaissance des entités nommées ([avec](https://huggingface.co/a-menu/fr_arches_ner_trf) et [sans](https://huggingface.co/a-menu/fr_arches_ner) architecture transformer) ont été entraînés et évalués avec [spaCy](https://spacy.io/). Les modalités de [citation](https://huggingface.co/spaces/a-menu/arches_demo/blob/main/CITATION.cff) de l'application peuvent être retrouvées dans le [dépôt](https://huggingface.co/spaces/a-menu/arches_demo/tree/main) de celle-ci.")
st.sidebar.write("")
st.sidebar.markdown("*ARCHES (Inrap), janvier 2024*")
st.sidebar.write("")
st.sidebar.write("")
st.sidebar.header("Partenaires")
st.sidebar.write("")
# Display logos
col1, col2, col3 = st.sidebar.columns(3)
col1.image(f"{statics}/logo_inrap.png", width=100)
col2.write("")
col2.image(f"{statics}/logo_ouvrir_la_science.png", width=100)
col3.image(f"{statics}/logo_mesr.png", width=100)
col1.image(f"{statics}/logo_ir_metopes.png", width=100)
col2.write("")
col2.write("")
col2.image(f"{statics}/logo_mrsh.jpg", width=100)
col3.image(f"{statics}/logo_unicaen.png", width=100)
col1.image(f"{statics}/logo_cnrs.png", width=80)
# ===== SOME FUNCTIONS =====
# Cached to prevent computation on every rerun
@st.cache_resource
def download_sentencizer():
"""
Downloads the fr_arches_sentencizer model.
:returns: None
"""
# Check if the model is already installed
# If not, install it
# Source : ice.nicer & Arthur, https://stackoverflow.com/a/41815890
check_senter = importlib.util.find_spec("fr_arches_sentencizer")
if check_senter is None:
subprocess.run(["pip", "install", "https://huggingface.co/a-menu/fr_arches_sentencizer/resolve/main/fr_arches_sentencizer-any-py3-none-any.whl"])
# Cached to prevent computation on every rerun
@st.cache_resource
def download_ner_trf():
"""
Downloads the fr_arches_ner_trf TRF NER model.
:returns: None
"""
# Check if the model is already installed
# If not, install it
# Source : ice.nicer & Arthur, https://stackoverflow.com/a/41815890
check_ner_trf = importlib.util.find_spec("fr_arches_ner_trf")
if check_ner_trf is None:
subprocess.run(["pip", "install", "https://huggingface.co/a-menu/fr_arches_ner_trf/resolve/main/fr_arches_ner_trf-any-py3-none-any.whl"])
# Cached to prevent computation on every rerun
@st.cache_resource
def download_ner():
"""
Downloads the fr_arches_ner NER model.
:returns: None
"""
# Check if the model is already installed
# If not, install it
# Source : ice.nicer & Arthur, https://stackoverflow.com/a/41815890
check_ner = importlib.util.find_spec("fr_arches_ner")
if check_ner is None:
subprocess.run(["pip", "install", "https://huggingface.co/a-menu/fr_arches_ner/resolve/main/fr_arches_ner-any-py3-none-any.whl"])
# Cached to prevent computation on every rerun
@st.cache_resource
def load_sentencizer():
"""
Loads our custom sentence segmentation model.
:returns: loaded fr_arches_sentencizer model
:rtype: spacy.lang.fr.French
"""
senter = spacy.load("fr_arches_sentencizer")
return senter
# Cached to prevent computation on every rerun
@st.cache_resource
def load_ner_trf():
"""
    Loads our custom fr_arches_ner_trf transformer-based NER model.
    :returns: loaded fr_arches_ner_trf model
:rtype: spacy.lang.fr.French
"""
ner = spacy.load("fr_arches_ner_trf")
# To try to reduce memory usage
config = {"attrs": {"tensor": None}}
ner.add_pipe("doc_cleaner", config=config)
return ner
# Cached to prevent computation on every rerun
@st.cache_resource
def load_ner():
"""
    Loads our custom fr_arches_ner NER model.
    :returns: loaded fr_arches_ner model
:rtype: spacy.lang.fr.French
"""
ner = spacy.load("fr_arches_ner")
# To try to reduce memory usage
config = {"attrs": {"tensor": None}}
ner.add_pipe("doc_cleaner", config=config)
return ner
def apply_senter(senter, data):
"""
Applies our custom sentence segmentation model on data.
:param senter: sentence segmentation model
:type senter: spacy.lang.fr.French
:param data: text to be segmented
:type data: str
:returns: sentencized text
:rtype: str
"""
mes_phrases = senter(data)
sentencized_text = ""
for sent in mes_phrases.sents:
sentencized_text += str(sent) + "\n"
return sentencized_text
def get_doc(ner, data):
"""
Applies our custom ner model on data.
:param ner: ner model
:type ner: spacy.lang.fr.French
:param data: text to be analyzed
:type data: str
    :returns: spacy doc and the positions of the original non-breaking spaces
    :rtype: tuple(spacy.tokens.doc.Doc, list)
"""
# Replace the non-breaking spaces (NBSP) with regular spaces before applying our model on the text. To do so:
# Create a list to store their position
list_nbsp = []
# Iterate over each character and save the position of the non-breaking spaces
for i, char in enumerate(data):
if char == "\u00A0":
list_nbsp.append(i)
# Once we have memorized the NBSP's positions, we replace them with regular spaces
data = data.replace("\u00A0", " ")
# Apply the NER model on our data
doc = ner(data)
return doc, list_nbsp
def get_entities(doc, list_nbsp):
"""
Extracts the named entities from the doc.
:param doc: spacy doc
:type doc: spacy.tokens.doc.Doc
    :param list_nbsp: positions of the original non-breaking spaces
    :type list_nbsp: list
    :returns: list of named entities and the text with its non-breaking spaces restored
    :rtype: tuple(list, str)
    """
# Put back the NBSP
characters_with_nbsp = [char if i not in list_nbsp else "\u00A0" for i, char in enumerate(doc.text)]
# Convert the list back to a string
nbsp_text = "".join(characters_with_nbsp)
entities = []
for ent in doc.ents:
# We collect :
# The named entity (using its position since the tokenizer would sometimes add unwanted spaces, for instance before a comma)
# Its label
# Its position
entities.append((nbsp_text[ent.start_char:ent.end_char].strip(), ent.label_, ent.start_char, ent.end_char))
return entities, nbsp_text
def create_displacy(text, entities):
"""
Render named entities using displacy.
:param text: input text
:type text: str
:param entities: list of named entities with start and end character positions
:type entities: list
:returns: showcase of entities with displacy
:rtype: str
"""
# Prepare data for displacy
entity_data = [{"start": ent[2], "end": ent[3], "label": ent[1]} for ent in entities]
# Render using displacy
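    # manual=True makes displacy render the pre-computed dicts ({"text", "ents"}) instead of analysing a spaCy Doc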
my_displacy = displacy.render([{"text": text, "ents": entity_data}], style="ent", options=OPTIONS, manual=True)
return my_displacy
def create_df(entities):
"""
Creates a dataframe to display the named entities found in text.
:param entities: named entities
:type entities: list
:returns: dataframe
:rtype: pd.DataFrame
"""
df = pd.DataFrame(entities, columns=["ENTITE",
"LABEL",
"DEBUT",
"FIN"
])
return df
def df_to_csv(df_to_convert):
"""
Converts df to csv.
:param df_to_convert: dataframe to be converted to csv
:type df_to_convert: pd.DataFrame
    :returns: csv content
    :rtype: str
"""
return df_to_convert.to_csv(encoding="utf-8")
def doc_to_conll(doc, updated_name=False):
"""
Converts a doc and its entities to a conll2002 file.
:param doc: spacy doc
:type doc: spacy.tokens.doc.Doc
    :param updated_name: base name to give the downloaded file, or False to fall back to a default name
    :type updated_name: str or bool
    :returns: button to download the conll2002 file
    :rtype: bool
"""
# Writing to a BytesIO object to get the byte content
with BytesIO() as sortie_buffer:
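        # conll2002 layout: one token per line followed by its (IOB-prefixed) tag, with a blank line for each line break in the source text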
for tok in doc:
# Convert a named entity to conll2002
if tok.ent_type and tok.text != "\n":
sortie_buffer.write(f"{tok.text} {tok.ent_iob_}-{tok.ent_type_}\n".encode("utf-8"))
# Convert a token without a named entity to conll2002
else:
if tok.text != "\n" and tok.ent_iob_:
sortie_buffer.write(f"{tok.text} {tok.ent_iob_}\n".encode("utf-8"))
# Write a single empty line for each new line in the original text
else:
sortie_buffer.write(b"\n")
# Move the buffer position to the beginning for reading
sortie_buffer.seek(0)
# Check if the buffer has a line only consisting of "O\n" and delete it
buffer_content = sortie_buffer.getvalue().decode("utf-8")
lines = buffer_content.split("\n")
modified_lines = [line for line in lines if line.strip() != "O"]
modified_buffer_content = "\n".join(modified_lines)
        # Write the modified content back to the buffer and drop any leftover bytes from the original, longer content
        sortie_buffer.seek(0)
        sortie_buffer.write(modified_buffer_content.encode("utf-8"))
        sortie_buffer.truncate()
# Move the buffer position to the beginning for reading
sortie_buffer.seek(0)
# If we have an uploaded file: update the name of the exported file.
if updated_name:
my_button = st.download_button(
label="Télécharger le fichier CoNLL2002",
data=sortie_buffer,
file_name=updated_name + ".conll"
)
# If we have no uploaded file ('example on the go' mode): use a default name for the exported file.
else:
my_button = st.download_button(
label="Télécharger le fichier CoNLL2002",
data=sortie_buffer,
file_name="prediction_arches.conll"
)
return my_button
def get_body_text(xml_input):
"""
Parses an xml file and returns its <body>.
    :param xml_input: xml content to be parsed
    :type xml_input: bytes or str
:returns: the <body> if successful, None otherwise
:rtype: str or None
"""
try:
# Parse XML content
parser = etree.XMLParser(recover=True)
root = etree.fromstring(xml_input, parser=parser)
# Find <body> element in the XML namespace
body = root.xpath("//tei:body", namespaces={"tei": "http://www.tei-c.org/ns/1.0"})
if body:
body_element = body[0]
if len(body_element) > 0:
# Extract the text content
body_soup = BeautifulSoup(etree.tostring(body_element), "html.parser")
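                # get_text(separator=" ", strip=True) flattens every nested tag into plain text, keeping a space between tag contents so words don't run together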
body_text = body_soup.get_text(separator=" ", strip=True)
return body_text
else:
st.warning("L'élément <body> est vide.")
return None
else:
st.warning("Aucun élément <body> n'a été détecté dans le fichier XML.")
return None
except etree.XMLSyntaxError:
st.warning("Format XML incorrect. Veuillez importer un fichier XML valide.")
return None
def xml_mapping(entity, label):
"""
Create an XML element based on an entity's given label.
:param entity: entity text
:type entity: str
:param label: entity label
:type label: str
:returns: custom XML element if successful, default <name> element if not
:rtype: etree.Element
"""
element_mapping = {
"CHRONOLOGIE": {"tag": "date"},
"DECOR": {"tag": "name", "attrib": {"type": "decor"}},
"EDIFICE": {"tag": "placeName", "attrib": {"type": "edifice"}},
"ESPECE": {"tag": "name", "attrib": {"type": "espece"}},
"GPE": {"tag": "placeName"},
"ID": {"tag": "idno", "attrib": {"type": "entite"}},
"LIEUDIT_SITE": {"tag": "placeName", "attrib": {"type": "lieudit_site"}},
"LOC": {"tag": "geogName"},
"MATERIAU": {"tag": "material"},
"MOBILIER": {"tag": "objectType"},
"ORG": {"tag": "orgName"},
"PERSONNE": {"tag": "persName"},
"PEUPLE_CULTURE": {"tag": "orgName", "attrib": {"type": "peuple_culture"}},
"STRUCTURE": {"tag": "name", "attrib": {"type": "structure"}},
"TECHNIQUE_STYLE": {"tag": "name", "attrib": {"type": "technique_style"}},
}
mapping = element_mapping.get(label)
if mapping:
xml_tag = etree.Element(mapping["tag"], attrib=mapping.get("attrib", {}))
xml_tag.text = entity
return xml_tag
# If the mapping is impossible, encode the entity with a default <name type="generique">
else:
st.warning(f"Mapping introuvable pour le label : {label}. Entité encodée par conséquence comme : <name type=\"generique\">.")
return etree.Element("name", attrib={"type": "generique"})
def entities_to_xml(xml_content, ner):
"""
Process XML content by replacing identified entities with XML elements.
    :param xml_content: original xml content
    :type xml_content: bytes or str
:param ner: ner model
:type ner: spacy.lang.fr.French
:returns: modified XML content if successful, None otherwise
:rtype: str or None
"""
try:
# Parse XML content
parser = etree.XMLParser(recover=True)
root = etree.fromstring(xml_content, parser=parser)
# Find <body> element in the XML namespace
body = root.xpath("//tei:body", namespaces={"tei": "http://www.tei-c.org/ns/1.0"})
if body:
body_element = body[0]
if len(body_element) > 0:
# Strip the <body> of the <hi> tags
etree.strip_tags(body_element, "{http://www.tei-c.org/ns/1.0}hi")
# Get the <body>'s descendants
descendants = body_element.xpath("descendant::*")
# Iterate through all descendants in the <body>
for descendant in descendants:
# Apply the ner model on the text of the descendant
if descendant.text:
                        doc, list_nbsp = get_doc(ner, descendant.text)
                        entities = get_entities(doc, list_nbsp)[0]
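                        # Process entities from the end of the text towards the beginning so inserting tags does not shift the offsets still to be used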
entities.sort(key=lambda ent: ent[2], reverse=True)
for ent in entities:
xml_tag = xml_mapping(ent[0], ent[1])
start_index = ent[2]
end_index = ent[3]
descendant.text = (
descendant.text[:start_index]
+ etree.tostring(xml_tag, encoding="unicode")
+ descendant.text[end_index:]
)
# Apply the ner model on the tail of the descendant
if descendant.tail:
                        doc_tail, list_nbsp_tail = get_doc(ner, descendant.tail)
                        entities_tail = get_entities(doc_tail, list_nbsp_tail)[0]
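                        # Same as above: work backwards so earlier offsets stay valid while tags are inserted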
entities_tail.sort(key=lambda ent: ent[2], reverse=True)
for ent_tail in entities_tail:
xml_tag_tail = xml_mapping(ent_tail[0], ent_tail[1])
start_index_tail = ent_tail[2]
end_index_tail = ent_tail[3]
descendant.tail = (
descendant.tail[:start_index_tail]
+ etree.tostring(xml_tag_tail, encoding="unicode")
+ descendant.tail[end_index_tail:]
)
# Export modified XML content
modified_xml = etree.tostring(root, xml_declaration=True, pretty_print=True, encoding="utf-8").decode("utf-8")
return modified_xml
else:
st.warning("L'élément <body> est vide.")
return None
else:
st.warning("Aucun élément <body> n'a été détecté dans le fichier XML.")
return None
except etree.XMLSyntaxError:
st.error("Format XML incorrect. Veuillez importer un fichier XML valide.")
return None
# ===== BODY OF THE PAGE =====
st.title("La reconnaissance des entités nommées dans le projet ARCHES")
st.header("Visualisation & extraction")
st.write("")
st.write("")
# Check GPU presence
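# spacy.prefer_gpu() switches spaCy to the GPU (via cupy) when one is available and returns True, otherwise it stays on CPU and returns False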
gpu = spacy.prefer_gpu()
if gpu:
st.success("GPU détecté avec succès")
else:
st.warning("Aucun GPU détecté, l'application du modèle pourra nécessiter un certain temps. Considérez une [installation locale du dépôt](https://huggingface.co/spaces/a-menu/arches_demo/blob/main/guide_installation_locale_et_gpu.md) si besoin.")
st.write("")
st.write("")
def main():
# Download and load our models
# Sentencizer
download_sentencizer()
senter = load_sentencizer()
# NER
# Choose which NER model you want
pick_model = st.radio("Quel modèle de reconnaissance d'entités nommées souhaitez-vous utiliser ?", ("fr_arches_ner (plus léger en ressources mais moins efficace)", "fr_arches_ner_trf (plus lourd en ressources mais plus efficace, GPU conseillé)"))
st.write("")
st.write("")
if pick_model == "fr_arches_ner (plus léger en ressources mais moins efficace)":
download_ner()
ner = load_ner()
if pick_model == "fr_arches_ner_trf (plus lourd en ressources mais plus efficace, GPU conseillé)":
download_ner_trf()
ner = load_ner_trf()
with st.expander("Au sujet des entités nommées recherchées"):
st.markdown("**Les différents types d'entités sont :** \n\n- **CHRONOLOGIE :** utilisé pour les références chronologiques (\"Antiquité\", \"XIIe siècle\", \"200 av. n. ère\", etc.). \n- **MOBILIER :** utilisé pour le mobilier (\"os\", \"pot\", \"tuile\", etc.). \n- **STRUCTURE :** utilisé pour les structures archéologiques (\"fosse\", \"mur\", \"fossé\", \"foyer\", etc.). \n- **MATERIAU :** utilisé pour les matériaux (\"bronze\", \"dolérite\", \"terre cuite\", etc.). \n- **ID :** utilisé pour les identifiants de vestiges (\"4\" pour \"le fossé 4\" par exemple). \n- **TECHNIQUE_STYLE :** utilisé pour les mentions de techniques et styles de fabrication ou construction (\"taillé\", \"glaçuré\", \"en petit appareil\", etc.). \n- **DECOR :** utilisé pour les éléments de décor. \n- **ESPECE :** utilisé pour signaler les taxons et noms vernaculaires rencontrés dans le texte. \n- **EDIFICE :** utilisé pour les édifices et monuments nommés (\"église Saint-Paul\", \"pont du Gard\", etc.). \n- **PEUPLE_CULTURE :** utilisé pour les cultures et peuples évoqués (tribus gauloises, cultures préhistoriques, etc.). \n- **PERSONNE :** utilisé pour les noms de personnes (historiques, fictives, équipe scientifique, etc.). \n- **ORG :** utilisé pour les institutions, sociétés, laboratoires, universités, musées, archives, etc. \n- **GPE :** utilisé pour les entités géopolitiques (villes, départements, États, etc.). \n- **LOC :** utilisé pour les lieux non-GPE (lieux naturels par exemple). \n- **LIEUDIT_SITE :** utilisé pour les lieux-dits et noms de sites archéologiques.")
st.write("")
# Select input type
use_type = st.radio("Veuillez choisir le type de données à analyser :", ("Taper un exemple", "Importer un fichier texte", "Importer un fichier xml-tei"))
st.write("")
# ===== MODE: EXAMPLE ON THE GO =====
if use_type == "Taper un exemple":
# Checkbox to apply our custom sentence segmentation model
bouton_phraseur = st.checkbox("Cochez cette case pour resegmenter les phrases de votre document selon notre modèle entraîné sur des rapports d'opération")
st.write("")
st.write("")
# Create a text area
raw_text = st.text_area("Veuillez saisir votre exemple dans le bloc ci-dessous (max. 5000 caractères)", "La fosse 34 a livré des restes de pinces en bronze et quelques grains d'orge.", max_chars=5000)
st.write("")
# Launch prediction
if st.button("Lancer la prédiction"):
if len(raw_text) > 0:
with st.spinner("Application du modèle.."):
# If requested, apply the sentence segmentation model
if bouton_phraseur:
raw_text = apply_senter(senter, raw_text)
# Apply the ner model
                    doc, list_nbsp = get_doc(ner, raw_text)
                    entities, nbsp_text = get_entities(doc, list_nbsp)
st.write("")
st.subheader("Résultats :")
st.write("")
st.write("")
# Display the entities with displacy
my_displacy = create_displacy(nbsp_text, entities)
st.markdown(my_displacy, unsafe_allow_html=True)
st.write("")
# Download results as a conll2002 file
doc_to_conll(doc)
st.write("")
df = create_df(entities)
st.write("")
# Display the entities as a table
st.markdown("**Tableau regroupant les entités détectées**")
st.write("")
st.dataframe(df, use_container_width=True)
csv = df_to_csv(df)
st.write("")
# Download results as a csv file
st.download_button(
label="Télécharger le fichier CSV",
data=csv,
file_name="prediction_arches.csv",
mime="text/csv",
)
else:
st.warning("Veuillez saisir un exemple.")
# ===== MODE: LOAD A PLAIN TEXT FILE =====
if use_type == "Importer un fichier texte":
# Checkbox to apply our custom sentence segmentation model
bouton_phraseur = st.checkbox("Cochez cette case pour resegmenter les phrases de votre document selon notre modèle entraîné sur des rapports d'opération")
st.write("")
st.write("")
# Upload a plain text file
uploaded_file = st.file_uploader("Importez un fichier texte (.txt)", type="txt")
if uploaded_file is not None:
# Collect the name of the uploaded file (for the future export)
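            # name[:-4] drops the ".txt" extension, keeping only the base name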
updated_name = uploaded_file.name[:-4]
stringio = StringIO(uploaded_file.getvalue().decode("utf-8"))
file_contents = stringio.read()
# Launch prediction
if st.button("Lancer la prédiction"):
if len(file_contents) > 0:
with st.spinner("Application du modèle.."):
# If requested, apply the sentence segmentation model
if bouton_phraseur:
file_contents = apply_senter(senter, file_contents)
# Apply the ner model
                        doc, list_nbsp = get_doc(ner, file_contents)
                        entities, nbsp_text = get_entities(doc, list_nbsp)
st.write("")
st.subheader("Résultats :")
st.write("")
st.write("")
# Display the entities with displacy
with st.expander("Voir les entités dans le texte"):
my_displacy = create_displacy(nbsp_text, entities)
st.markdown(my_displacy, unsafe_allow_html=True)
st.write("")
# Download the results as a conll2002 file
doc_to_conll(doc, updated_name)
st.write("")
df = create_df(entities)
st.write("")
# Display the entities as a table
with st.expander("Voir les entités sous forme de tableau"):
st.write("")
st.dataframe(df, use_container_width=True)
csv = df_to_csv(df)
st.write("")
# Download the results as a csv file
st.download_button(
label="Télécharger le fichier CSV",
data=csv,
file_name=updated_name + ".csv",
mime="text/csv",
)
else:
st.warning("Le fichier importé est vide.")
# ===== MODE: LOAD AN XML FILE =====
if use_type == "Importer un fichier xml-tei":
# User chooses between xml or conll2002 & csv export
        choix_xml = st.radio("Comment souhaitez-vous appliquer le modèle sur le <body> ?", ("Conserver les balises (export xml de l'intégralité* du fichier importé)", "Ne pas conserver les balises (export conll2002 ou csv du <body> uniquement)"))
# ===== MODE: XML EXPORT =====
if choix_xml == "Conserver les balises (export xml de l'intégralité* du fichier importé)":
st.write("")
st.error("\* À l'exception des balises <hi> du body.")
st.write("")
with st.expander("Au sujet du mapping XML des entités nommées"):
st.markdown(
"**Les entités nommées ont été converties comme suit :** \n\n- **CHRONOLOGIE :** ```<date>``` \n- **MOBILIER :** ```<objectType>``` \n- **STRUCTURE :** ```<name type=\"structure\">``` \n- **MATERIAU :** ```<material>``` \n- **ID :** ```<idno type=\"entite\">``` \n- **TECHNIQUE_STYLE :** ```<name type=\"technique_style\">``` \n- **DECOR :** ```<name type=\"decor\">``` \n- **ESPECE :** ```<name type=\"espece\">``` \n- **EDIFICE :** ```<placeName type=\"edifice\">``` \n- **PEUPLE_CULTURE :** ```<orgName type=\"peuple_culture\">``` \n- **PERSONNE :** ```<persName>``` \n- **ORG :** ```<orgName>``` \n- **GPE :** ```<placeName>``` \n- **LOC :** ```<geogName>``` \n- **LIEUDIT_SITE :** ```<placeName type=\"lieudit_site\">```\n- **Entité inconnue :** ```<name type=\"generique\">```")
st.write("")
st.write("")
# Upload an xml file
uploaded_file = st.file_uploader("Importez un fichier XML (.xml)", type="xml")
if uploaded_file is not None:
# Collect the name of the uploaded file (for the export later)
updated_name = uploaded_file.name[:-4]
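                # uploaded_file.read() returns bytes; lxml's fromstring parses bytes directly, which also keeps any XML encoding declaration valid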
file_contents = uploaded_file.read()
# Launch prediction
if st.button("Lancer la prédiction"):
if len(file_contents) > 0:
with st.spinner("Application du modèle.."):
# Apply the ner model to an xml file
modified_xml = entities_to_xml(file_contents, ner)
if modified_xml is not None:
# Convert HTML entities back to characters
modified_xml = html.unescape(modified_xml)
st.write("")
st.subheader("Résultats :")
st.write("")
st.write("")
# Display the modified XML
with st.expander("Contenu XML modifié"):
# Wrap the code
# Source : blackary, https://discuss.streamlit.io/t/st-code-on-multiple-lines/50511/8
with stylable_container(
"codeblock",
"""
code {
white-space: pre-wrap !important;
}
""",
):
st.code(modified_xml, language="xml")
st.write("")
# Download the modified XML
st.download_button(
label="Télécharger le fichier xml modifié",
data=modified_xml,
file_name=updated_name + ".xml",
mime="xml",
)
# ===== MODE: CONLL2002 & CSV EXPORT =====
if choix_xml == "Ne pas conserver les balises (export conll2002 ou csv du <body> uniquement)":
st.write("")
# Checkbox to apply our custom sentence segmentation model
bouton_phraseur = st.checkbox(
"Cochez cette case pour resegmenter les phrases de votre document selon notre modèle entraîné sur des rapports d'opération")
st.write("")
st.write("")
# Upload an xml file
uploaded_file = st.file_uploader("Importez un fichier XML (.xml)", type="xml")
if uploaded_file is not None:
# Collect the name of the file (for the export later)
updated_name = uploaded_file.name[:-4]
file_contents = uploaded_file.read()
# Launch prediction
if st.button("Lancer la prédiction"):
if len(file_contents) > 0:
with st.spinner("Application du modèle.."):
st.write("")
# Strip the <body> of its tags
body_text = get_body_text(file_contents)
if body_text is not None:
# If requested, apply the sentence segmentation model
if bouton_phraseur:
body_text = apply_senter(senter, body_text)
# Apply the ner model
                                doc, list_nbsp = get_doc(ner, body_text)
                                entities, nbsp_text = get_entities(doc, list_nbsp)
st.write("")
st.subheader("Résultats :")
st.write("")
st.write("")
# Display the entities with displacy
with st.expander("Voir les entités dans le texte"):
my_displacy = create_displacy(nbsp_text, entities)
st.markdown(my_displacy, unsafe_allow_html=True)
st.write("")
# Download the results as a conll2002 file
doc_to_conll(doc, updated_name)
st.write("")
df = create_df(entities)
st.write("")
# Display the entities as a table
with st.expander("Voir les entités sous forme de tableau"):
st.write("")
st.dataframe(df, use_container_width=True)
csv = df_to_csv(df)
st.write("")
# Download the results as a csv file
st.download_button(
label="Télécharger le fichier CSV",
data=csv,
file_name=updated_name + ".csv",
mime="text/csv",
)
# Add a "footer"
st.markdown("# ")
st.markdown("# ")
if __name__ == "__main__":
main()