import streamlit as st
import spacy
from spacy import displacy
import pandas as pd
from io import StringIO, BytesIO
from lxml import etree
from bs4 import BeautifulSoup
import html
from streamlit_extras.stylable_container import stylable_container
import subprocess
import importlib.util
# This app was inspired by Lucas Terriel's NER4Archives Visualizer App (2022-2023), https://huggingface.co/spaces/ner4archives/ner4archives-NEL-vizualizer-app/tree/main
# Check out the NER4Archives project (INRIA-ALMAnaCH/Archives nationales) : https://github.com/NER4Archives-project
# ===== SOME SETTING UP =====
# Setting up the app's page
st.set_page_config(page_title="Arches Demo", page_icon="🏺")
# Path to the statics directory
statics = "./static"
# Making the radio widgets' titles bigger
# Source : arnaud, https://discuss.streamlit.io/t/how-to-change-font-size-of-streamlit-radio-widget-title/35945/2
st.markdown(
"""<style>
div[class*="stRadio"] > label > div[data-testid="stMarkdownContainer"] > p {
font-size: 17px;
}
</style>
""", unsafe_allow_html=True)
# Hiding the possibility to display pictures fullscreen
# Source : AvratanuBiswas, https://discuss.streamlit.io/t/hide-fullscreen-option-when-displaying-images-using-st-image/19792/2
st.markdown(
"""<style>
button[title="View fullscreen"]{
visibility: hidden;
}
</style>
""", unsafe_allow_html=True)
# Setting up the colors of the entity tags for displacy
ENTITIES_COLORS = {
"CHRONOLOGIE": "#ffb627",
"MOBILIER": "#6b7fd7",
"MATERIAU": "#d36582",
"STRUCTURE": "#00b2ca",
"TECHNIQUE_STYLE": "#ED6A5A",
"ESPECE": "#96C7FF",
"EDIFICE": "#9F86C0",
"ID": "#f65bff",
"LIEUDIT_SITE": "#d8e446",
"PERSONNE": "#D3B4B4",
"PEUPLE_CULTURE": "#d20000",
"LOC": "#81db72",
"DECOR": "#fff46a",
"ORG": "#887575",
"GPE": "#00a878"
}
OPTIONS = {
"ents":
[
"CHRONOLOGIE",
"MOBILIER",
"MATERIAU",
"STRUCTURE",
"TECHNIQUE_STYLE",
"ESPECE",
"EDIFICE",
"ID",
"LIEUDIT_SITE",
"PERSONNE",
"PEUPLE_CULTURE",
"LOC",
"DECOR",
"ORG",
"GPE"
],
"colors": ENTITIES_COLORS}
# ===== SIDEBAR =====
st.sidebar.title("ARCHES - Étude, composition et processus pour une édition structurée des rapports d’opérations archéologiques préventives")
st.sidebar.markdown("Avec ses 2200 collaborateurs, l’[Inrap](https://www.inrap.fr/) représente la plus importante structure publique de recherche archéologique française. De fait, chaque année, près de 2000 chantiers (diagnostics archéologiques et fouilles) sont réalisés en partenariat avec les aménageurs publics et privés, en France métropolitaine et dans les départements d’outre-mer. Les missions de l’Institut intégrant l’exploitation scientifique des résultats et la diffusion de la connaissance archéologique auprès du public, plus de 2000 rapports d’opération archéologique sont ainsi rédigés annuellement.")
st.sidebar.markdown("Financé avec le soutien du [Fonds National pour la Science Ouverte](https://www.ouvrirlascience.fr/accueil/) et réalisé en collaboration avec l’infrastructure de recherche [Métopes](http://www.metopes.fr/) ([Université de Caen Normandie](https://www.unicaen.fr/) - [CNRS](https://www.cnrs.fr/fr)), [ARCHES](https://www.inrap.fr/arches-etude-composition-et-processus-pour-une-edition-structuree-des-rapports-d-17145) vise à explorer l’amélioration de la diffusion et de l’exploitation des rapports d’opération à l’aide du format de balisage XML-TEI, permettant d’encoder tant la structuration formelle que le contenu sémantique d’un document. Dans cette optique, vingt-et-un rapports de fouilles de l’Inrap ont été annotés pour entraîner un modèle de reconnaissance des entités nommées (représentant plus de 80 000 entités annotées). Cette application vise à tester la manipulation du modèle, tant avec des fichiers XML que texte brut.")
st.sidebar.markdown("Le corpus a été annoté à l'aide d'[INCEpTION](https://inception-project.github.io/), tandis que les modèles de [segmentation](https://huggingface.co/a-menu/fr_arches_sentencizer) et de reconnaissance des entités nommées ([avec](https://huggingface.co/a-menu/fr_arches_ner_trf) et [sans](https://huggingface.co/a-menu/fr_arches_ner) architecture transformer) ont été entraînés et évalués avec [spaCy](https://spacy.io/). Les modalités de [citation](https://huggingface.co/spaces/a-menu/arches_demo/blob/main/CITATION.cff) de l'application peuvent être retrouvées dans le [dépôt](https://huggingface.co/spaces/a-menu/arches_demo/tree/main) de celle-ci.")
st.sidebar.write("")
st.sidebar.markdown("*ARCHES (Inrap), janvier 2024*")
st.sidebar.write("")
st.sidebar.write("")
st.sidebar.header("Partenaires")
st.sidebar.write("")
# Display logos
col1, col2, col3 = st.sidebar.columns(3)
col1.image(f"{statics}/logo_inrap.png", width=100)
col2.write("")
col2.image(f"{statics}/logo_ouvrir_la_science.png", width=100)
col3.image(f"{statics}/logo_mesr.png", width=100)
col1.image(f"{statics}/logo_ir_metopes.png", width=100)
col2.write("")
col2.write("")
col2.image(f"{statics}/logo_mrsh.jpg", width=100)
col3.image(f"{statics}/logo_unicaen.png", width=100)
col1.image(f"{statics}/logo_cnrs.png", width=80)
# ===== SOME FUNCTIONS =====
# Cached to prevent computation on every rerun
@st.cache_resource
def download_sentencizer():
"""
Downloads the fr_arches_sentencizer model.
:returns: None
"""
# Check if the model is already installed
# If not, install it
# Source : ice.nicer & Arthur, https://stackoverflow.com/a/41815890
check_senter = importlib.util.find_spec("fr_arches_sentencizer")
if check_senter is None:
subprocess.run(["pip", "install", "https://huggingface.co/a-menu/fr_arches_sentencizer/resolve/main/fr_arches_sentencizer-any-py3-none-any.whl"])
# Cached to prevent computation on every rerun
@st.cache_resource
def download_ner_trf():
"""
Downloads the fr_arches_ner_trf TRF NER model.
:returns: None
"""
# Check if the model is already installed
# If not, install it
# Source : ice.nicer & Arthur, https://stackoverflow.com/a/41815890
check_ner_trf = importlib.util.find_spec("fr_arches_ner_trf")
if check_ner_trf is None:
subprocess.run(["pip", "install", "https://huggingface.co/a-menu/fr_arches_ner_trf/resolve/main/fr_arches_ner_trf-any-py3-none-any.whl"])
# Cached to prevent computation on every rerun
@st.cache_resource
def download_ner():
"""
Downloads the fr_arches_ner NER model.
:returns: None
"""
# Check if the model is already installed
# If not, install it
# Source : ice.nicer & Arthur, https://stackoverflow.com/a/41815890
check_ner = importlib.util.find_spec("fr_arches_ner")
if check_ner is None:
subprocess.run(["pip", "install", "https://huggingface.co/a-menu/fr_arches_ner/resolve/main/fr_arches_ner-any-py3-none-any.whl"])
# Cached to prevent computation on every rerun
@st.cache_resource
def load_sentencizer():
"""
Loads our custom sentence segmentation model.
:returns: loaded fr_arches_sentencizer model
:rtype: spacy.lang.fr.French
"""
senter = spacy.load("fr_arches_sentencizer")
return senter
# Cached to prevent computation on every rerun
@st.cache_resource
def load_ner_trf():
"""
    Loads our custom fr_arches_ner_trf transformer-based NER model.
    :returns: loaded fr_arches_ner_trf model
:rtype: spacy.lang.fr.French
"""
ner = spacy.load("fr_arches_ner_trf")
# To try to reduce memory usage
config = {"attrs": {"tensor": None}}
ner.add_pipe("doc_cleaner", config=config)
return ner
# Cached to prevent computation on every rerun
@st.cache_resource
def load_ner():
"""
    Loads our custom fr_arches_ner NER model.
    :returns: loaded fr_arches_ner model
:rtype: spacy.lang.fr.French
"""
ner = spacy.load("fr_arches_ner")
# To try to reduce memory usage
config = {"attrs": {"tensor": None}}
ner.add_pipe("doc_cleaner", config=config)
return ner
def apply_senter(senter, data):
"""
Applies our custom sentence segmentation model on data.
:param senter: sentence segmentation model
:type senter: spacy.lang.fr.French
:param data: text to be segmented
:type data: str
:returns: sentencized text
:rtype: str
"""
mes_phrases = senter(data)
sentencized_text = ""
for sent in mes_phrases.sents:
sentencized_text += str(sent) + "\n"
return sentencized_text
def get_doc(ner, data):
"""
Applies our custom ner model on data.
:param ner: ner model
:type ner: spacy.lang.fr.French
:param data: text to be analyzed
:type data: str
    :returns: spacy doc and the positions of the original non-breaking spaces
    :rtype: tuple(spacy.tokens.doc.Doc, list)
"""
# Replace the non-breaking spaces (NBSP) with regular spaces before applying our model on the text. To do so:
# Create a list to store their position
list_nbsp = []
# Iterate over each character and save the position of the non-breaking spaces
for i, char in enumerate(data):
if char == "\u00A0":
list_nbsp.append(i)
# Once we have memorized the NBSP's positions, we replace them with regular spaces
data = data.replace("\u00A0", " ")
# Apply the NER model on our data
doc = ner(data)
return doc, list_nbsp
def get_entities(doc, list_nbsp):
"""
Extracts the named entities from the doc.
:param doc: spacy doc
:type doc: spacy.tokens.doc.Doc
    :param list_nbsp: positions of the original non-breaking spaces
    :type list_nbsp: list
    :returns: list of named entities and the text with its non-breaking spaces restored
    :rtype: tuple(list, str)
    """
# Put back the NBSP
characters_with_nbsp = [char if i not in list_nbsp else "\u00A0" for i, char in enumerate(doc.text)]
# Convert the list back to a string
nbsp_text = "".join(characters_with_nbsp)
entities = []
for ent in doc.ents:
# We collect :
# The named entity (using its position since the tokenizer would sometimes add unwanted spaces, for instance before a comma)
# Its label
# Its position
entities.append((nbsp_text[ent.start_char:ent.end_char].strip(), ent.label_, ent.start_char, ent.end_char))
return entities, nbsp_text
def create_displacy(text, entities):
"""
Render named entities using displacy.
:param text: input text
:type text: str
:param entities: list of named entities with start and end character positions
:type entities: list
:returns: showcase of entities with displacy
:rtype: str
"""
# Prepare data for displacy
entity_data = [{"start": ent[2], "end": ent[3], "label": ent[1]} for ent in entities]
# Render using displacy
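    # manual=True makes displacy render the pre-computed dicts ({"text", "ents"}) instead of analysing a spaCy Doc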
my_displacy = displacy.render([{"text": text, "ents": entity_data}], style="ent", options=OPTIONS, manual=True)
return my_displacy
def create_df(entities):
"""
Creates a dataframe to display the named entities found in text.
:param entities: named entities
:type entities: list
:returns: dataframe
:rtype: pd.DataFrame
"""
df = pd.DataFrame(entities, columns=["ENTITE",
"LABEL",
"DEBUT",
"FIN"
])
return df
def df_to_csv(df_to_convert):
"""
Converts df to csv.
:param df_to_convert: dataframe to be converted to csv
:type df_to_convert: pd.DataFrame
    :returns: csv content
    :rtype: str
"""
return df_to_convert.to_csv(encoding="utf-8")
def doc_to_conll(doc, updated_name=False):
"""
Converts a doc and its entities to a conll2002 file.
:param doc: spacy doc
:type doc: spacy.tokens.doc.Doc
    :param updated_name: base name to give the downloaded file, or False to fall back to a default name
    :type updated_name: str or bool
    :returns: button to download the conll2002 file
    :rtype: bool
"""
# Writing to a BytesIO object to get the byte content
with BytesIO() as sortie_buffer:
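        # conll2002 layout: one token per line followed by its (IOB-prefixed) tag, with a blank line for each line break in the source text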
for tok in doc:
# Convert a named entity to conll2002
if tok.ent_type and tok.text != "\n":
sortie_buffer.write(f"{tok.text} {tok.ent_iob_}-{tok.ent_type_}\n".encode("utf-8"))
# Convert a token without a named entity to conll2002
else:
if tok.text != "\n" and tok.ent_iob_:
sortie_buffer.write(f"{tok.text} {tok.ent_iob_}\n".encode("utf-8"))
# Write a single empty line for each new line in the original text
else:
sortie_buffer.write(b"\n")
# Move the buffer position to the beginning for reading
sortie_buffer.seek(0)
# Check if the buffer has a line only consisting of "O\n" and delete it
buffer_content = sortie_buffer.getvalue().decode("utf-8")
lines = buffer_content.split("\n")
modified_lines = [line for line in lines if line.strip() != "O"]
modified_buffer_content = "\n".join(modified_lines)
        # Write the modified content back to the buffer and drop any leftover bytes from the original, longer content
        sortie_buffer.seek(0)
        sortie_buffer.write(modified_buffer_content.encode("utf-8"))
        sortie_buffer.truncate()
# Move the buffer position to the beginning for reading
sortie_buffer.seek(0)
# If we have an uploaded file: update the name of the exported file.
if updated_name:
my_button = st.download_button(
label="Télécharger le fichier CoNLL2002",
data=sortie_buffer,
file_name=updated_name + ".conll"
)
# If we have no uploaded file ('example on the go' mode): use a default name for the exported file.
else:
my_button = st.download_button(
label="Télécharger le fichier CoNLL2002",
data=sortie_buffer,
file_name="prediction_arches.conll"
)
return my_button
def get_body_text(xml_input):
"""
Parses an xml file and returns its <body>.
    :param xml_input: xml content to be parsed
    :type xml_input: bytes or str
:returns: the <body> if successful, None otherwise
:rtype: str or None
"""
try:
# Parse XML content
parser = etree.XMLParser(recover=True)
root = etree.fromstring(xml_input, parser=parser)
# Find <body> element in the XML namespace
body = root.xpath("//tei:body", namespaces={"tei": "http://www.tei-c.org/ns/1.0"})
if body:
body_element = body[0]
if len(body_element) > 0:
# Extract the text content
body_soup = BeautifulSoup(etree.tostring(body_element), "html.parser")
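                # get_text(separator=" ", strip=True) flattens every nested tag into plain text, keeping a space between tag contents so words don't run together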
body_text = body_soup.get_text(separator=" ", strip=True)
return body_text
else:
st.warning("L'élément <body> est vide.")
return None
else:
st.warning("Aucun élément <body> n'a été détecté dans le fichier XML.")
return None
except etree.XMLSyntaxError:
st.warning("Format XML incorrect. Veuillez importer un fichier XML valide.")
return None
def xml_mapping(entity, label):
"""
Create an XML element based on an entity's given label.
:param entity: entity text
:type entity: str
:param label: entity label
:type label: str
:returns: custom XML element if successful, default <name> element if not
:rtype: etree.Element
"""
element_mapping = {
"CHRONOLOGIE": {"tag": "date"},
"DECOR": {"tag": "name", "attrib": {"type": "decor"}},
"EDIFICE": {"tag": "placeName", "attrib": {"type": "edifice"}},
"ESPECE": {"tag": "name", "attrib": {"type": "espece"}},
"GPE": {"tag": "placeName"},
"ID": {"tag": "idno", "attrib": {"type": "entite"}},
"LIEUDIT_SITE": {"tag": "placeName", "attrib": {"type": "lieudit_site"}},
"LOC": {"tag": "geogName"},
"MATERIAU": {"tag": "material"},
"MOBILIER": {"tag": "objectType"},
"ORG": {"tag": "orgName"},
"PERSONNE": {"tag": "persName"},
"PEUPLE_CULTURE": {"tag": "orgName", "attrib": {"type": "peuple_culture"}},
"STRUCTURE": {"tag": "name", "attrib": {"type": "structure"}},
"TECHNIQUE_STYLE": {"tag": "name", "attrib": {"type": "technique_style"}},
}
mapping = element_mapping.get(label)
if mapping:
xml_tag = etree.Element(mapping["tag"], attrib=mapping.get("attrib", {}))
xml_tag.text = entity
return xml_tag
# If the mapping is impossible, encode the entity with a default <name type="generique">
else:
st.warning(f"Mapping introuvable pour le label : {label}. Entité encodée par conséquence comme : <name type=\"generique\">.")
return etree.Element("name", attrib={"type": "generique"})
def entities_to_xml(xml_content, ner):
"""
Process XML content by replacing identified entities with XML elements.
    :param xml_content: original xml content
    :type xml_content: bytes or str
:param ner: ner model
:type ner: spacy.lang.fr.French
:returns: modified XML content if successful, None otherwise
:rtype: str or None
"""
try:
# Parse XML content
parser = etree.XMLParser(recover=True)
root = etree.fromstring(xml_content, parser=parser)
# Find <body> element in the XML namespace
body = root.xpath("//tei:body", namespaces={"tei": "http://www.tei-c.org/ns/1.0"})
if body:
body_element = body[0]
if len(body_element) > 0:
# Strip the <body> of the <hi> tags
etree.strip_tags(body_element, "{http://www.tei-c.org/ns/1.0}hi")
# Get the <body>'s descendants
descendants = body_element.xpath("descendant::*")
# Iterate through all descendants in the <body>
for descendant in descendants:
# Apply the ner model on the text of the descendant
if descendant.text:
                        doc, list_nbsp = get_doc(ner, descendant.text)
                        entities = get_entities(doc, list_nbsp)[0]
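                        # Process entities from the end of the text towards the beginning so inserting tags does not shift the offsets still to be used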
entities.sort(key=lambda ent: ent[2], reverse=True)
for ent in entities:
xml_tag = xml_mapping(ent[0], ent[1])
start_index = ent[2]
end_index = ent[3]
descendant.text = (
descendant.text[:start_index]
+ etree.tostring(xml_tag, encoding="unicode")
+ descendant.text[end_index:]
)
# Apply the ner model on the tail of the descendant
if descendant.tail:
                        doc_tail, list_nbsp_tail = get_doc(ner, descendant.tail)
                        entities_tail = get_entities(doc_tail, list_nbsp_tail)[0]
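                        # Same as above: work backwards so earlier offsets stay valid while tags are inserted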
entities_tail.sort(key=lambda ent: ent[2], reverse=True)
for ent_tail in entities_tail:
xml_tag_tail = xml_mapping(ent_tail[0], ent_tail[1])
start_index_tail = ent_tail[2]
end_index_tail = ent_tail[3]
descendant.tail = (
descendant.tail[:start_index_tail]
+ etree.tostring(xml_tag_tail, encoding="unicode")
+ descendant.tail[end_index_tail:]
)
# Export modified XML content
modified_xml = etree.tostring(root, xml_declaration=True, pretty_print=True, encoding="utf-8").decode("utf-8")
return modified_xml
else:
st.warning("L'élément <body> est vide.")
return None
else:
st.warning("Aucun élément <body> n'a été détecté dans le fichier XML.")
return None
except etree.XMLSyntaxError:
st.error("Format XML incorrect. Veuillez importer un fichier XML valide.")
return None
# ===== BODY OF THE PAGE =====
st.title("La reconnaissance des entités nommées dans le projet ARCHES")
st.header("Visualisation & extraction")
st.write("")
st.write("")
# Check GPU presence
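# spacy.prefer_gpu() switches spaCy to the GPU (via cupy) when one is available and returns True, otherwise it stays on CPU and returns False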
gpu = spacy.prefer_gpu()
if gpu:
st.success("GPU détecté avec succès")
else:
st.warning("Aucun GPU détecté, l'application du modèle pourra nécessiter un certain temps. Considérez une [installation locale du dépôt](https://huggingface.co/spaces/a-menu/arches_demo/blob/main/guide_installation_locale_et_gpu.md) si besoin.")
st.write("")
st.write("")
def main():
# Download and load our models
# Sentencizer
download_sentencizer()
senter = load_sentencizer()
# NER
# Choose which NER model you want
pick_model = st.radio("Quel modèle de reconnaissance d'entités nommées souhaitez-vous utiliser ?", ("fr_arches_ner (plus léger en ressources mais moins efficace)", "fr_arches_ner_trf (plus lourd en ressources mais plus efficace, GPU conseillé)"))
st.write("")
st.write("")
if pick_model == "fr_arches_ner (plus léger en ressources mais moins efficace)":
download_ner()
ner = load_ner()
if pick_model == "fr_arches_ner_trf (plus lourd en ressources mais plus efficace, GPU conseillé)":
download_ner_trf()
ner = load_ner_trf()
with st.expander("Au sujet des entités nommées recherchées"):
st.markdown("**Les différents types d'entités sont :** \n\n- **CHRONOLOGIE :** utilisé pour les références chronologiques (\"Antiquité\", \"XIIe siècle\", \"200 av. n. ère\", etc.). \n- **MOBILIER :** utilisé pour le mobilier (\"os\", \"pot\", \"tuile\", etc.). \n- **STRUCTURE :** utilisé pour les structures archéologiques (\"fosse\", \"mur\", \"fossé\", \"foyer\", etc.). \n- **MATERIAU :** utilisé pour les matériaux (\"bronze\", \"dolérite\", \"terre cuite\", etc.). \n- **ID :** utilisé pour les identifiants de vestiges (\"4\" pour \"le fossé 4\" par exemple). \n- **TECHNIQUE_STYLE :** utilisé pour les mentions de techniques et styles de fabrication ou construction (\"taillé\", \"glaçuré\", \"en petit appareil\", etc.). \n- **DECOR :** utilisé pour les éléments de décor. \n- **ESPECE :** utilisé pour signaler les taxons et noms vernaculaires rencontrés dans le texte. \n- **EDIFICE :** utilisé pour les édifices et monuments nommés (\"église Saint-Paul\", \"pont du Gard\", etc.). \n- **PEUPLE_CULTURE :** utilisé pour les cultures et peuples évoqués (tribus gauloises, cultures préhistoriques, etc.). \n- **PERSONNE :** utilisé pour les noms de personnes (historiques, fictives, équipe scientifique, etc.). \n- **ORG :** utilisé pour les institutions, sociétés, laboratoires, universités, musées, archives, etc. \n- **GPE :** utilisé pour les entités géopolitiques (villes, départements, États, etc.). \n- **LOC :** utilisé pour les lieux non-GPE (lieux naturels par exemple). \n- **LIEUDIT_SITE :** utilisé pour les lieux-dits et noms de sites archéologiques.")
st.write("")
# Select input type
use_type = st.radio("Veuillez choisir le type de données à analyser :", ("Taper un exemple", "Importer un fichier texte", "Importer un fichier xml-tei"))
st.write("")
# ===== MODE: EXAMPLE ON THE GO =====
if use_type == "Taper un exemple":
# Checkbox to apply our custom sentence segmentation model
bouton_phraseur = st.checkbox("Cochez cette case pour resegmenter les phrases de votre document selon notre modèle entraîné sur des rapports d'opération")
st.write("")
st.write("")
# Create a text area
raw_text = st.text_area("Veuillez saisir votre exemple dans le bloc ci-dessous (max. 5000 caractères)", "La fosse 34 a livré des restes de pinces en bronze et quelques grains d'orge.", max_chars=5000)
st.write("")
# Launch prediction
if st.button("Lancer la prédiction"):
if len(raw_text) > 0:
with st.spinner("Application du modèle.."):
# If requested, apply the sentence segmentation model
if bouton_phraseur:
raw_text = apply_senter(senter, raw_text)
# Apply the ner model
                    doc, list_nbsp = get_doc(ner, raw_text)
                    entities, nbsp_text = get_entities(doc, list_nbsp)
st.write("")
st.subheader("Résultats :")
st.write("")
st.write("")
# Display the entities with displacy
my_displacy = create_displacy(nbsp_text, entities)
st.markdown(my_displacy, unsafe_allow_html=True)
st.write("")
# Download results as a conll2002 file
doc_to_conll(doc)
st.write("")
df = create_df(entities)
st.write("")
# Display the entities as a table
st.markdown("**Tableau regroupant les entités détectées**")
st.write("")
st.dataframe(df, use_container_width=True)
csv = df_to_csv(df)
st.write("")
# Download results as a csv file
st.download_button(
label="Télécharger le fichier CSV",
data=csv,
file_name="prediction_arches.csv",
mime="text/csv",
)
else:
st.warning("Veuillez saisir un exemple.")
# ===== MODE: LOAD A PLAIN TEXT FILE =====
if use_type == "Importer un fichier texte":
# Checkbox to apply our custom sentence segmentation model
bouton_phraseur = st.checkbox("Cochez cette case pour resegmenter les phrases de votre document selon notre modèle entraîné sur des rapports d'opération")
st.write("")
st.write("")
# Upload a plain text file
uploaded_file = st.file_uploader("Importez un fichier texte (.txt)", type="txt")
if uploaded_file is not None:
# Collect the name of the uploaded file (for the future export)
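            # name[:-4] drops the ".txt" extension, keeping only the base name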
updated_name = uploaded_file.name[:-4]
stringio = StringIO(uploaded_file.getvalue().decode("utf-8"))
file_contents = stringio.read()
# Launch prediction
if st.button("Lancer la prédiction"):
if len(file_contents) > 0:
with st.spinner("Application du modèle.."):
# If requested, apply the sentence segmentation model
if bouton_phraseur:
file_contents = apply_senter(senter, file_contents)
# Apply the ner model
                        doc, list_nbsp = get_doc(ner, file_contents)
                        entities, nbsp_text = get_entities(doc, list_nbsp)
st.write("")
st.subheader("Résultats :")
st.write("")
st.write("")
# Display the entities with displacy
with st.expander("Voir les entités dans le texte"):
my_displacy = create_displacy(nbsp_text, entities)
st.markdown(my_displacy, unsafe_allow_html=True)
st.write("")
# Download the results as a conll2002 file
doc_to_conll(doc, updated_name)
st.write("")
df = create_df(entities)
st.write("")
# Display the entities as a table
with st.expander("Voir les entités sous forme de tableau"):
st.write("")
st.dataframe(df, use_container_width=True)
csv = df_to_csv(df)
st.write("")
# Download the results as a csv file
st.download_button(
label="Télécharger le fichier CSV",
data=csv,
file_name=updated_name + ".csv",
mime="text/csv",
)
else:
st.warning("Le fichier importé est vide.")
# ===== MODE: LOAD AN XML FILE =====
if use_type == "Importer un fichier xml-tei":
# User chooses between xml or conll2002 & csv export
        choix_xml = st.radio("Comment souhaitez-vous appliquer le modèle sur le <body> ?", ("Conserver les balises (export xml de l'intégralité* du fichier importé)", "Ne pas conserver les balises (export conll2002 ou csv du <body> uniquement)"))
# ===== MODE: XML EXPORT =====
if choix_xml == "Conserver les balises (export xml de l'intégralité* du fichier importé)":
st.write("")
st.error("\* À l'exception des balises <hi> du body.")
st.write("")
with st.expander("Au sujet du mapping XML des entités nommées"):
st.markdown(
"**Les entités nommées ont été converties comme suit :** \n\n- **CHRONOLOGIE :** ```<date>``` \n- **MOBILIER :** ```<objectType>``` \n- **STRUCTURE :** ```<name type=\"structure\">``` \n- **MATERIAU :** ```<material>``` \n- **ID :** ```<idno type=\"entite\">``` \n- **TECHNIQUE_STYLE :** ```<name type=\"technique_style\">``` \n- **DECOR :** ```<name type=\"decor\">``` \n- **ESPECE :** ```<name type=\"espece\">``` \n- **EDIFICE :** ```<placeName type=\"edifice\">``` \n- **PEUPLE_CULTURE :** ```<orgName type=\"peuple_culture\">``` \n- **PERSONNE :** ```<persName>``` \n- **ORG :** ```<orgName>``` \n- **GPE :** ```<placeName>``` \n- **LOC :** ```<geogName>``` \n- **LIEUDIT_SITE :** ```<placeName type=\"lieudit_site\">```\n- **Entité inconnue :** ```<name type=\"generique\">```")
st.write("")
st.write("")
# Upload an xml file
uploaded_file = st.file_uploader("Importez un fichier XML (.xml)", type="xml")
if uploaded_file is not None:
# Collect the name of the uploaded file (for the export later)
updated_name = uploaded_file.name[:-4]
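                # uploaded_file.read() returns bytes; lxml's fromstring parses bytes directly, which also keeps any XML encoding declaration valid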
file_contents = uploaded_file.read()
# Launch prediction
if st.button("Lancer la prédiction"):
if len(file_contents) > 0:
with st.spinner("Application du modèle.."):
# Apply the ner model to an xml file
modified_xml = entities_to_xml(file_contents, ner)
if modified_xml is not None:
# Convert HTML entities back to characters
modified_xml = html.unescape(modified_xml)
st.write("")
st.subheader("Résultats :")
st.write("")
st.write("")
# Display the modified XML
with st.expander("Contenu XML modifié"):
# Wrap the code
# Source : blackary, https://discuss.streamlit.io/t/st-code-on-multiple-lines/50511/8
with stylable_container(
"codeblock",
"""
code {
white-space: pre-wrap !important;
}
""",
):
st.code(modified_xml, language="xml")
st.write("")
# Download the modified XML
st.download_button(
label="Télécharger le fichier xml modifié",
data=modified_xml,
file_name=updated_name + ".xml",
mime="xml",
)
# ===== MODE: CONLL2002 & CSV EXPORT =====
if choix_xml == "Ne pas conserver les balises (export conll2002 ou csv du <body> uniquement)":
st.write("")
# Checkbox to apply our custom sentence segmentation model
bouton_phraseur = st.checkbox(
"Cochez cette case pour resegmenter les phrases de votre document selon notre modèle entraîné sur des rapports d'opération")
st.write("")
st.write("")
# Upload an xml file
uploaded_file = st.file_uploader("Importez un fichier XML (.xml)", type="xml")
if uploaded_file is not None:
# Collect the name of the file (for the export later)
updated_name = uploaded_file.name[:-4]
file_contents = uploaded_file.read()
# Launch prediction
if st.button("Lancer la prédiction"):
if len(file_contents) > 0:
with st.spinner("Application du modèle.."):
st.write("")
# Strip the <body> of its tags
body_text = get_body_text(file_contents)
if body_text is not None:
# If requested, apply the sentence segmentation model
if bouton_phraseur:
body_text = apply_senter(senter, body_text)
# Apply the ner model
                                doc, list_nbsp = get_doc(ner, body_text)
                                entities, nbsp_text = get_entities(doc, list_nbsp)
st.write("")
st.subheader("Résultats :")
st.write("")
st.write("")
# Display the entities with displacy
with st.expander("Voir les entités dans le texte"):
my_displacy = create_displacy(nbsp_text, entities)
st.markdown(my_displacy, unsafe_allow_html=True)
st.write("")
# Download the results as a conll2002 file
doc_to_conll(doc, updated_name)
st.write("")
df = create_df(entities)
st.write("")
# Display the entities as a table
with st.expander("Voir les entités sous forme de tableau"):
st.write("")
st.dataframe(df, use_container_width=True)
csv = df_to_csv(df)
st.write("")
# Download the results as a csv file
st.download_button(
label="Télécharger le fichier CSV",
data=csv,
file_name=updated_name + ".csv",
mime="text/csv",
)
# Add a "footer"
st.markdown("# ")
st.markdown("# ")
if __name__ == "__main__":
main()