Eurovoc dataset for translation or conversion link
#2
by
scampion
- opened
If you need to translate Eurovoc terms, you can download the XML version from the official repository.
You can also utilize a SPARQL endpoint for this purpose.
import pandas as pd
import time
from SPARQLWrapper import SPARQLWrapper, JSON
from tqdm import tqdm
def get_eurovoc_code(label, max_retries=5, sleep_time=5):
    """Return the Eurovoc code for a given English preferred label.

    Queries the Publications Office SPARQL endpoint for a resource whose
    ``skos:prefLabel`` is exactly ``label`` (language tag ``en``) and whose
    URI contains ``http://eurovoc.europa.eu/``.

    Args:
        label: The English label to look up. It is converted with ``str()``
            and escaped before being embedded in the query.
        max_retries: Maximum number of attempts when the endpoint answers
            with an "HTTP Error 503".
        sleep_time: Seconds to wait after each 503 response.

    Returns:
        The last path segment of the first matching URI, or ``None`` when
        the query returns no bindings.

    Raises:
        Exception: If every attempt failed with an "HTTP Error 503". Any
            other error raised by the query is propagated unchanged.
    """
    # Escape the characters that would otherwise terminate or corrupt the
    # double-quoted SPARQL string literal (backslash must be handled first).
    escaped_label = (
        str(label)
        .replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )

    # Define the SPARQL endpoint and set the return format
    sparql = SPARQLWrapper("https://publications.europa.eu/webapi/rdf/sparql")
    sparql.setReturnFormat(JSON)

    query = f"""
PREFIX cdm: <http://publications.europa.eu/ontology/cdm#>
SELECT * WHERE {{
?eurovoc <http://www.w3.org/2004/02/skos/core#prefLabel> "{escaped_label}"@en.
FILTER(regex(str(?eurovoc), "http://eurovoc.europa.eu/" ))
}}
"""
    sparql.setQuery(query)

    last_error = None
    for _ in range(max_retries):
        try:
            results = sparql.query().convert()
        except Exception as e:
            # Only a 503 is treated as transient; everything else is
            # re-raised with its original traceback.
            if "HTTP Error 503" not in str(e):
                raise
            last_error = e
            time.sleep(sleep_time)
            continue

        bindings = results["results"]["bindings"]
        if not bindings:
            return None  # no code found
        # The Eurovoc code is assumed to be the last part of the URI.
        return bindings[0]["eurovoc"]["value"].split("/")[-1]

    raise Exception(
        f"Failed after {max_retries} attempts for label: {label}"
    ) from last_error