|
import gradio as gr |
|
import string |
|
import re |
|
import pandas as pd |
|
|
|
from transformers import pipeline |
|
|
|
# Identifier of the model checkpoint loaded by the token-classification
# pipeline below (a multilingual BERT fine-tune, going by its name).
model_checkpoint = "Didier/bert-base-multilingual-uncased-finetuned-postal-can"

# Built once at import time. parse_postal_address_can() calls it for every
# request and reads `entity_group`, `score`, `word`, `start` and `end` from
# each result it returns.
token_classifier = pipeline(
    task="token-classification",
    model=model_checkpoint,
    aggregation_strategy="simple",
)
|
|
|
|
|
|
|
|
|
def replace_punctuation_with_space(text):
    """Return `text` with every ASCII punctuation character turned into a space.

    Each character listed in `string.punctuation` is swapped one-for-one for
    a single space, so the length of the string does not change. All other
    characters (letters, digits, whitespace, non-ASCII symbols) are kept.

    Args:
        text: The input string.

    Returns:
        The string with punctuation characters replaced by spaces.
    """
    # str.translate accepts a mapping from code point to replacement string.
    space_for = dict.fromkeys(map(ord, string.punctuation), ' ')
    return text.translate(space_for)
|
|
|
def replace_multiple_spaces(text):
    """Collapse every run of whitespace in a string into a single space.

    The pattern matches any whitespace character (space, tab, newline, ...)
    and also runs of length one, so a lone tab or line break becomes a space
    too. Leading and trailing whitespace is reduced to one space, not removed.

    Args:
        text: The input string.

    Returns:
        The string with each whitespace run replaced by a single space.
    """
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', text)
|
|
|
# Column order of the table returned by parse_postal_address_can().
_ADDRESS_COLUMNS = ('entity_group', 'score', 'word', 'start', 'end')


def parse_postal_address_can(text):
    """Parse the given Canadian address into its components.

    The input is lower-cased, periods are removed and whitespace runs
    (including line breaks) are collapsed to single spaces before the text
    is handed to `token_classifier`.

    Args:
        text: The address as free text. Empty, whitespace-only or None input
            is accepted and yields an empty table.

    Returns:
        A pandas DataFrame with one row per result of the classifier and the
        columns `entity_group`, `score` (a string with two decimals), `word`,
        `start` and `end`. `start` and `end` are copied as reported by the
        classifier, which only sees the normalized text, not the raw input.
        The columns are present even when the table has no rows.
    """
    # Nothing to classify: skip the model call and still return the headers
    # so the output table keeps its shape.
    if not text or not text.strip():
        return pd.DataFrame(columns=list(_ADDRESS_COLUMNS))

    normalized = replace_multiple_spaces(text.lower().replace(".", ""))

    rows = [
        {
            'entity_group': entity['entity_group'],
            'score': f"{entity['score']:.2f}",
            'word': entity['word'],
            'start': entity['start'],
            'end': entity['end'],
        }
        for entity in token_classifier(normalized)
    ]

    # Explicit columns keep the headers when the classifier finds no entity
    # (a DataFrame built from an empty list would otherwise have no columns).
    return pd.DataFrame(rows, columns=list(_ADDRESS_COLUMNS))
|
|
|
|
|
|
|
|
|
|
|
with gr.Blocks() as demo:

    # Page heading.
    gr.Markdown("""
    ## Canadian postal address parsing - version 0.02
    """)

    # Both components are created with render=False and are handed to the
    # gr.Interface below as its input and output.
    input_text = gr.Textbox(
        label="Canadian postal address",
        placeholder="Enter Canadian postal address to parse",
        lines=5,
        render=False,
    )
    output = gr.DataFrame(value=None, row_count=6, render=False)

    # Sample addresses; each example row holds the value of the single input.
    sample_addresses = (
        "405-200 René Lévesque Blvd W, Montreal, Quebec H2Z 1X4",
        "1 Sussex Dr, Ottawa, ON K1A 0A1",
        "5124 53 St, #205, Yellowknife, Northwest Territories, X1A 1V6",
    )
    examples = [[address] for address in sample_addresses]

    # Wire the parser to the input/output components declared above.
    gr.Interface(
        fn=parse_postal_address_can,
        inputs=[input_text],
        outputs=[output],
        examples=examples,
    )

    # Collapsed-by-default notes about labels, training data and limitations.
    with gr.Accordion("Documentation", open=False):
        gr.Markdown("""
        - Labels (address components):
            - O, STREET_NB, STREET_NAME, UNIT, CITY, REGION, POSTCODE
        - Dataset trained on:
            - 15+ million Canadian postal addresses available at OpenAddresses.io
        - (Current) Limitations:
            - no label for person_name / company_name (no data to train on)
            - trained on **post-normalized** addresses from OpenAddresses.io,
              hence missing un-normalized forms. E.g. "ST" (for street), but
              no training data with "street", "str.", ...
        - Enhancements:
            - Additional de-normalization of training data
            - Addition of person / companies names to the training data
            - Post-processing of results
        """)


demo.launch()