"""Gradio demo: score a tweet for geocoding relevancy and predict coordinates.

The flow implemented below is: clean the tweet text, run the relevancy
classifier, and, only when the tweet is labelled ``relevant``, run the
coordinates model and reverse-geocode the predicted latitude/longitude
through an HTTP API.
"""

import re

import gradio as gr
import pandas as pd
import requests
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    pipeline,
)

# Patterns used by process_tweet, compiled once at import time.
# Raw strings keep ``\s`` and ``\.`` from being read as (invalid) string
# escapes.
_LINK_RE = re.compile(r'(www\.[^\s]+)|(https?://[^\s]+)')
_USERNAME_RE = re.compile(r'@[^\s]+')
_WHITESPACE_RE = re.compile(r'\s+')
_HASHTAG_RE = re.compile(r'#([^\s]+)')

REVERSE_GEOCODE_URL = 'https://geocode.maps.co/reverse'
# Seconds to wait for the reverse-geocoding service; without a timeout a
# stalled connection would block the UI callback indefinitely.
REVERSE_GEOCODE_TIMEOUT = 10


def process_tweet(tweet: str) -> str:
    """Return *tweet* with links and @usernames removed and hashtags unwrapped.

    Steps, in order: drop ``www.``/``http(s)://`` links, drop ``@name``
    tokens, collapse every whitespace run to a single space, replace
    ``#tag`` with ``tag``, then strip leading/trailing quote characters.
    """
    # remove links
    tweet = _LINK_RE.sub('', tweet)
    # remove usernames
    tweet = _USERNAME_RE.sub('', tweet)
    # remove additional white spaces
    tweet = _WHITESPACE_RE.sub(' ', tweet)
    # replace hashtags with words
    tweet = _HASHTAG_RE.sub(r'\1', tweet)
    # trim surrounding single/double quotes (whitespace is left untouched)
    return tweet.strip('\'"')


# Models are loaded once at import time and shared by every request.
tokenizer = AutoTokenizer.from_pretrained(
    "azamat/geocoder_coordinates_model"
)
relevancy_pipeline = pipeline(
    "sentiment-analysis", model="azamat/geocoder_relevancy_model"
)
coordinates_model = AutoModelForSequenceClassification.from_pretrained(
    "azamat/geocoder_coordinates_model",
)


def predict_relevancy(text):
    """Return ``(label, score)`` of the first relevancy pipeline result."""
    first = relevancy_pipeline(text)[0]
    return first['label'], first['score']


def predict_coordinates(text):
    """Return the model's two output values for *text*, rounded to 3 places.

    The caller treats the pair as ``(lat, lon)``.
    """
    encoding = tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=128,
        return_tensors='pt',
    )
    outputs = coordinates_model(**encoding)
    # First element of the model output, first row: two scalar values.
    row = outputs[0][0]
    return round(row[0].item(), 3), round(row[1].item(), 3)


def reverse_geocode(lat, lon):
    """Return the ``display_name`` reported for ``(lat, lon)``.

    Best effort: any network failure, HTTP error status, non-JSON body or
    response without a ``display_name`` field yields the string
    ``"No data"`` instead of raising.
    """
    payload = {
        'lat': lat,
        'lon': lon,
        'zoom': 12,
        'format': 'jsonv2',
        'accept-language': 'en',
    }
    try:
        response = requests.get(
            REVERSE_GEOCODE_URL,
            params=payload,
            timeout=REVERSE_GEOCODE_TIMEOUT,
        )
        response.raise_for_status()
        return response.json()['display_name']
    except (requests.RequestException, ValueError, KeyError, TypeError):
        # Narrower than a bare ``except`` so KeyboardInterrupt/SystemExit
        # still propagate, while every lookup failure keeps the fallback.
        return "No data"


def predict(text):
    """Run the full pipeline on *text* and return a one-row DataFrame.

    Columns: ``relevancy_score`` (percentage, 2 decimals), ``lat``,
    ``lon`` and ``reversed lat/lon``. When the tweet is not labelled
    ``relevant`` the row keeps its defaults (zeros and an empty string).
    """
    text = process_tweet(text)
    data = {
        "relevancy_score": 0,
        "lat": 0,
        "lon": 0,
        "reversed lat/lon": "",
    }

    relevancy_label, relevancy_score = predict_relevancy(text)
    if relevancy_label == 'relevant':
        data['relevancy_score'] = round(relevancy_score * 100, 2)
        lat, lon = predict_coordinates(text)
        data['lat'] = lat
        data['lon'] = lon
        data['reversed lat/lon'] = reverse_geocode(lat, lon)

    return pd.DataFrame([data])


with gr.Blocks() as demo:
    # NOTE(review): the Markdown texts keep the line breaks exactly as they
    # appear in the source (heading markers on their own line) - confirm
    # this is the intended rendering.
    gr.Markdown("# **\n\nTwitter geocoding with 🤗 Transformers\n\n**")
    gr.Markdown("###\nPipeline consists of:\n")
    gr.Markdown(
        "###\n1) Relevancy scoring model - predicts whether a tweet has "
        "geocoding related information\n"
    )
    gr.Markdown(
        "###\n2) Coordinate predicting model - predicts exact latitude and "
        "longitude of user by tweet\n"
    )
    gr.Markdown(
        "###\n3) Nominatim API for reverse geocoding lat/lon - uses open "
        "street map to reverse geocode lat and lon\n"
    )
    inputs = gr.Textbox(placeholder="Enter the tweet")
    outputs = [gr.Dataframe(label="Geocoded data")]
    # Pressing Enter in the textbox runs the pipeline and fills the table.
    inputs.submit(predict, inputs=inputs, outputs=outputs)

if __name__ == "__main__":
    demo.launch()