luisespinosa's picture
Update README.md
f3a03da

Twitter-roBERTa-base for Emoji prediction

This is a roBERTa-base model trained on ~58M tweets and finetuned for emoji prediction with the TweetEval benchmark.

Example of classification

from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import urllib.request

# Preprocess text (username and link placeholders)
def preprocess(text):
    new_text = []
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)

# Tasks:
# emoji, emotion, hate, irony, offensive, sentiment
# stance/abortion, stance/atheism, stance/climate, stance/feminist, stance/hillary

task='emoji'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

tokenizer = AutoTokenizer.from_pretrained(MODEL)

# download label mapping
labels=[]
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
    csvreader = csv.reader(html, delimiter='\t')
labels = [row[1] for row in csvreader if len(row) > 1]

# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
model.save_pretrained(MODEL)

text = "Looking forward to Christmas"
text = preprocess(text)
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
scores = output[0][0].detach().numpy()
scores = softmax(scores)

# # TF
# model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)
# model.save_pretrained(MODEL)

# text = "Looking forward to Christmas"
# text = preprocess(text)
# encoded_input = tokenizer(text, return_tensors='tf')
# output = model(encoded_input)
# scores = output[0][0].numpy()
# scores = softmax(scores)

ranking = np.argsort(scores)
ranking = ranking[::-1]
for i in range(scores.shape[0]):
    l = labels[ranking[i]]
    s = scores[ranking[i]]
    print(f"{i+1}) {l} {np.round(float(s), 4)}")

Output:

1) πŸŽ„ 0.5457
2) 😊 0.1417
3) 😁 0.0649
4) 😍 0.0395
5) ❀️ 0.03
6) 😜 0.028
7) ✨ 0.0263
8) πŸ˜‰ 0.0237
9) πŸ˜‚ 0.0177
10) 😎 0.0166
11) 😘 0.0143
12) πŸ’• 0.014
13) πŸ’™ 0.0076
14) πŸ’œ 0.0068
15) πŸ”₯ 0.0065
16) πŸ’― 0.004
17) πŸ‡ΊπŸ‡Έ 0.0037
18) πŸ“· 0.0034
19) β˜€ 0.0033
20) πŸ“Έ 0.0021