File size: 3,784 Bytes
e03530d
 
6796bfc
071cb1b
 
f4db971
 
afb9463
 
 
 
 
 
 
 
 
b360479
6796bfc
afb9463
 
 
 
 
 
 
 
 
 
 
3ea613d
19ded2a
afb9463
 
 
b143835
f4db971
3ea613d
 
 
 
 
 
 
 
 
f4db971
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3ea613d
dc1bf57
86f79dd
dc2ae70
86f79dd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6796bfc
071cb1b
115ff0b
360bf8a
6796bfc
360bf8a
 
071cb1b
360bf8a
 
1cf959e
 
 
 
 
 
 
f42cc17
1cf959e
b360479
6796bfc
 
768cc89
793922a
768cc89
e063d7c
 
6796bfc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import streamlit as st
import pandas as pd
from matplotlib import pyplot as plt
import twint 
import nest_asyncio
import multiprocessing.pool
import functools
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import urllib.request
import IPython.display as ipd

# Placeholder message shown while the model download/load below runs.
st.write('Loading...')

# Preprocess text (username and link placeholders)
def preprocess(text):
    """Normalize a tweet for the sentiment model.

    Splits on single spaces and replaces each @-mention (more than just a
    bare '@') with the placeholder '@user' and each token starting with
    'http' with the placeholder 'http', matching the model's training data.
    """
    normalized = (
        '@user' if tok.startswith('@') and len(tok) > 1
        else 'http' if tok.startswith('http')
        else tok
        for tok in text.split(" ")
    )
    return " ".join(normalized)

# Loading pretrained model
# 3-class (negative/neutral/positive) Twitter sentiment model.
MODEL = 'cardiffnlp/twitter-roberta-base-sentiment'
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
# Save weights/tokenizer to a local dir named after MODEL so later runs
# can load from disk instead of re-downloading.
model.save_pretrained(MODEL)
tokenizer.save_pretrained(MODEL)

# Func to get a score using the above model
def combined_score(text):
  """Return a single sentiment score in roughly [-1, 1].

  Runs the preprocessed text through the module-level tokenizer/model,
  softmaxes the logits, and returns positive-minus-negative probability.
  """
  cleaned = preprocess(text)
  encoded = tokenizer(cleaned, return_tensors='pt')
  logits = model(**encoded)[0][0].detach().numpy()
  probs = softmax(logits)
  # probs order: [negative, neutral, positive]
  return probs[2] - probs[0]

# https://stackoverflow.com/questions/492519/timeout-on-a-function-call
def timeout(max_timeout):
    """Timeout decorator, parameter in seconds.

    Runs the wrapped callable in a one-thread pool and raises
    multiprocessing.TimeoutError if it does not finish within
    `max_timeout` seconds. Note the worker thread itself is not killed
    on timeout; it finishes in the background.
    """
    def timeout_decorator(item):
        """Wrap the original function."""
        @functools.wraps(item)
        def func_wrapper(*args, **kwargs):
            """Closure for function."""
            pool = multiprocessing.pool.ThreadPool(processes=1)
            try:
                async_result = pool.apply_async(item, args, kwargs)
                # raises a TimeoutError if execution exceeds max_timeout
                return async_result.get(max_timeout)
            finally:
                # Fix: original leaked the pool (never closed) on every call.
                pool.close()
        return func_wrapper
    return timeout_decorator

# Getting tweets from a user
@timeout(120.0)
def get_tweets(username, limit=500, save_name=None):
  """Fetch up to `limit` tweets for `username` via twint.

  Returns the resulting pandas DataFrame; if `save_name` is given the
  frame is also written to that CSV path. Aborts with a TimeoutError
  after 120 s (see the `timeout` decorator).
  """
  #nest_asyncio.apply() # Helps avoid RuntimeError: This event loop is already running

  # Setup config
  c = twint.Config() # Create a config object to store our settings
  c.Limit = limit # Max number of tweets to fetch (increments of 20)
  c.Username = username # User of interest
  c.Pandas = True # Store tweets in a dataframe
  c.Hide_output = True # Avoid printing out tweets

  # Run the search
  twint.run.Search(c)

  # Get the results and optionally save to a file as well
  df = twint.storage.panda.Tweets_df
  if save_name is not None:  # fix: identity comparison with None, not !=
    df.to_csv(save_name)
  return df

# Page title and the input form for the query parameters.
title = st.title('Twitter Sentiment Map Thingee')


with st.form("my_form"):
  st.write("Parameters:")
  user = st.text_input("Twitter Username")
  # Slider: min 20, max 2000, default 20 tweets.
  n_tweets = st.slider('How Many Tweets', 20, 2000, 20)

  # Every form must have a submit button.
  submitted = st.form_submit_button("Submit")
  
# On submit: fetch tweets, score each one, and plot sentiment vs length.
if submitted:
  st.write("Fetching user", user, "n_tweets", n_tweets)
  tweets = get_tweets(user, limit=n_tweets)
  st.write("Resulting dataframe shape:", tweets.shape)
  st.write("Calculating sentiments...")
  # Pass the functions directly instead of trivial lambda wrappers.
  tweets['sentiment'] = tweets['tweet'].map(combined_score)
  tweets['tweet_length'] = tweets['tweet'].map(len)
  st.write("Average sentiment:", tweets.sentiment.mean())
  fig, axs = plt.subplots(1, 2, figsize=(12, 6))
  # Left: hexbin density of (length, sentiment); right: raw scatter.
  # (Dropped the no-op `* 1` on the sentiment column.)
  axs[0].hexbin(tweets['tweet_length'], tweets['sentiment'],
           gridsize=20, bins=12, cmap='inferno')
  axs[0].set_title('Tweet Sentiment and Length')
  axs[1].scatter(tweets['tweet_length'], tweets['sentiment'])
  axs[1].set_title('Tweet Sentiment vs Length')
  plt.setp(axs[:], xlabel='Tweet Length')
  plt.setp(axs[:], ylabel='Sentiment')
  st.pyplot(fig)