Hector Lopez committed on
Commit
c5b702e
1 Parent(s): c6d3bd0

Upload application logic

Browse files
Files changed (4) hide show
  1. app.py +36 -0
  2. backend.py +44 -0
  3. requirements.txt +3 -0
  4. tweet_scraper.py +48 -0
app.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Gradio Twitter analyzer application.
3
+
4
+ This module provides a gradio-based web application
5
+ for the Twitter analyzer project.
6
+ """
7
+ import gradio as gr
8
+
9
+ from tweet_scraper import retrieve_tweet_text
10
+ from backend import predict_positivity
11
+
12
+
13
def process_tweet(url: str) -> str:
    """
    Classify the positivity of the tweet found at a URL.

    The tweet's text is fetched with ``retrieve_tweet_text`` and the
    result is passed straight to ``predict_positivity``.

    Args:
        url (str): Tweet's URL.

    Returns:
        str: Predicted positivity label.
    """
    return predict_positivity(retrieve_tweet_text(url))
27
+
28
+
29
# Single-textbox UI: the user pastes a tweet URL and the value returned by
# ``process_tweet`` is rendered as plain text.
tweet_url_box = gr.inputs.Textbox(lines=2, placeholder="Tweet url...")
app = gr.Interface(fn=process_tweet, inputs=tweet_url_box, outputs="text")

if __name__ == "__main__":
    # The result of launch() is unpacked into three names; note that this
    # rebinds ``app`` to the first element of that result.
    app, local_url, share_url = app.launch()
backend.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Positivity predictor.
3
+
4
+ This module provides the functionality to predict
5
+ a tweet's positivity using a BERT model.
6
+ """
7
+ import torch
8
+ from transformers import BertForSequenceClassification, BertTokenizer
9
+
10
# Tokenizer and classifier are created once at import time so that every
# call to ``predict_positivity`` reuses the same objects.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=5,
    output_attentions=False,
    output_hidden_states=False,
    local_files_only=True,
)
# The model is never moved off the CPU, so map the checkpoint's tensors to
# the CPU explicitly. Without ``map_location`` a checkpoint saved from a CUDA
# device fails to deserialize on a host that has no GPU.
# NOTE(review): ``torch.load`` unpickles the file; only load checkpoints that
# come from a trusted source.
model.load_state_dict(
    torch.load("data/BERT_ft_epoch5.model", map_location=torch.device("cpu"))
)
# Inference mode: disables dropout.
model.eval()
20
+
21
+
22
def predict_positivity(text: str) -> str:
    """
    Predict the positivity of a given tweet.

    The text is tokenized with the module-level ``tokenizer``, run through
    the module-level ``model`` and the highest-scoring class is mapped to
    a human-readable label.

    Args:
        text (str): Tweet's text.

    Returns:
        str: Predicted positivity, one of "Extremely Negative",
            "Negative", "Neutral", "Positive" or "Extremely Positive".
    """
    label_dict = {
        0: "Extremely Negative",
        1: "Negative",
        2: "Neutral",
        3: "Positive",
        4: "Extremely Positive",
    }
    # Truncate to the 512 positions bert-base-uncased supports so that an
    # unexpectedly long input cannot crash the forward pass.
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=512,
    )

    # Pure inference: skip building the autograd graph.
    with torch.no_grad():
        logits = model(**encoded).logits

    # A single text is encoded, so the flattened argmax is the class index.
    predicted_class_id = logits.argmax().item()

    return label_dict[predicted_class_id]
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ torch
2
+ transformers
3
+ gradio
tweet_scraper.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Twitter scraper.
3
+
4
+ This module provides the functionality to retrieve
5
+ a tweet's text given a tweet's URL.
6
+ """
7
+ import re
8
+
9
+ import requests
10
+
11
+
12
def _extract_tweet_text(raw_html: str) -> str:
    """Reduce the oEmbed HTML snippet to the tweet's plain text."""
    # Remove links (anchors together with their content) from the text
    without_links = re.sub(r"<[a][^>]*>(.+?)</[a]>", "", raw_html)

    # Remove the remaining HTML tags and keep the first non-empty line
    without_tags = re.sub("<.*?>", "", without_links)
    lines = [line.strip() for line in without_tags.splitlines() if line]
    if not lines:
        # Nothing left after stripping the markup
        return ""
    text = lines[0]

    # The &mdash introduces the author attribution, i.e. the tweet's end.
    # A leftover picture link is matched by its full host name so that
    # ordinary words containing "pic" ("topic", "epic") are not cut.
    for end_marker in ("&mdash", "pic.twitter.com"):
        text = text.partition(end_marker)[0]

    return text.strip()


def retrieve_tweet_text(tweet_url: str) -> str:
    """
    Retrieve a tweet's text.

    The tweet is looked up through Twitter's public oEmbed endpoint and
    the returned HTML snippet is reduced to plain text.

    Args:
        tweet_url (str): Tweet's URL.

    Returns:
        str: Tweet's parsed text (empty if no text could be extracted).

    Raises:
        requests.RequestException: If the request fails, times out or
            the endpoint answers with an HTTP error status.
        KeyError: If the response carries no "html" field.
    """
    # Let requests build the query string so that characters in the tweet
    # URL (e.g. a "?s=20&t=..." share suffix) are percent-encoded instead
    # of being interpreted as extra oEmbed parameters.
    response = requests.get(
        "https://publish.twitter.com/oembed",
        params={"dnt": "true", "omit_script": "true", "url": tweet_url},
        timeout=10,  # never hang the UI on an unresponsive endpoint
    )
    response.raise_for_status()

    # Get the raw html containing the tweet text
    raw_html = response.json()["html"]

    return _extract_tweet_text(raw_html)