Asutosh2003 committed on
Commit
f1866af
1 Parent(s): b805738

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -0
app.py CHANGED
@@ -3,6 +3,10 @@ from transformers import BertTokenizer, BertModel
3
  from huggingface_hub import PyTorchModelHubMixin
4
  import numpy as np
5
  import gradio as gr
 
 
 
 
6
 
7
  device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
8
  device
@@ -33,7 +37,30 @@ tokenizer = BertTokenizer.from_pretrained('digitalepidemiologylab/covid-twitter-
33
  MAX_LEN = 256
34
 
35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  def return_vec(text):
 
37
  encodings = tokenizer.encode_plus(
38
  text,
39
  None,
 
3
  from huggingface_hub import PyTorchModelHubMixin
4
  import numpy as np
5
  import gradio as gr
6
+ import nltk
7
+ nltk.download('stopwords')
8
+ from nltk.corpus import stopwords
9
+ import re
10
 
11
  device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
12
  device
 
37
  MAX_LEN = 256
38
 
39
 
40
def rmTrash(raw_string, remuser, remstop, remurls):
    """Normalize raw text (e.g. a tweet) before it is tokenized.

    The text is always lowercased and stripped of every character that is
    neither a word character nor whitespace. Three optional steps drop user
    mentions, URLs and English stopwords.

    Args:
        raw_string: Text to clean.
        remuser: If true, drop every whitespace-separated token that
            contains '@'.
        remstop: If true, drop every token present in NLTK's English
            stopword list.
        remurls: If true, drop every run of non-whitespace characters that
            starts with 'http'.

    Returns:
        The cleaned string. Whenever ``remuser`` or ``remstop`` is set the
        text is rebuilt token by token with a single space in front of each
        kept token, so a non-empty result starts with a space.
    """
    if remuser:
        # Rebuild from whitespace-split tokens, skipping mentions. Each kept
        # token is prefixed with one space, matching the historical output.
        text = ''.join(
            ' ' + token for token in raw_string.split() if '@' not in token
        )
    else:
        text = raw_string

    # Punctuation is stripped before URL removal, so a URL reaches the next
    # step as a single punctuation-free token that still begins with "http".
    text = re.sub(r'[^\w\s]', '', text.lower())
    if remurls:
        # The text is already lowercase here; no second lower() is needed.
        text = re.sub(r'http\S+', '', text)

    if not remstop:
        return text

    # The stopword list does not change between tokens: fetch it once and
    # turn it into a set so each membership test is O(1) instead of a scan
    # over a freshly fetched list for every token.
    english_stopwords = set(stopwords.words('english'))
    return ''.join(
        ' ' + token for token in text.split() if token not in english_stopwords
    )
60
+
61
+
62
  def return_vec(text):
63
+ text = rmTrash(text,True,True,True)
64
  encodings = tokenizer.encode_plus(
65
  text,
66
  None,