Spaces:

Asutosh2003
/

Vaccine_concerns_ML

Sleeping

Asutosh2003 committed on Jan 20

Commit

f1866af

•

1 Parent(s): b805738

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -3,6 +3,10 @@ from transformers import BertTokenizer, BertModel
 from huggingface_hub import PyTorchModelHubMixin
 import numpy as np
 import gradio as gr
 device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
 device
@@ -33,7 +37,30 @@ tokenizer = BertTokenizer.from_pretrained('digitalepidemiologylab/covid-twitter-
 MAX_LEN = 256
 def return_vec(text):
   encodings = tokenizer.encode_plus(
       text,
       None,

 from huggingface_hub import PyTorchModelHubMixin
 import numpy as np
 import gradio as gr
+import nltk
+nltk.download('stopwords')
+from nltk.corpus import stopwords
+import re
 device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
 device
 MAX_LEN = 256
+def rmTrash(raw_string, remuser, remstop, remurls):
+    """Clean a piece of raw text before it is handed to the tokenizer.
+
+    The text is always lower-cased and stripped of every character that is
+    neither a word character nor whitespace. The three flags switch on
+    additional, independent filters.
+
+    Args:
+        raw_string: Text to clean.
+        remuser: If truthy, drop every whitespace-separated token that
+            contains '@'.
+        remstop: If truthy, drop tokens found in NLTK's English stop-word
+            list.
+        remurls: If truthy, drop every run of non-whitespace characters
+            that starts with 'http'.
+
+    Returns:
+        The cleaned string. Whenever tokens are re-joined (remuser or
+        remstop truthy) each kept token is preceded by one space, so the
+        result starts with a space; with all filters off the original
+        whitespace is kept as-is.
+    """
+    if remuser:
+        # Re-join the surviving tokens, each preceded by a single space.
+        text = ''.join(
+            ' ' + word for word in raw_string.split() if '@' not in word
+        )
+    else:
+        text = raw_string
+
+    # Lower-case, then remove everything except word chars and whitespace.
+    text = re.sub(r'[^\w\s]', '', text.lower())
+
+    if remurls:
+        # Punctuation is already gone, so a URL has collapsed into a single
+        # 'http...' token, which this pattern removes as a whole.
+        text = re.sub(r'http\S+', '', text)
+
+    if not remstop:
+        return text
+
+    # Load the stop-word list once per call and use a set for O(1) lookups,
+    # instead of re-reading the corpus list for every token.
+    stop_words = set(stopwords.words('english'))
+    return ''.join(
+        ' ' + token for token in text.split() if token not in stop_words
+    )
 def return_vec(text):
+    text = rmTrash(text,True,True,True)
   encodings = tokenizer.encode_plus(
       text,
       None,