abdulmatinomotoso
committed on
Commit
·
5dd89c8
1
Parent(s):
5273dc1
Update app.py
Browse files
app.py
CHANGED
@@ -19,10 +19,28 @@ def read_in_text(url):
|
|
19 |
with open(url, 'r') as file:
|
20 |
article = file.read()
|
21 |
return article
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
|
23 |
#Defining a function to get the category of the news article
|
24 |
def get_category(file):
|
25 |
-
text =
|
26 |
|
27 |
input_tensor = tokenizer.encode(text, return_tensors='pt', truncation=True)
|
28 |
logits = model(input_tensor).logits
|
|
|
19 |
with open(url, 'r') as file:
|
20 |
article = file.read()
|
21 |
return article
|
22 |
+
|
23 |
+
def clean_text(url):
    """Load the text file at *url* and return it cleaned up on a single line.

    The text is obtained through ``read_in_text(url)`` and then:

    1. stripped of every character that cannot be encoded as ASCII,
    2. flattened by turning newlines and tabs into spaces,
    3. squeezed so that runs of spaces become one space,
    4. stripped of ``Date d/m/yyyy`` stamps and ``hh:mm XX XX`` time stamps,
    5. squeezed and trimmed once more, so the removed stamps do not leave
       doubled or dangling spaces in the result.

    Args:
        url: Path of the text file to read (passed straight to
            ``read_in_text``).

    Returns:
        The cleaned text as a ``str``.
    """
    text = read_in_text(url)

    # Drop anything outside ASCII (e.g. Chinese characters) instead of failing.
    text = text.encode("ascii", errors="ignore").decode("ascii")

    # One pass handles both newlines and tabs; a separate "\n\n" pass would
    # never match once single newlines have been replaced.
    text = re.sub(r"[\n\t]", " ", text)
    text = re.sub(r" +", " ", text).strip()

    # The stamp patterns rely on single separators, so they run only after
    # the whitespace has been normalised above.
    text = re.sub(r'Date\s\d{1,2}\/\d{1,2}\/\d{4}', '', text)  # remove date
    text = re.sub(r'\d{1,2}:\d{2}\s[A-Z]+\s[A-Z]+', '', text)  # remove time

    # Removing a stamp leaves its surrounding spaces behind; tidy them up.
    return re.sub(r" +", " ", text).strip()
|
40 |
|
41 |
#Defining a function to get the category of the news article
|
42 |
def get_category(file):
|
43 |
+
text = clean_text(file.name)
|
44 |
|
45 |
input_tensor = tokenizer.encode(text, return_tensors='pt', truncation=True)
|
46 |
logits = model(input_tensor).logits
|