# DataAIDemo/pages/core_risk.py
import re
import string

import boto3  # S3 access; the bucket below is reserved for future use on this page
import emoji
import nltk
import streamlit as st
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from numpy.random import seed
from streamlit import session_state
from torch import cuda
from transformers import pipeline

seed(1)  # fix the NumPy RNG for reproducibility

bucket = 'data-ai-dev2'  # S3 bucket name (not used directly in this page)
device = 'cuda' if cuda.is_available() else 'cpu'

# Fetch the NLTK resources used below (safe to re-run; cached after the first download).
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stop_words = stopwords.words('english')

# Local fine-tuned text-classification model and its tokenizer.
model = 'C:/Users/Meet/Downloads/core_risk/models/'
tokenizer = 'C:/Users/Meet/Downloads/core_risk/tokenizer/'
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer,
                      truncation=True, max_length=512)
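# Quick sanity check of the pipeline's output shape (the label names here are
# hypothetical; the real ones come from the fine-tuned model's config):
#   classifier("some text", top_k=2)
#   -> [{'label': 'LABEL_0', 'score': 0.91}, {'label': 'LABEL_3', 'score': 0.05}]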
def pre_processing_str_esg(df_col):
    """Heavier cleaning for long texts: lowercase, then strip punctuation,
    URLs, stopwords, digits, special symbols, and emoji."""
    df_col = df_col.lower()

    # Remove punctuation characters.
    def remove_punctuation(text):
        return "".join(ch for ch in text if ch not in string.punctuation)

    df_col = remove_punctuation(df_col)
    df_col = re.sub(r"http\S+", " ", df_col)

    # Drop English stopwords.
    def remove_stopwords(text):
        return " ".join(word for word in str(text).split() if word not in stop_words)

    df_col = remove_stopwords(df_col)
    df_col = re.sub('[%s]' % re.escape(string.punctuation), ' ', df_col)
    df_col = df_col.replace("¶", "")
    df_col = df_col.replace("§", "")
    df_col = df_col.replace('“', ' ')
    df_col = df_col.replace('”', ' ')
    df_col = df_col.replace('-', ' ')
    REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
    BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
    df_col = REPLACE_BY_SPACE_RE.sub(' ', df_col)
    df_col = BAD_SYMBOLS_RE.sub(' ', df_col)
    df_col = re.sub('[0-9]+', ' ', df_col)
    df_col = re.sub(' +', ' ', df_col)  # collapse runs of spaces

    # Strip emoji and pictographic symbols by Unicode range.
    def remove_emoji(text):
        emoji_pattern = re.compile("["
                                   u"\U0001F600-\U0001F64F"  # emoticons
                                   u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                                   u"\U0001F680-\U0001F6FF"  # transport & map symbols
                                   u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                   u"\U00002702-\U000027B0"
                                   u"\U000024C2-\U0001F251"
                                   "]+", flags=re.UNICODE)
        return emoji_pattern.sub(r'', text)

    df_col = remove_emoji(df_col)
    return df_col
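# Example (illustrative input): pre_processing_str_esg("Check https://t.co/x ESG §4 scores 2024! 😀")
# returns roughly "check esg scores": the URL, stopwords, digits, symbols, and emoji are stripped.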
def pre_processing_str(df_col):
    """Light cleaning for short texts (e.g. tweets); texts of 70+ words
    get the full ESG pipeline above instead."""
    if len(df_col.split()) >= 70:
        return pre_processing_str_esg(df_col)
    else:
        df_col = df_col.replace('#', '')
        df_col = df_col.replace('!', '')
        df_col = re.sub(r"http\S+", " ", df_col)
        df_col = re.sub('[0-9]+', ' ', df_col)
        df_col = re.sub(' +', ' ', df_col)  # collapse runs of spaces
        df_col = emoji.replace_emoji(df_col)  # strip emoji
        df_col = re.sub(r"(?:\@|https?\://)\S+", "", df_col)  # @mentions and URLs
        df_col = re.sub(r"[^\x20-\x7E]+", "", df_col)  # non-printable / non-ASCII
        df_col = df_col.strip()
        return df_col
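# Example (illustrative input): pre_processing_str("@user Check this!! 😀 #AI https://t.co/x")
# returns roughly "Check this AI": hashtags, mentions, URLs, and emoji are stripped,
# and case is preserved on the short-text path.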
# Note kept from the earlier Flask API deployment of these steps: the module name
# must match the file name and the app object its variable name, i.e.
# application.py containing application = Flask(__name__).
def process(text):
    """Clean the input and return the top-2 class predictions."""
    text = pre_processing_str(text)
    try:
        if len(text) != 0:
            results = classifier(text, top_k=2)
        else:
            results = 'No Text'
        return {'output_16': results}
    except Exception:
        return {'output_16': 'something went wrong'}
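# Example shape of the returned payload (label names are hypothetical; the real
# ones depend on what the fine-tuned model was trained on):
#   process("Markets fell after the breach")
#   -> {'output_16': [{'label': '...', 'score': 0.87}, {'label': '...', 'score': 0.06}]}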
st.set_page_config(page_title="core_risk", page_icon="📈")

if 'topic_class' not in session_state:
    session_state['topic_class'] = ""

st.title("Topic Classifier")
text = st.text_area(label="Please write the text below",
                    placeholder="What does the tweet say?")

def classify(text):
    # Button callback: run the pipeline and stash the result in session state.
    session_state['topic_class'] = process(text)

st.text_area("result", value=str(session_state['topic_class']))  # cast dict to str for display
st.button("Classify", on_click=classify, args=[text])
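# Launch locally with: streamlit run pages/core_risk.py
# (in a multipage app, run the top-level entry script instead; this page is
# then picked up automatically from the pages/ directory).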