import json
import re
import string

import boto3
import emoji
import nltk
import numpy as np
import pandas as pd
import streamlit as st
import torch
import torch.nn.functional as F
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer  # alternatives: LancasterStemmer
from numpy.random import seed
from streamlit import session_state
from torch import cuda
from transformers import BertModel, BertTokenizer, pipeline

bucket = 'data-ai-dev2'
device = 'cuda' if cuda.is_available() else 'cpu'
seed(1)

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# NLTK data downloads: needed on the first run, can be commented out afterwards.
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

stop_words = nltk.corpus.stopwords.words('english')

model = 'C:/Users/Meet/Downloads/core_risk/models/'
tokenizer = 'C:/Users/Meet/Downloads/core_risk/tokenizer/'
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer,
                      truncation=True, max_length=512)


def pre_processing_str_esg(df_col):
    """Heavier cleaning used for long (ESG-style) texts."""
    df_col = df_col.lower()

    # Remove punctuation characters.
    def remove_punctuation(text):
        return "".join([i for i in text if i not in string.punctuation])

    df_col = remove_punctuation(df_col)
    df_col = re.sub(r"http\S+", " ", df_col)

    # Drop English stop words.
    def remove_stopwords(text):
        return " ".join([word for word in str(text).split() if word not in stop_words])

    df_col = remove_stopwords(df_col)
    df_col = re.sub('[%s]' % re.escape(string.punctuation), ' ', df_col)
    df_col = df_col.replace("¶", "")
    df_col = df_col.replace("§", "")
    df_col = df_col.replace('“', ' ')
    df_col = df_col.replace('”', ' ')
    df_col = df_col.replace('-', ' ')

    REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
    BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
    df_col = REPLACE_BY_SPACE_RE.sub(' ', df_col)
    df_col = BAD_SYMBOLS_RE.sub(' ', df_col)
    df_col = re.sub('[0-9]+', ' ', df_col)
    df_col = re.sub(' +', ' ', df_col)  # collapse repeated spaces

    # Strip emoji by Unicode range.
    def remove_emoji(text):
        emoji_pattern = re.compile("["
                                   u"\U0001F600-\U0001F64F"  # emoticons
                                   u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                                   u"\U0001F680-\U0001F6FF"  # transport & map symbols
                                   u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                   u"\U00002702-\U000027B0"
                                   u"\U000024C2-\U0001F251"
                                   "]+", flags=re.UNICODE)
        return emoji_pattern.sub(r'', text)

    df_col = remove_emoji(df_col)
    return df_col


def pre_processing_str(df_col):
    # Long texts (70+ words) get the heavier ESG-style cleaning;
    # short texts (e.g. tweets) get a lighter pass.
    if len(df_col.split()) >= 70:
        return pre_processing_str_esg(df_col)
    else:
        df_col = df_col.replace('#', '')
        df_col = df_col.replace('!', '')
        df_col = re.sub(r"http\S+", " ", df_col)
        df_col = re.sub('[0-9]+', ' ', df_col)
        df_col = re.sub(' +', ' ', df_col)  # collapse repeated spaces

        def remove_emojis(text):
            return emoji.replace_emoji(text, replace='')

        df_col = remove_emojis(df_col)
        df_col = re.sub(r"(?:\@|https?\://)\S+", "", df_col)
        df_col = re.sub(r"[^\x20-\x7E]+", "", df_col)  # keep printable ASCII only
        df_col = df_col.strip()
        return df_col


# API deployment note: for the Flask version of this service, the file name and
# the WSGI object must match, i.e. application.py with application = Flask(__name__).
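# Illustrative example of the short-text branch (assumed input, not from the
# original source):
#   pre_processing_str("Check https://t.co/abc 🚀 #AI!")
# returns roughly "Check AI", with the URL, emoji, and punctuation markers stripped.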
def process(text):
    text = pre_processing_str(text)
    try:
        if len(text) != 0:
            # Return the top-2 predicted labels with their scores.
            results = classifier(text, top_k=2)
        else:
            results = 'No Text'
        return {'output_16': results}
    except Exception:
        return {'output_16': 'something went wrong'}


st.set_page_config(page_title="core_risk", page_icon="📈")

if 'topic_class' not in session_state:
    session_state['topic_class'] = ""

st.title("Topic Classifier")
text = st.text_area(label="Please write the text below",
                    placeholder="What does the tweet say?")


def classify(text):
    # Serialize to a string so the result can back the text area below.
    session_state['topic_class'] = json.dumps(process(text))


st.text_area("result", value=session_state['topic_class'])
st.button("Classify", on_click=classify, args=[text])
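# To launch the app locally (assuming this file is saved as application.py,
# per the deployment note above):
#   streamlit run application.py
# Pressing "Classify" runs the on_click callback, which stores the top-2
# label/score pairs in session_state['topic_class']; the "result" text area
# then displays them when the script reruns.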