import pickle
import re

import gradio as gr
import joblib
import nltk
import pandas as pd
from snowballstemmer import stemmer
# Build the keyword dictionary: lowercase, split comma-separated keyword
# lists into one row per keyword, and strip stray whitespace.
excel_path = "./Dict.xlsx"
keywords = pd.read_excel(excel_path)
keywords = keywords.assign(Words=keywords.Words.str.lower())
keywords = keywords.assign(Words=keywords.Words.str.split(",")).explode("Words")
keywords = keywords.assign(Words=keywords.Words.str.strip())
keywords = keywords.assign(Fields=keywords.Fields.str.strip())
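# Dict.xlsx is assumed to hold a "Words" column (comma-separated keywords)
# and a "Fields" column (the label each keyword signals). Illustrative rows
# only; the real dictionary contents are not shown here:
#   Words              Fields
#   "finrep, corep"    "Regulatory Reporting"
#   "liquidez, lcr"    "Liquidity"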
# Load the classifier (joblib file) and TF-IDF vectorizer (pickle file)
# bundled with this Space.
model_path = "./classifier.joblib"
vectorizer_path = "./vectorizer"
classifier = joblib.load(model_path)
with open(vectorizer_path, "rb") as f:
    tfidf_vectorizer = pickle.load(f)
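# Both artifacts are assumed to be scikit-learn objects (the code below calls
# predict_proba and classes_), so a quick sanity check could be:
# print(classifier.classes_)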

# One-time NLP setup (done at module load so it does not rerun per request).
nltk.download("stopwords")
stopword_es = nltk.corpus.stopwords.words("spanish")
stemmer_es = stemmer("spanish")


def classify_text(text):
    # Keyword pass: look for dictionary terms in the raw (lowercased) text.
    Sample = pd.DataFrame({"Description": [text]})
    Sample = Sample.assign(Description=Sample.Description.str.lower())
    Sample = Sample.reset_index(names="n").reset_index(names="Doc").drop("n", axis=1)
    Sample = Sample.assign(Description=Sample.Description.astype("str"))
    Sample = Sample.groupby(["Doc"]).Description.apply(" ".join).reset_index()
    Sample["Description"] = Sample["Description"].str.strip()
    # Map each keyword to the list of fields it signals.
    keyword_mapping = {}
    for keyword, field in zip(keywords["Words"], keywords["Fields"]):
        keyword_mapping.setdefault(keyword, []).append(field)
    # Find matching keywords in a description and return one corresponding
    # field (an arbitrary one if several keywords match), or "None".
    def find_field(description):
        matching_fields = set()
        for keyword, fields in keyword_mapping.items():
            if re.search(r"\b{}\b".format(re.escape(keyword)), description):
                matching_fields.update(fields)
        return list(matching_fields)[0] if matching_fields else "None"

    # Apply the function to the 'Description' column of the sample
    Sample["AssignedField"] = Sample["Description"].apply(find_field)
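    # Illustrative only (actual mappings live in Dict.xlsx): if "finrep" were
    # listed under a field "Regulatory Reporting", then
    # find_field("informe finrep trimestral") would return that field.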
    # find_field returns the string "None" (never NaN) when nothing matches,
    # so keep the keyword result directly instead of filtering on isna().
    assigned_field = Sample["AssignedField"].iloc[0]
    # TF-IDF pass: rebuild the sample and preprocess it for the classifier.
    Sample = pd.DataFrame({"Description": [text]})
    Sample = Sample.assign(Description=Sample.Description.str.lower())
    Sample = Sample.reset_index(names="n").reset_index(names="Doc").drop("n", axis=1)
    # Split into tokens and reshape the data to one row per token
    Sample = Sample.assign(Description=Sample.Description.str.split()).explode("Description")
    # Strip punctuation, quotes, brackets, and digits from each token.
    Sample = Sample.assign(Description=Sample.Description.str.replace(r"[&%—●‒:–>.€ª,;()\-/]", "", regex=True))
    Sample = Sample.assign(Description=Sample.Description.str.replace('"', "", regex=False))
    Sample = Sample.assign(Description=Sample.Description.str.replace(r"[“”\[\]\d+]", "", regex=True))
    Sample = Sample.reset_index()
    # Filter stopwords
    Sample = Sample.loc[~Sample.Description.isin(stopword_es)]
    # Stem each token with the Spanish Snowball stemmer, then rejoin the
    # tokens into one document per Doc id.
    Sample["Description"] = Sample["Description"].apply(stemmer_es.stemWord)
    Sample = Sample.assign(Description=Sample.Description.astype("str"))
    Sample = Sample.groupby(["Doc"]).Description.apply(" ".join).reset_index()
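    # Illustrative: after stopword removal and stemming, a phrase like
    # "los informes de liquidez" should reduce to something along the lines
    # of "inform liquidez" (exact stems depend on the Snowball rules).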
    # Vectorize and make probability predictions for the (single) document.
    new_text_tfidf = tfidf_vectorizer.transform(Sample["Description"])
    predicted_probabilities = classifier.predict_proba(new_text_tfidf)
    # Get the top three predicted class labels for each document.
    top_three_predictions = []
    for probs in predicted_probabilities:
        # Sort the probabilities and take the indices of the top three
        top_three_indices = sorted(range(len(probs)), key=lambda i: probs[i], reverse=True)[:3]
        # Map those indices back to class labels
        top_three_classes = [classifier.classes_[i] for i in top_three_indices]
        top_three_predictions.append(top_three_classes)
    preds = pd.DataFrame(top_three_predictions).rename(
        columns={0: "Pred_1", 1: "Pred_2", 2: "Pred_3"}
    )
    results_1 = (
        pd.concat([Sample.reset_index(), preds], axis=1)
        .drop("index", axis=1)[["Doc", "Description", "Pred_1", "Pred_2", "Pred_3"]]
    )
    # Return the keyword-assigned field plus the classifier's top three labels.
    return (
        assigned_field,
        results_1["Pred_1"][0],
        results_1["Pred_2"][0],
        results_1["Pred_3"][0],
    )
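
# Quick local check without launching the UI (illustrative input):
# print(classify_text("informe de liquidez trimestral"))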
# Create a Gradio interface
iface = gr.Interface(
    fn=classify_text,
    inputs=gr.Textbox(label="Description", placeholder="Enter your Service Description", lines=3),
    outputs=[gr.Textbox(label=name) for name in ("Keyword Match", "Pred_1", "Pred_2", "Pred_3")],
    title="Text Classification App",
    theme=gr.themes.Default(primary_hue="blue"),  # Default theme with a blue accent
    examples=[["ilaap"], ["finrep"], ["liquidez"], ["Data governance"]],
)
# Launch the Gradio interface
iface.launch()
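# When running outside a hosted Space, the standard launch() option
# share=True would also create a temporary public link:
# iface.launch(share=True)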