import re
import pickle

import gradio as gr
import joblib
import nltk
import pandas as pd
from snowballstemmer import stemmer

# Load the keyword dictionary and reshape it to one (keyword, field) pair per row.
excel_path = "./Dict.xlsx"
keywords = pd.read_excel(excel_path)
keywords = keywords.assign(Words=keywords.Words.str.lower())
keywords = keywords.assign(Words=keywords.Words.str.split(",")).explode("Words")
keywords = keywords.assign(Words=keywords.Words.str.strip())
keywords = keywords.assign(Fields=keywords.Fields.str.strip())

# Load the classifier (joblib file) and the TF-IDF vectorizer (pickle file).
model_path = "./classifier.joblib"
vectorizer_path = "./vectorizer"
classifier = joblib.load(model_path)
with open(vectorizer_path, "rb") as f:
    tfidf_vectorizer = pickle.load(f)

# Prepare the Spanish stopwords and Snowball stemmer once, at import time,
# rather than on every request.
nltk.download("stopwords")
stopword_es = nltk.corpus.stopwords.words("spanish")
stemmer_es = stemmer("spanish")

# Map each keyword to the list of fields it can indicate.
keyword_mapping = {}
for keyword, field in zip(keywords["Words"], keywords["Fields"]):
    keyword_mapping.setdefault(keyword, []).append(field)


def find_field(description):
    """Return one field whose keyword appears as a whole word, or "None"."""
    matching_fields = set()
    for keyword, fields in keyword_mapping.items():
        if re.search(r"\b{}\b".format(re.escape(keyword)), description):
            matching_fields.update(fields)
    return list(matching_fields)[0] if matching_fields else "None"


def classify_text(text):
    # First pass: dictionary lookup on the raw, lower-cased description.
    Sample = pd.DataFrame({"Description": [str(text)]})
    Sample = Sample.assign(Description=Sample.Description.str.lower().str.strip())
    Sample = Sample.reset_index(names="Doc")
    assigned_field = Sample["Description"].apply(find_field)[0]

    # Second pass: tokenize, strip punctuation/quotes/digits, drop stopwords, stem.
    Sample = Sample.assign(Description=Sample.Description.str.split()).explode("Description")
    Sample = Sample.assign(
        Description=Sample.Description.str.replace(r'[&%—●•:–>.€ª,;()\-/"“”\[\]\d+]', "", regex=True)
    )
    Sample = Sample.reset_index(drop=True)
    Sample = Sample.loc[Sample.Description.ne("") & ~Sample.Description.isin(stopword_es)]
    Sample["Description"] = Sample["Description"].apply(stemmer_es.stemWord)

    # Rebuild one document per row and vectorize it.
    Sample = Sample.assign(Description=Sample.Description.astype("str"))
    Sample = Sample.groupby(["Doc"]).Description.apply(" ".join).reset_index()
    new_text_tfidf = tfidf_vectorizer.transform(Sample["Description"])

    # Make probability predictions, then keep the three most likely class
    # labels for each document.
    predicted_probabilities = classifier.predict_proba(new_text_tfidf)
    top_three_predictions = []
    for probs in predicted_probabilities:
        # Sort the probabilities and get the indices of the top three.
        top_three_indices = sorted(range(len(probs)), key=lambda i: probs[i], reverse=True)[:3]
        # Get the corresponding class labels for those indices.
        top_three_predictions.append([classifier.classes_[i] for i in top_three_indices])

    preds = pd.DataFrame(top_three_predictions).rename(columns={0: "Pred_1", 1: "Pred_2", 2: "Pred_3"})
    results = pd.concat([Sample.reset_index(drop=True), preds], axis=1)
    return assigned_field, results["Pred_1"][0], results["Pred_2"][0], results["Pred_3"][0]
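
# A quick sanity check of the top-three selection used in classify_text
# (a minimal sketch with made-up probabilities, not real classifier output):
#
#   probs = [0.1, 0.5, 0.15, 0.25]
#   sorted(range(len(probs)), key=lambda i: probs[i], reverse=True)[:3]  # -> [1, 3, 2]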

# Build the Gradio interface: one input box and four output boxes
# (the keyword-assigned field plus the top three model predictions).
iface = gr.Interface(
    fn=classify_text,
    inputs=gr.Textbox(label="Description", placeholder="Enter your Service Description", lines=3),
    outputs=[
        gr.Textbox(label="Assigned Field"),
        gr.Textbox(label="Prediction 1"),
        gr.Textbox(label="Prediction 2"),
        gr.Textbox(label="Prediction 3"),
    ],
    title="Text Classification App",
    theme=gr.themes.Default(primary_hue="blue"),  # default theme with a blue primary hue
    examples=[["ilaap"], ["finrep"], ["liquidez"], ["Data governance"]],
)

# Launch the Gradio interface with a public share link.
iface.launch(share=True)
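
# Usage note (assumes this file is saved as app.py; any entry-point name works):
# `python app.py` downloads the NLTK stopwords on first run, then serves the app
# and prints a public share URL. Typing e.g. "liquidez" should return the
# dictionary-matched field (when "liquidez" appears in Dict.xlsx) alongside the
# classifier's top three predicted labels.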