import datetime

import pandas as pd
import gradio as gr
import torch
from transformers import pipeline
from transformers import AutoModelForSequenceClassification, AutoTokenizer

models = {
    'en': 'facebook/bart-large-mnli'
}

hypothesis_templates = {
    'en': 'This example is {}.'
}

# Zero-shot pipeline; only referenced by the commented-out legacy path below.
classifiers = {
    'en': pipeline("zero-shot-classification",
                   hypothesis_template=hypothesis_templates['en'],
                   model=models['en'])
}

# NLI model and tokenizer used for premise/hypothesis entailment checks.
nli_model = AutoModelForSequenceClassification.from_pretrained('facebook/bart-large-mnli')
tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large-mnli')

# Label order matches facebook/bart-large-mnli outputs: contradiction, neutral, entailment.
labels = ["contradicts_hypothesis", "Neutral", "Entails_hypothesis"]


def prep_examples():
    # Example inputs (not currently wired into the Gradio interface).
    example_text1 = "EMI can be skipped"
    example_labels1 = "EMI can be skipped"  # "Entails_hypothesis"
    example_text2 = "minimum package guaranteed"
    example_labels2 = "minimum package guaranteed"  # "Entails_hypothesis"
    example_text3 = "100% placement guarantee"
    example_labels3 = "100% placement guarantee"  # "Entails_hypothesis"
    # example_text1 = "EMI can not be skipped"
    # example_labels1 = "contradicts_hypothesis"

    examples = [
        [example_text1, example_labels1, False],
        [example_text2, example_labels2, False],
        [example_text3, example_labels3, False]]
    return examples


def inference_hypothesis(premise, hypothesis, labels):
    # Encode the premise/hypothesis pair and run the NLI model on CPU.
    x = tokenizer.encode(premise, hypothesis, return_tensors='pt', truncation='only_first')
    with torch.no_grad():
        logits = nli_model(x.to("cpu"))[0]
    # Columns 0/1/2 are contradiction/neutral/entailment for bart-large-mnli.
    entail_contradiction_logits = logits[:, [0, 1, 2]]
    probs = entail_contradiction_logits.softmax(dim=1)
    return premise, hypothesis, labels[probs.argmax()], entail_contradiction_logits


def sequence_to_classify(sequence, hypothesis_df, multi_label):
    # The uploaded CSV is expected to have the columns:
    # filtering_keyword, hypothesis, expected_inference.
    hypothesis_df = pd.read_csv(hypothesis_df.name)
    lang = 'en'
    classifier = classifiers[lang]  # unused in the current keyword/NLI path
    inference_output = {}
    for i, keyword in enumerate(hypothesis_df.filtering_keyword.tolist()):
        # Only test hypotheses whose trigger keyword appears in the input text.
        if keyword.lower() in sequence.lower():
            premise, hypothesis, label, score = inference_hypothesis(
                sequence, hypothesis_df.hypothesis.tolist()[i], labels)
            # Report the match only when the prediction equals the expected inference.
            if label == hypothesis_df.expected_inference.tolist()[i]:
                inference_output[premise] = {"hypothesis": hypothesis,
                                             "label": label,
                                             "score": score}
    return inference_output

    # Unreachable legacy code from an earlier zero-shot-classification version;
    # `response` (presumably the output of classifier(sequence, ...)) is never defined here.
    # predicted_labels = response['labels']
    # predicted_scores = response['scores']
    # clean_output = {idx: float(predicted_scores.pop(0)) for idx in predicted_labels}
    # print("Date:{}, Sequence:{}, Labels: {}".format(
    #     str(datetime.datetime.now()), sequence, predicted_labels))
    # if not multi_label:
    #     top_label_key = list(clean_output.keys())[0]
    #     clean_output = {top_label_key: clean_output[top_label_key]}
    # return clean_output


def csv_to_df(file):
    return pd.read_csv(file)


def csv_to_json(df):
    return df.to_json(orient="records")


iface = gr.Interface(
    title="Sales Call Analysis AI - NS AI LABS",
    description="Off-the-shelf NLP classifier with no domain- or task-specific training.",
    fn=sequence_to_classify,
    inputs=[gr.Textbox(lines=10,
                       label="Please enter the text you would like to classify...",
                       placeholder="Text here..."),
            gr.File(label="Hypotheses CSV"),
            gr.Checkbox(label="Multi-label?")],
    outputs=gr.Textbox())

iface.launch()
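
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the app). The CSV read by
# sequence_to_classify is assumed to hold one row per hypothesis with the
# three columns the function accesses: filtering_keyword, hypothesis and
# expected_inference (one of the strings in `labels`). The file name
# "hypotheses.csv" and the _UploadedFile stand-in are hypothetical; Gradio's
# File component passes an object exposing the temp-file path as `.name`.
#
# Example hypotheses.csv:
#   filtering_keyword,hypothesis,expected_inference
#   emi,EMI can be skipped,Entails_hypothesis
#   placement,100% placement guarantee,Entails_hypothesis
#
# Programmatic call, bypassing the UI:
#   class _UploadedFile:
#       name = "hypotheses.csv"
#
#   print(sequence_to_classify("Our counsellor said the EMI can be skipped",
#                              _UploadedFile(), False))
# ---------------------------------------------------------------------------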