# pd4consultingmyles's picture
# Update app.py
# 148e02d
import gradio as gr
from transformers import pipeline
import nltk
from nltk.tokenize import sent_tokenize
import openai
import json
import torch
import time
# sent_tokenize needs the Punkt sentence-tokenizer data. Newer NLTK releases
# load it from the 'punkt_tab' resource rather than 'punkt', so both are
# requested; with its default arguments nltk.download reports a failed or
# unknown resource instead of raising, so the extra request is harmless on
# older NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')

# Zero-shot-classification models
# This pipeline classifies individual sentences. multi_label=True because a
# sentence can belong to several criteria at once.
classifier_pipeline = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    framework="pt",
    multi_label=True,
)
# This pipeline classifies the whole text (title + abstract) against all
# criteria in single-label mode. It reuses the model and tokenizer already
# loaded above so the checkpoint is only held in memory once.
include_pipeline = pipeline(
    "zero-shot-classification",
    model=classifier_pipeline.model,
    tokenizer=classifier_pipeline.tokenizer,
    framework="pt",
)
# Prompt prefixes used to shorten a criterion, keyed by preprocessing style.
_SIMPLIFY_PROMPTS = {
    "keyphrase": "Turn this criteria into a simple label: \n\n",
    "keyword": "Summarize this criteria using keywords \n\n",
}
# Criteria of at most this many characters are used as labels unchanged.
_SIMPLIFY_MIN_LENGTH = 30


def _scores_above(result, threshold=0):
    '''
    Changes the results from the zero-shot-classification into a dictionary
    Inputs: zero-shot classification results (dict with "labels" and "scores"),
    threshold: only scores strictly above this value are kept (0-1)
    Output: label:score dict
    '''
    return {
        label: score
        for label, score in zip(result["labels"], result["scores"])
        if score > threshold
    }


def _split_criteria(raw):
    '''
    Splits a "|"-separated criteria string into a list of criteria
    Inputs: raw criteria string (may be empty or None)
    Output: list of whitespace-stripped criteria, blank entries dropped
    '''
    if not raw:
        return []
    return [part.strip() for part in raw.split("|") if part.strip()]


def _simplify_criterion(criterion, style):
    '''
    Simplifies one criterion into a shorter label using the OpenAI completion API
    Inputs: criterion (str), style ("none", "keyword" or "keyphrase")
    Output: simplified label; the criterion itself when the style is "none" or
    unrecognised, when the criterion is already short, or when the model
    returns nothing usable
    '''
    prompt_prefix = _SIMPLIFY_PROMPTS.get(style)
    if prompt_prefix is None or len(criterion) <= _SIMPLIFY_MIN_LENGTH:
        return criterion
    # NOTE(review): this is the legacy openai.Completion interface with
    # text-davinci-002 -- confirm both are still available in the deployed
    # openai package version.
    response = openai.Completion.create(
        model="text-davinci-002",
        prompt=prompt_prefix + criterion,
        temperature=0,
        max_tokens=120,
        # Only the first choice is ever read, so request a single completion.
        n=1,
    )
    simplified = response["choices"][0]["text"].replace("\n", "").strip()
    print(simplified)
    # An empty label would be meaningless to the classifier; fall back.
    return simplified or criterion


def get_nlp_score_old(title, abstract, inclusion_criteria, exclusion_criteria, style, verbose=True):
    '''
    This function takes the title and abstract of a study along with the
    inclusion and exclusion criteria and returns the score on whether the study
    should be included or excluded from the review along with the reasoning
    in the form of dictionary of criteria:sentence pairs
    Inputs: title of study (str), abstract of study (str),
    inclusion_criteria of review (str, criteria separated by "|"),
    exclusion_criteria of review (str, criteria separated by "|"),
    style: how criteria are preprocessed ("none", "keyword" or "keyphrase"),
    verbose controls whether the reasoning dictionary is built
    Outputs: tuple (score, reasoning). score is the summed whole-text
    classification score of the inclusion criteria, times 10 and rounded.
    reasoning maps each criterion to the sentences that matched it with a
    score above .75 (an empty list when verbose is false).
    '''
    included = _split_criteria(inclusion_criteria)
    excluded = _split_criteria(exclusion_criteria)
    # Inclusion and exclusion criteria are classified together so the
    # classification does not have to run twice. dict.fromkeys drops repeated
    # criteria while keeping their order.
    criteria = list(dict.fromkeys(included + excluded))

    reasoning = {crit: [] for crit in criteria} if verbose else []
    if not criteria:
        # Nothing to classify against; the pipelines cannot run with no labels.
        return 0, reasoning

    # Map every simplified label back to the original criteria it stands for.
    # Several criteria may simplify to the same label, hence the list.
    label_to_criteria = {}
    for crit in criteria:
        label_to_criteria.setdefault(_simplify_criterion(crit, style), []).append(crit)
    labels = list(label_to_criteria)

    # Title and abstract are scored as one text.
    combined = (title or "").strip("\n") + ". " + (abstract or "").strip("\n")

    if verbose:
        # Record which sentences match which criteria.
        for sentence in sent_tokenize(combined):
            result = classifier_pipeline(sentence, labels)
            # Keep only the relevant criteria for the sentence (score above .75).
            for label in _scores_above(result, threshold=.75):
                for crit in label_to_criteria[label]:
                    reasoning[crit].append(sentence)

    # Runs the zero-shot classification across both the abstract and title.
    incl_results = include_pipeline(combined, labels)
    inclusion_set = set(included)
    # The pipeline returns labels ordered by score, not in input order, so each
    # score is attributed through its label rather than its position.
    nlp_score = sum(
        score
        for label, score in zip(incl_results["labels"], incl_results["scores"])
        if any(crit in inclusion_set for crit in label_to_criteria[label])
    )
    return round(nlp_score * 10), reasoning
# Gradio front end for trying get_nlp_score_old by hand.
# NOTE(review): the container nesting below (which widgets sit in which
# Row/Column) was reconstructed from a copy of this file that had lost its
# indentation -- confirm the layout against the deployed app.
with gr.Blocks() as demo:
    gr.Markdown(
        "\n"
        "# SLR Test Environment\n"
        "This dashboard is for testing the nlp algorithm for the SLR tool\n"
        "Separate Criteria by | e.g. (China | Machine Learning | Landslide)\n"
    )
    with gr.Row():
        title = gr.Textbox(label="Title")
        abstr = gr.Textbox(label="Abstract")
        incl = gr.Textbox(label="Inclusion Criteria")
        excl = gr.Textbox(label="Exclusion Criteria")
        # Default to "none" so the callback never receives an unselected
        # (None) style when the user submits without touching the radio.
        style = gr.Radio(
            ["none", "keyword", 'keyphrase'],
            value="none",
            label="Criteria Preprocessing",
            info="How the criteria are preproccessed before going into the NLP algorithm",
        )
    with gr.Row():
        with gr.Column():
            score = gr.Textbox(label="NLP Score")
            criteria = gr.Textbox(label="Reasoning")
    btn = gr.Button("Submit")
    # Inputs are passed positionally: title, abstract, inclusion criteria,
    # exclusion criteria, style; the two return values fill score and reasoning.
    btn.click(fn=get_nlp_score_old, inputs=[title, abstr, incl, excl, style], outputs=[score, criteria])
demo.launch()