Spaces:

pd4consultingmyles
/

nlp_test_environment

Sleeping

App Files Files Community

nlp_test_environment / app.py

pd4consultingmyles

Update app.py

148e02d over 2 years ago

raw

history blame contribute delete

5.89 kB

	import gradio as gr
	from transformers import pipeline
	import nltk
	from nltk.tokenize import sent_tokenize
	import openai
	import json
	import torch
	import time

	# Download the 'punkt' NLTK resource at start-up (presumably the data that
	# sent_tokenize needs -- confirm against the NLTK docs).
	nltk.download('punkt')

	# Zero-shot-classification models
	# Both pipelines are built from the same checkpoint and framework, so the
	# shared constructor arguments live in one place.
	_ZERO_SHOT_KWARGS = {"model": "facebook/bart-large-mnli", "framework": "pt"}

	# Sentence-level classifier: multi_label=True because a single sentence can
	# belong to several criteria at once.
	classifier_pipeline = pipeline(
	    "zero-shot-classification",
	    multi_label=True,
	    **_ZERO_SHOT_KWARGS,
	)

	# Whole-text classifier used to score the study as include or exclude.
	include_pipeline = pipeline("zero-shot-classification", **_ZERO_SHOT_KWARGS)

	# Prompt prefixes sent to the completion model for each preprocessing style.
	# Any style that is not a key here (including "none" and None) means
	# "leave the criterion untouched".
	_SIMPLIFY_PROMPTS = {
	    "keyphrase": "Turn this criteria into a simple label: \n\n",
	    "keyword": "Summarize this criteria using keywords \n\n",
	}


	def _split_criteria(raw):
	    '''
	    Splits a user-supplied criteria string into a list of clean criteria

	    Inputs: raw criteria string, criteria separated by "|" (a markdown-escaped
	    "\\|" is accepted as well); None is treated as an empty string

	    Output: list of non-empty, whitespace-stripped criteria (may be empty)
	    '''
	    pieces = (raw or "").split("|")
	    # A trailing backslash is what is left of a "\|" separator once the text
	    # has been split on "|", so it is dropped together with the whitespace.
	    cleaned = (piece.strip().rstrip("\\").strip() for piece in pieces)
	    return [piece for piece in cleaned if piece]


	def _result_to_dict(result, threshold=0):
	    '''
	    Changes the results from the zero-shot-classification into a dictionary

	    Inputs: zero-shot classification results (mapping with "labels" and
	    "scores"), threshold below or at which scores are dropped (0-1)

	    Output: label:score dict holding only the scores strictly above threshold
	    '''
	    return {
	        label: score
	        for label, score in zip(result["labels"], result["scores"])
	        if score > threshold
	    }


	def _simplify_criterion(criterion, style):
	    '''
	    Simplifies one criterion using the OpenAI completion endpoint

	    Inputs: criterion (str), style ("none", "keyword" or "keyphrase")

	    Output: simplified criterion, or the criterion unchanged when the style
	    asks for no preprocessing (or is unknown/None), when the criterion is 30
	    characters or shorter, or when the model returns an empty completion
	    '''
	    prompt_prefix = _SIMPLIFY_PROMPTS.get(style)
	    if prompt_prefix is None or len(criterion) <= 30:
	        return criterion

	    response = openai.Completion.create(
	        model="text-davinci-002",
	        prompt=prompt_prefix + criterion,
	        temperature=0,
	        max_tokens=120,
	        # Only the first choice is ever read, so a single completion is enough.
	        n=1,
	    )
	    simplified = response["choices"][0]["text"].replace("\n", "").strip()
	    print(simplified)
	    # An empty label is useless to the classifier; fall back to the original.
	    return simplified or criterion


	def get_nlp_score_old(title, abstract, inclusion_criteria, exclusion_criteria, style, verbose=True):
	    '''
	    This function takes the title and abstract of a study along with the
	    inclusion and exclusion criteria and returns the score on whether the study
	    should be included or excluded from the review along with the reasoning
	    in the form of a dictionary of criteria:sentences pairs

	    Inputs: title of study (str), abstract of study (str),
	    inclusion_criteria and exclusion_criteria of the review (str, criteria
	    separated by "|"), style of criteria preprocessing ("none", "keyword" or
	    "keyphrase"), verbose controls whether the reasoning dictionary is built

	    Outputs: tuple of (nlp score, reasoning). The score is the summed
	    whole-text score of the inclusion criteria, multiplied by 10 and rounded.
	    The reasoning maps every criterion to the sentences that matched it with
	    a score above .75 when verbose is true, and is an empty list otherwise.
	    When no usable criteria are given the score is 0 and the reasoning is
	    empty, without running any model.
	    '''
	    included = _split_criteria(inclusion_criteria)
	    excluded = _split_criteria(exclusion_criteria)

	    # combines inclusion and exclusion criteria into one. This is so we dont
	    # have to run the classification twice
	    criteria = included + excluded
	    if not criteria:
	        # The classifiers cannot run without candidate labels.
	        return 0, ({} if verbose else [])

	    simpl_crit = [_simplify_criterion(crit, style) for crit in criteria]

	    # maps each simplified label back to the criterion it was derived from
	    crit_dict = dict(zip(simpl_crit, criteria))

	    # combines the title and the abstract into one str
	    combined = title.strip("\n") + ". " + abstract.strip("\n")

	    if verbose:
	        # categorizes the sentences to return which sentences match which criteria
	        sent_cat = {crit: [] for crit in criteria}
	        for sentence in sent_tokenize(combined):
	            result = classifier_pipeline(sentence, simpl_crit)
	            # keeps only the relevant criteria for the sentence (score above .75)
	            for label in _result_to_dict(result, threshold=.75):
	                sent_cat[crit_dict[label]].append(sentence)
	    else:
	        sent_cat = []

	    # runs the zero-shot classification across both the abstract and title
	    incl_results = include_pipeline(combined, simpl_crit)

	    # The pipeline returns its labels ordered by score, not in the order they
	    # were passed in (see the transformers documentation), so every score is
	    # matched to its criterion through the label itself instead of by position.
	    inclusion_labels = set(simpl_crit[:len(included)])
	    nlp_score = sum(
	        score
	        for label, score in zip(incl_results["labels"], incl_results["scores"])
	        if label in inclusion_labels
	    )

	    return round(nlp_score * 10), sent_cat

	# Gradio dashboard for trying out get_nlp_score_old by hand.
	# NOTE(review): the row/column nesting below is reconstructed from a source
	# whose indentation was lost -- confirm it matches the intended layout.
	with gr.Blocks() as demo:
	    # Raw string so that "\|" is kept verbatim instead of being an invalid
	    # escape sequence; the text itself is unchanged.
	    gr.Markdown(
	        r'''
	# SLR Test Environment
	This dashboard is for testing the nlp algorithm for the SLR tool
	Separate Criteria by \| e.g. (China \| Machine Learning \| Landslide)
	'''
	    )
	    with gr.Row():
	        title = gr.Textbox(label="Title")
	        abstr = gr.Textbox(label="Abstract")
	        incl = gr.Textbox(label="Inclusion Criteria")
	        excl = gr.Textbox(label="Exclusion Criteria")
	        # Default to "none" so the callback never receives None when the user
	        # submits without picking a preprocessing style.
	        style = gr.Radio(["none", "keyword", 'keyphrase'], value="none", label="Criteria Preprocessing", info="How the criteria are preproccessed before going into the NLP algorithm")
	    with gr.Row():
	        with gr.Column():
	            score = gr.Textbox(label="NLP Score")
	            criteria = gr.Textbox(label="Reasoning")

	    btn = gr.Button("Submit")
	    # The two return values of get_nlp_score_old fill the score and reasoning boxes.
	    btn.click(fn=get_nlp_score_old, inputs=[title, abstr, incl, excl, style], outputs=[score, criteria])

	demo.launch()