from nltk.tokenize import sent_tokenize, word_tokenize
import pandas as pd

# read in an essay and return a list of its sentences
def essay_to_sent(essay):
    sentences = []
    # split the essay into paragraphs on newlines, dropping empty lines
    paragraphs = [l for l in essay.split('\n') if len(l) > 0]
    for para in paragraphs:
        # sentence-tokenize each paragraph and concatenate to sentences[]
        sentences.extend(sent_tokenize(para))
    return sentences
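# Example (assumes NLTK's punkt sentence model has been downloaded,
# e.g. via `import nltk; nltk.download('punkt')`):
#   essay_to_sent("Dogs are loyal. They guard the house.\nCats are independent.")
#   -> ['Dogs are loyal.', 'They guard the house.', 'Cats are independent.']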
######################
# Prerequisites:
# 1. pip install transformers
# 2. Define tokenizer + MAX_LEN
# 3. Construct DistillBERTClass_SL class
# 4. Construct Triage_SL class
# 5. Define predict_SL
# 6. Load model_SL & call eval()
# 7. Pre-define predict_params_SL
######################
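# For orientation, a minimal, hypothetical sketch of the DistillBERTClass_SL
# model referenced above (the real class is defined elsewhere in this Space;
# the 768-dim head and 0.3 dropout here are assumptions, not the Space's
# actual values): a DistilBERT encoder with a single-logit sentence-level head.
import torch
from transformers import DistilBertModel

class DistillBERTClass_SL(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.l1 = DistilBertModel.from_pretrained('distilbert-base-cased')
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 1)

    def forward(self, input_ids, attention_mask):
        hidden = self.l1(input_ids=input_ids, attention_mask=attention_mask)[0]
        pooled = hidden[:, 0]  # hidden state at the [CLS] position
        pooled = torch.nn.functional.relu(self.pre_classifier(pooled))
        pooled = self.dropout(pooled)
        return self.classifier(pooled)  # raw logit; sigmoid is applied downstream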
from transformers import DistilBertTokenizer
from transformers import pipeline

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')

def predict_mainidea_sent(paragraph, model):
    # prepare data: split the paragraph into sentences
    sentences = essay_to_sent(paragraph)

    # run a text-classification pipeline over the sentences in batches
    pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device="cpu")
    probability_scores = pipe(sentences, batch_size=8, function_to_apply="sigmoid")

    # a sentence counts as a main idea when its sigmoid score exceeds 0.5
    labels = [score['score'] > 0.5 for score in probability_scores]
    return pd.DataFrame([(str(l), s) for l, s in zip(labels, sentences)],
                        columns=['label', 'sentence'])
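# Example usage (the checkpoint id below is a placeholder, not this Space's
# actual fine-tuned model; substitute your own):
if __name__ == "__main__":
    from transformers import AutoModelForSequenceClassification

    model_SL = AutoModelForSequenceClassification.from_pretrained(
        "your-username/mainidea-distilbert-sl"  # hypothetical repo id
    )
    model_SL.eval()

    essay = "Dogs are loyal companions. They guard the house.\nCats are independent."
    # prints a DataFrame with one row per sentence and columns ['label', 'sentence']
    print(predict_mainidea_sent(essay, model_SL))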