Spaces:

zmbfeng
/

testchatbot

Sleeping

App Files Files Community

testchatbot / app.py

zmbfeng

keywords extraction and intention classification

6465bf9 26 days ago

raw history blame contribute delete

No virus

8.89 kB

	import gradio as gr
	import random
	import os
	import copy
	import torch
	from huggingface_hub import login
	from transformers import pipeline
	from transformers import GPT2Tokenizer, GPT2LMHeadModel,set_seed
	from transformers import AutoTokenizer, AutoModelWithLMHead,AutoModelForSeq2SeqLM
	import datetime
	import nltk
	# Fetch the NLTK data used by the keyword extractor at start-up.
	# 'punctuation' is not an NLTK resource id, so requesting it could only fail
	# with a "not found in index" message; it is therefore no longer requested.
	# NOTE(review): newer NLTK releases may additionally need 'punkt_tab' --
	# confirm against the NLTK version installed for this app.
	nltk.download('stopwords')
	nltk.download('punkt')
	from rake_nltk import Rake

	# Authenticate with the Hugging Face Hub.  The token is read from the
	# HF_TOKEN environment variable; a missing variable raises KeyError here.
	login(os.environ["HF_TOKEN"])


	#https://huggingface.co/facebook/opt-1.3b
	#generator = pipeline('text-generation', model="microsoft/DialoGPT-medium")

	# dt stores the current date and time (printed as a start-up marker)
	dt = datetime.datetime.now()
	print(dt)
	print("loading models")

	# Zero-shot classifier used by create_response_intention_classification.
	classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

	# NOTE(review): tokenizer, original_model and untethered_model are not
	# referenced by any function visible in this file -- confirm before removing,
	# since loading them costs start-up time and memory.
	tokenizer = GPT2Tokenizer.from_pretrained('microsoft/DialoGPT-medium')
	original_model = GPT2LMHeadModel.from_pretrained('microsoft/DialoGPT-medium')
	untethered_model = GPT2LMHeadModel.from_pretrained('zmbfeng/untethered_20240225_epochs_500')

	# Question generation (T5).  AutoModelWithLMHead is deprecated in
	# transformers; the seq2seq auto class is used instead, matching how the
	# paraphrase T5 model is loaded just below.
	question_generation_tokenizer = AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-question-generation-ap")
	question_generation_model = AutoModelForSeq2SeqLM.from_pretrained("mrm8488/t5-base-finetuned-question-generation-ap")

	# Paraphrasing (T5).
	paraphrase_tokenizer = AutoTokenizer.from_pretrained("Vamsi/T5_Paraphrase_Paws")
	paraphrase_model = AutoModelForSeq2SeqLM.from_pretrained("Vamsi/T5_Paraphrase_Paws")

	# tokenizer = GPT2Tokenizer.from_pretrained('microsoft/DialoGPT-medium',cache_dir="G:\My Drive\Avatar\language_models_windows")
	# original_model = GPT2LMHeadModel.from_pretrained('microsoft/DialoGPT-medium',cache_dir="G:\My Drive\Avatar\language_models_windows")
	# untethered_model = GPT2LMHeadModel.from_pretrained('zmbfeng/untethered_20240225_epochs_500',cache_dir="G:\My Drive\Avatar\language_models_windows")
	# question_generation_tokenizer = AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-question-generation-ap",cache_dir="G:\\My Drive\\Avatar\\language_models_windows")
	# question_generation_model = AutoModelWithLMHead.from_pretrained("mrm8488/t5-base-finetuned-question-generation-ap",cache_dir="G:\\My Drive\\Avatar\\language_models_windows")
	# paraphrase_tokenizer = AutoTokenizer.from_pretrained("Vamsi/T5_Paraphrase_Paws",cache_dir="G:\\My Drive\\Avatar\\language_models_windows")
	# paraphrase_model = AutoModelForSeq2SeqLM.from_pretrained("Vamsi/T5_Paraphrase_Paws",cache_dir="G:\\My Drive\\Avatar\\language_models_windows")

	# tokenizer = GPT2Tokenizer.from_pretrained('microsoft/DialoGPT-medium',cache_dir="C:\\Users\\zmbfeng\\Google Drive\\language_models_windows")
	# original_model = GPT2LMHeadModel.from_pretrained('microsoft/DialoGPT-medium',cache_dir="C:\\Users\\zmbfeng\\Google Drive\\Avatar\\language_models_windows")
	# untethered_model = GPT2LMHeadModel.from_pretrained('zmbfeng/untethered_20240225_epochs_500',cache_dir="C:\\Users\\zmbfeng\\Google Drive\\Avatar\\language_models_windows")
	# question_generation_tokenizer = AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-question-generation-ap",cache_dir="C:\\Users\\zmbfeng\\Google Drive\\Avatar\\language_models_windows")
	# question_generation_model = AutoModelWithLMHead.from_pretrained("mrm8488/t5-base-finetuned-question-generation-ap",cache_dir="C:\\Users\\zmbfeng\\Google Drive\\Avatar\\language_models_windows")
	# paraphrase_tokenizer = AutoTokenizer.from_pretrained("Vamsi/T5_Paraphrase_Paws",cache_dir="C:\\Users\\zmbfeng\\Google Drive\\Avatar\\language_models_windows")
	# paraphrase_model = AutoModelForSeq2SeqLM.from_pretrained("Vamsi/T5_Paraphrase_Paws",cache_dir="C:\\Users\\zmbfeng\\Google Drive\\Avatar\\language_models_windows")
	# Default sampling settings.
	# NOTE(review): neither constant is referenced by the code visible in this
	# file -- confirm whether they are still needed.
	default_temperature = 0.01
	default_seed = 43
	def create_response_question_generation(input_str, max_length=64):
	    """Generate a question from *input_str* with the question-generation model.

	    The same text is supplied as both the ``answer`` and the ``context``
	    part of the prompt.

	    Args:
	        input_str: Statement to turn into a question.
	        max_length: Maximum length passed to ``generate``.  It is coerced to
	            ``int`` because the ``gr.Number`` input this function is wired
	            to may deliver a float.

	    Returns:
	        The first generated sequence, decoded with the tokenizer's default
	        decode settings.
	    """
	    # Same coercion create_response_paraphrase applies to its numeric inputs.
	    max_length = int(max_length)

	    input_text = "answer: %s context: %s </s>" % (input_str, input_str)
	    print(f"create question input_text={input_text}")
	    features = question_generation_tokenizer([input_text], return_tensors='pt')

	    output = question_generation_model.generate(
	        input_ids=features['input_ids'],
	        attention_mask=features['attention_mask'],
	        max_length=max_length,
	    )

	    return question_generation_tokenizer.decode(output[0])

	def create_response_paraphrase(input_str, max_length, num_return_sequences):
	    """Return sampled paraphrases of *input_str* as an HTML fragment.

	    Args:
	        input_str: Text to paraphrase.
	        max_length: Maximum length passed to ``generate`` (coerced to int).
	        num_return_sequences: Number of paraphrases to sample (coerced to
	            int).

	    Returns:
	        A string in which every decoded paraphrase is HTML-escaped and
	        followed by ``<br/>``.
	    """
	    import html  # local import: only needed to escape the HTML output

	    # The values come from gr.Number inputs; ensure they are integers.
	    num_return_sequences = int(num_return_sequences)
	    max_length = int(max_length)

	    text = "paraphrase: " + input_str + " </s>"

	    # The deprecated `encode_plus(..., pad_to_max_length=True)` call is
	    # replaced by calling the tokenizer with the `padding` argument.  With a
	    # single input sequence, padding to the longest sequence adds no padding.
	    encoding = paraphrase_tokenizer(text, padding=True, return_tensors="pt")

	    outputs = paraphrase_model.generate(
	        input_ids=encoding["input_ids"],
	        attention_mask=encoding["attention_mask"],
	        max_length=max_length,
	        do_sample=True,
	        top_k=120,
	        top_p=0.95,
	        early_stopping=True,
	        num_return_sequences=num_return_sequences,
	        repetition_penalty=1.5,
	    )

	    decoded_lines = (
	        paraphrase_tokenizer.decode(
	            output, skip_special_tokens=True, clean_up_tokenization_spaces=True
	        )
	        for output in outputs
	    )
	    # The result is rendered by an "html" output component, so model text is
	    # escaped to keep '<', '>' and '&' from being interpreted as markup.
	    return "".join(html.escape(line, quote=False) + "<br/>" for line in decoded_lines)
	import string


	def contains_digit_or_punctuation(s):
	    """Return True if any character of *s* is a digit or ASCII punctuation.

	    A character counts as a digit when ``str.isdigit`` says so, and as
	    punctuation when it appears in ``string.punctuation``.  An empty string
	    yields False.
	    """
	    for ch in s:
	        if ch in string.punctuation or ch.isdigit():
	            return True
	    return False
	# Shared keyword extractor, created once with Rake's default settings.
	rake = Rake()


	def create_response_keywords_extraction(input_str):
	    """Extract ranked key phrases from *input_str* as an HTML fragment.

	    A phrase is kept only if its score is greater than 1, it contains no
	    digit or punctuation character, and it has not already been emitted.
	    Phrases appear in the order Rake ranks them.

	    Returns:
	        One ``Score: ..., Keyword: ... <br/>`` entry per kept phrase,
	        concatenated; an empty string when nothing passes the filters.
	    """
	    rake.extract_keywords_from_text(input_str)

	    already_emitted = set()
	    html_rows = []
	    for score, keyword in rake.get_ranked_phrases_with_scores():
	        if score <= 1:
	            continue
	        if contains_digit_or_punctuation(keyword):
	            continue
	        if keyword in already_emitted:
	            continue
	        already_emitted.add(keyword)
	        html_rows.append(f"Score: {score}, Keyword: {keyword} <br/>")

	    return "".join(html_rows)

	def create_response_intention_classification(input_str):
	    """Score *input_str* against the two intention labels.

	    The module-level zero-shot ``classifier`` is called with the candidate
	    labels "dialogue" and "long content generation".

	    Returns:
	        An HTML fragment with one ``Label: ..., Score: ... <br/>`` entry per
	        label, in the order the classifier returns them, scores formatted to
	        four decimal places.
	    """
	    candidate_labels = ["dialogue", "long content generation"]
	    prediction = classifier(input_str, candidate_labels)

	    html_rows = [
	        f"Label: {label}, Score: {score:.4f} <br/>"
	        for label, score in zip(prediction["labels"], prediction["scores"])
	    ]
	    return "".join(html_rows)

	# Tab: question generation.  `precision=0` makes the Number component round
	# to the nearest integer and pass an int to the callback, which forwards the
	# value to `generate(max_length=...)`.
	interface_question_generation = gr.Interface(
	    fn=create_response_question_generation,
	    title="Question Generation",
	    description="Enter a statement like Paris is the capital of France",
	    inputs=[
	        gr.Textbox(label="input text here", lines=3, value="Paris is the capital of France"),
	        gr.Number(label="max length", value=64, precision=0),
	    ],
	    outputs="html",
	)



	# Tab: paraphrasing.  Both numeric inputs use `precision=0` so the callback
	# receives integers for `max_length` and `num_return_sequences`.
	interface_paraphrase = gr.Interface(
	    fn=create_response_paraphrase,
	    title="Paraphrase",
	    description="Paraphrase sentences",
	    inputs=[
	        gr.Textbox(label="input text here", lines=3, value="It is truly a great cosmic paradox that one of the best teachers in all of life turns out to be death. No person or situation could ever teach you as much as death has to teach you. "),
	        gr.Number(label="max length", value=512, precision=0),
	        gr.Number(label="num of responses", value=2, precision=0),
	    ],
	    outputs="html",
	)
	# Tab: keyword extraction.  A single text box feeds
	# create_response_keywords_extraction; the returned string is shown as HTML.
	interface_extract_keywords = gr.Interface(
	    fn=create_response_keywords_extraction,
	    title="Extract Keywords",
	    description="Extract Keywords ",
	    inputs=[
	        gr.Textbox(
	            label="input text here",
	            lines=3,
	            value="It is truly a great cosmic paradox that one of the best teachers in all of life turns out to be death. No person or situation could ever teach you as much as death has to teach you. ",
	        ),
	    ],
	    outputs="html",
	)
	# Tab: intention classification.  A single text box feeds
	# create_response_intention_classification; the returned string is shown as
	# HTML.
	interface_intention_classification = gr.Interface(
	    fn=create_response_intention_classification,
	    title="Intention Classification",
	    description="Find if question intention is short dialog or long content generation. How are you? versus What are the implications of quantum computing on global security? (difference not very dramatic as of now)",
	    inputs=[
	        gr.Textbox(
	            label="input text here",
	            lines=3,
	            value="What are the implications of quantum computing on global security?",
	        ),
	    ],
	    outputs="html",
	)


	# Combine the four single-purpose interfaces into one tabbed app; the tab
	# names are listed in the same order as the interfaces.
	demo = gr.TabbedInterface(
	    [
	        interface_question_generation,
	        interface_paraphrase,
	        interface_extract_keywords,
	        interface_intention_classification,
	    ],
	    [
	        "Question Generation",
	        "Paraphrase",
	        "Keywords Extraction",
	        "Intention Classification",
	    ],
	)

	demo.launch()