"""Gradio demo: plain-language summarization and keyphrase elaboration for
scientific abstracts, powered by a Guanaco (LLaMA-7B base + LoRA adapters) model.

For each (title, abstract) pair submitted through the UI the app:
  1. summarizes the title + first two abstract sentences in plain language,
  2. extracts keyphrases with KeyBERT and asks the model to elaborate on them.
Intermediate artifacts are written under ``data/`` as in the original pipeline
so that downstream scripts (e.g. classifier.py) can consume them.
"""
import torch
import ujson as json  # ujson is a faster drop-in for the parts of stdlib json used here
import gradio as gr
import nltk
from nltk.tokenize import sent_tokenize
from transformers import AutoModelForCausalLM, LlamaTokenizer
from peft import PeftModel
from keybert import KeyBERT
from keyphrase_vectorizers import KeyphraseCountVectorizer

nltk.download('punkt')  # sentence-tokenizer data required by sent_tokenize

# --- Load Guanaco 7B (takes ~2-3 minutes) ----------------------------------
model_name = "llama-7b-hf"
adapters_name = 'guanaco-7b'

m = AutoModelForCausalLM.from_pretrained(
    model_name,
    # load_in_4bit=True,  # enable for 4-bit quantized loading
    torch_dtype=torch.bfloat16,
    device_map='auto',
)
m = PeftModel.from_pretrained(m, adapters_name)
m = m.merge_and_unload()  # fold the LoRA adapter weights into the base model

tok = LlamaTokenizer.from_pretrained(model_name)
tok.bos_token_id = 1
stop_token_ids = [0]

print('Guanaco model loaded into memory.')

# Marker that precedes the assistant's reply in the decoded chat transcript.
_ASSISTANT_MARKER = "### Assistant: "


def _strip_abstract_prefix(text):
    """Drop a leading 'abstract' / 'abstract.' label (case-insensitive)."""
    lowered = text.lower()
    if lowered.startswith("abstract."):
        return text[len("abstract."):]
    if lowered.startswith("abstract"):
        return text[len("abstract"):]
    return text


def _ask_guanaco(user_message, max_new_tokens=300):
    """Send a single chat turn to the model and return the assistant's reply.

    The decoded transcript is trimmed to the text after the assistant marker,
    a leading boilerplate "Certainly!" is removed, and any trailing partial
    sentence is cut at the last period.
    """
    formatted_prompt = (
        f"A chat between a curious human and an artificial intelligence assistant."
        f"The assistant gives helpful, detailed, and polite answers to the user's questions.\n"
        f"### Human: {user_message} \n"
        f"### Assistant:"
    )
    # BUGFIX: with device_map='auto' the inputs must live on the model's own
    # device; the previous hard-coded "cuda:1"/"cuda:2" crashed on machines
    # with fewer than three GPUs.
    inputs = tok(formatted_prompt, return_tensors="pt").to(m.device)
    outputs = m.generate(inputs=inputs.input_ids, max_new_tokens=max_new_tokens)
    output = tok.decode(outputs[0], skip_special_tokens=True)

    marker_at = output.find(_ASSISTANT_MARKER)
    # If the marker is somehow absent, fall back to the whole output instead
    # of silently slicing from index len(marker) - 1.
    start = 0 if marker_at == -1 else marker_at + len(_ASSISTANT_MARKER)
    if output[start:start + len("Certainly!")] == "Certainly!":
        start += len("Certainly!")
    end = output.rfind('.') + 1  # drop any trailing unfinished sentence
    return output[start:end]


def generate(title, abstract):
    """Gradio callback.

    Parameters:
        title: paper title string.
        abstract: paper abstract string (an optional leading "abstract"
            label is stripped).

    Returns:
        (keyword_string, keyphrase_elaboration, plain_language_summary) —
        the first two are empty strings when the abstract has fewer than
        two sentences.
    """
    print("Started running.")

    # --- Sentence-tokenize the abstract and persist it for classifier.py. ---
    sentences = sent_tokenize(_strip_abstract_prefix(abstract))
    newline = {"target": sentences, "title": title}
    with open("data/sample-data.jsonl", "w") as f:
        f.write(json.dumps(newline))
    print(newline)
    print("Tokenized abstract to sentences.")

    # --- Build the summarization document: title + up to two sentences. -----
    too_short_for_keyword = len(sentences) <= 1
    if len(sentences) > 1:
        doc = title + ". " + sentences[0] + " " + sentences[1]
    elif len(sentences) == 1:
        doc = title + ". " + sentences[0]
    else:
        doc = title

    # --- Plain-language summary. --------------------------------------------
    prompt = (
        "Can you explain the main idea of what is being studied in the "
        "following paragraph for someone who is not familiar with the topic. "
        "Comment on areas of application.: "
    )
    response = _ask_guanaco(prompt + doc)
    with open("data/guanacoSummaryOutput.txt", "w") as f2:
        f2.write(response)
    print('Plain Language Summary Created.')

    # --- Keyphrase extraction + elaboration (needs >= 2 sentences). ---------
    # BUGFIX: previously extraction ran even for too-short abstracts, where
    # the document was empty and KeyBERT failed.
    keyword_string = ""
    response_two = ""
    if not too_short_for_keyword:
        kw_model = KeyBERT(model="all-MiniLM-L6-v2")
        vectorizer = KeyphraseCountVectorizer()
        top_n = 2
        keywords = kw_model.extract_keywords(
            doc,
            stop_words="english",
            top_n=top_n,
            vectorizer=vectorizer,
            use_mmr=True,
        )
        # Keep a phrase only when it is not a substring of another phrase.
        # BUGFIX: iterate over len(keywords), not top_n — KeyBERT may return
        # fewer phrases than requested.
        my_keywords = [
            phrase for i, (phrase, _score) in enumerate(keywords)
            if all(i == j or phrase not in other
                   for j, (other, _s) in enumerate(keywords))
        ]
        for entry in my_keywords:
            print(entry)

        keyword_string = ', '.join(my_keywords)
        prompt_two = (
            "What is the purpose of studying " + keyword_string
            + "? Comment on areas of application."
        )
        response_two = _ask_guanaco(prompt_two)
        with open("data/guanacoElaborationOutput.txt", "w") as f2:
            f2.write(response_two)
        print('Keyphrase elaboration ran.')

    return keyword_string, response_two, response


demo = gr.Interface(
    fn=generate,
    inputs=[gr.Textbox(label="Title"), gr.Textbox(label="Abstract")],
    outputs=[
        gr.Textbox(label="Keyphrases"),
        gr.Textbox(label="Keyphrase Elaboration"),
        gr.Textbox(label="Plain Language Summary"),
    ],
).launch(share=True)

print('after launch')  # runs once the blocking launch() call returns