import gradio as gr
from nltk.tokenize import sent_tokenize
import torch
from transformers import AutoModelForCausalLM, LlamaTokenizer
from peft import PeftModel
from keybert import KeyBERT
from keyphrase_vectorizers import KeyphraseCountVectorizer
import nltk

nltk.download('punkt')
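
# This Space takes a paper title and abstract and returns three outputs:
# the extracted keyphrases, a Guanaco-generated elaboration on those
# keyphrases, and a plain-language summary of the abstract.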
# Load the Guanaco 7B model (LLaMA 7B base weights plus the Guanaco LoRA
# adapters, merged into a single model). This takes around 2-3 minutes.
model_name = "decapoda-research/llama-7b-hf"
adapters_name = 'timdettmers/guanaco-7b'
m = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16)
m = PeftModel.from_pretrained(m, adapters_name)
m = m.merge_and_unload()
tok = LlamaTokenizer.from_pretrained(model_name)
tok.bos_token_id = 1  # fix the bos token id, which this tokenizer's config sets incorrectly
stop_token_ids = [0]
print('Guanaco model loaded into memory.')
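

# The model and tokenizer are loaded once at module scope, so every call to
# generate() below reuses them instead of reloading the weights.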
def generate(title, abstract):
    '''
    Take the Gradio inputs (title and abstract) and return the extracted
    keyphrases, a keyphrase elaboration, and a plain-language summary.
    '''
    print("Started running.")
    newline = {}
    text = abstract
    # Strip a leading "abstract" / "abstract." label from the abstract text.
    if text.lower().startswith("abstract."):
        text = text[9:]
    elif text.lower().startswith("abstract"):
        text = text[8:]
    sentences = sent_tokenize(text)
    newline["target"] = sentences
    newline["title"] = title
    print("Tokenized abstract to sentences.")

    # Build the document for summarization: the title plus the first two
    # sentences of the abstract (fewer if the abstract is shorter).
    tooShortForKeyword = False
    obj = newline
    doc = ""
    if len(obj["target"]) > 1:
        doc += obj["title"] + ". " + obj["target"][0] + " " + obj["target"][1]
    elif len(obj["target"]) == 1:
        tooShortForKeyword = True
        doc += obj["title"] + ". " + obj["target"][0]
    else:
        tooShortForKeyword = True
        doc += obj["title"]
prompt = """
Can you explain the main idea of what is being studied in the following paragraph for someone who is not familiar with the topic. Comment on areas of application.:
"""
formatted_prompt = (
f"A chat between a curious human and an artificial intelligence assistant."
f"The assistant gives helpful, detailed, and polite answers to the user's questions.\n"
f"### Human: {prompt + doc} \n"
f"### Assistant:"
)
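    # Guanaco's chat template wraps the request in "### Human: ... ### Assistant:";
    # the decoded output echoes the whole prompt, so the reply is recovered
    # below by searching for the "### Assistant: " marker.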
    inputs = tok(formatted_prompt, return_tensors="pt")  # .to("cuda:1")
    outputs = m.generate(inputs=inputs.input_ids, max_new_tokens=300)
    output = tok.decode(outputs[0], skip_special_tokens=True)
    # Keep only the assistant's reply, trimmed to the last complete sentence,
    # and skip a leading "Certainly!" if the model produced one.
    index_response = output.find("### Assistant: ") + len("### Assistant: ")
    if output[index_response:index_response + 10] == "Certainly!":
        index_response += 10
    end_response = output.rfind('.') + 1
    response = output[index_response:end_response]
    print('Plain Language Summary Created.')

    # Keyphrase extraction over the same document: the title and the first
    # two sentences of the abstract. Skipped when the abstract is too short,
    # since doc would otherwise be empty and extraction would fail.
    my_keywords = []
    top_n = 2
    if not tooShortForKeyword:
        obj = newline
        doc = obj["title"] + ". " + obj["target"][0] + " " + obj["target"][1]
        kw_model = KeyBERT(model="all-MiniLM-L6-v2")
        vectorizer = KeyphraseCountVectorizer()
        # use_mmr=True applies Maximal Marginal Relevance so the two
        # keyphrases are not near-duplicates of each other.
        keywords = kw_model.extract_keywords(doc, stop_words="english", top_n=top_n, vectorizer=vectorizer, use_mmr=True)
        # Drop any keyphrase that is a substring of another, keeping only
        # the longer phrase.
        for i in range(len(keywords)):
            add = True
            for j in range(len(keywords)):
                if i != j and keywords[i][0] in keywords[j][0]:
                    add = False
            if add:
                my_keywords.append(keywords[i][0])
        for entry in my_keywords:
            print(entry)

    # Feed the extracted keyphrases back into Guanaco and ask it to explain
    # why they are studied.
    responseTwo = ""
    keyword_string = ""
    if not tooShortForKeyword:
        separator = ', '
        keyword_string = separator.join(my_keywords)
        prompt = "What is the purpose of studying " + keyword_string + "? Comment on areas of application."
        formatted_prompt = (
            f"A chat between a curious human and an artificial intelligence assistant. "
            f"The assistant gives helpful, detailed, and polite answers to the user's questions.\n"
            f"### Human: {prompt} \n"
            f"### Assistant:"
        )
        inputs = tok(formatted_prompt, return_tensors="pt")  # .to("cuda:2")
        outputs = m.generate(inputs=inputs.input_ids, max_new_tokens=300)
        output = tok.decode(outputs[0], skip_special_tokens=True)
        index_response = output.find("### Assistant: ") + len("### Assistant: ")
        end_response = output.rfind('.') + 1
        responseTwo = output[index_response:end_response]
        print('Keyphrase elaboration ran.')
    return keyword_string, responseTwo, response
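

# Wire the pipeline into a Gradio UI: two text inputs (title, abstract) and
# three text outputs (keyphrases, elaboration, summary).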
demo = gr.Interface(
    fn=generate,
    inputs=[gr.Textbox(label="Title"), gr.Textbox(label="Abstract")],
    outputs=[gr.Textbox(label="Keyphrases"), gr.Textbox(label="Keyphrase Elaboration"), gr.Textbox(label="Plain Language Summary")],
)
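# launch() starts the Gradio server; on Hugging Face Spaces this runs
# automatically when the app script is executed.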
demo.launch()