import gradio as gr
import nltk
import torch
from keybert import KeyBERT
from keyphrase_vectorizers import KeyphraseCountVectorizer
from nltk.tokenize import sent_tokenize
from peft import PeftModel
from transformers import AutoModelForCausalLM, LlamaTokenizer

nltk.download('punkt')  # sentence-tokenizer models; newer NLTK releases may also need 'punkt_tab'

# Load the Guanaco 7B model (LLaMA 7B base + LoRA adapter); this takes around
# 2-3 minutes and can be done ahead of time in a separate process.
model_name = "decapoda-research/llama-7b-hf"
adapters_name = "timdettmers/guanaco-7b"
m = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16)

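# Apply the Guanaco LoRA adapter and merge it into the base weights so
# generation runs against a single plain LLaMA model.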
m = PeftModel.from_pretrained(m, adapters_name)
m = m.merge_and_unload()
tok = LlamaTokenizer.from_pretrained(model_name)
tok.bos_token_id = 1  # the decapoda checkpoint's tokenizer config has a wrong BOS id; force the LLaMA default
print('Guanaco model loaded into memory.')


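# generate() runs three stages per request: summarize the abstract opening,
# extract keyphrases with KeyBERT, then ask Guanaco to elaborate on them.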
def generate(title, abstract):
    '''
    Take the title and abstract from the Gradio inputs and return the extracted
    keyphrases, a keyphrase elaboration, and a plain-language summary.
    '''
    print("Started running.")
    newline = {}
    text = abstract
    # Strip a leading "abstract" or "abstract." label from the abstract text.
    if text.lower().startswith("abstract."):
        text = text[9:]
    elif text.lower().startswith("abstract"):
        text = text[8:]
    sentences = sent_tokenize(text)
    newline["target"] = sentences
    newline["title"] = title
    print("Tokenized abstract to sentences.")
    '''
    Summarization: build a short document from the title and the first two
    sentences of the abstract, then ask Guanaco for a plain-language summary.
    '''
    too_short_for_keyword = False
    doc = ""
    if len(newline["target"]) > 1:
        doc = newline["title"] + ". " + newline["target"][0] + " " + newline["target"][1]
    elif len(newline["target"]) == 1:
        too_short_for_keyword = True
        doc = newline["title"] + ". " + newline["target"][0]
    else:
        too_short_for_keyword = True
        doc = newline["title"]
    prompt = (
        "Can you explain the main idea of what is being studied in the following "
        "paragraph for someone who is not familiar with the topic? Comment on areas of application: "
    )
    formatted_prompt = (
        f"A chat between a curious human and an artificial intelligence assistant. "
        f"The assistant gives helpful, detailed, and polite answers to the user's questions.\n"
        f"### Human: {prompt + doc} \n"
        f"### Assistant:"
    )
    inputs = tok(formatted_prompt, return_tensors="pt")  # move inputs (and the model) to a GPU, e.g. .to("cuda"), if one is available
    outputs = m.generate(inputs=inputs.input_ids, max_new_tokens=300)
    output = tok.decode(outputs[0], skip_special_tokens=True)
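    # Slice out the assistant's reply: drop the echoed prompt, a boilerplate
    # "Certainly!" opener, and anything after the last complete sentence.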
    marker = "### Assistant:"
    index_response = output.find(marker) + len(marker)
    response = output[index_response:].lstrip()
    if response.startswith("Certainly!"):
        response = response[len("Certainly!"):].lstrip()
    end_response = response.rfind('.') + 1
    response = response[:end_response]
    print('Plain Language Summary Created.')

    '''
    Keyphrase extraction.
    '''
    # doc is still the title plus the first two sentences of the abstract.
    if len(newline["target"]) > 1:
        kw_model = KeyBERT(model="all-MiniLM-L6-v2")
        vectorizer = KeyphraseCountVectorizer()
        top_n = 2
        keywords = kw_model.extract_keywords(doc, stop_words="english", top_n=top_n, vectorizer=vectorizer, use_mmr=True)
        my_keywords = []
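        # Keep a phrase only if it is not a substring of another extracted phrase,
        # e.g. drop "graphs" when "planar graphs" was also extracted.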
        for i in range(len(keywords)):
            add = True
            for j in range(len(keywords)):
                if i != j and keywords[i][0] in keywords[j][0]:
                    add = False
            if add:
                my_keywords.append(keywords[i][0])
        for entry in my_keywords:
            print(entry)
    '''
    Keyphrase elaboration: feed the extracted keyphrases back into Guanaco and
    ask why they are worth studying.
    '''
    response_two = ""
    keyword_string = ""
    if not too_short_for_keyword:
        keyword_string = ", ".join(my_keywords)
        prompt = "What is the purpose of studying " + keyword_string + "? Comment on areas of application."

        formatted_prompt = (
            f"A chat between a curious human and an artificial intelligence assistant. "
            f"The assistant gives helpful, detailed, and polite answers to the user's questions.\n"
            f"### Human: {prompt} \n"
            f"### Assistant:"
        )
        inputs = tok(formatted_prompt, return_tensors="pt")  # as above, move to a GPU if one is available
        outputs = m.generate(inputs=inputs.input_ids, max_new_tokens=300)
        output = tok.decode(outputs[0], skip_special_tokens=True)
        # Same reply-slicing as for the summary above.
        marker = "### Assistant:"
        index_response = output.find(marker) + len(marker)
        response_two = output[index_response:].lstrip()
        end_response = response_two.rfind('.') + 1
        response_two = response_two[:end_response]
    print('Keyphrase elaboration ran.')
    return keyword_string, response_two, response

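# Wire generate() to a simple Gradio form: two text inputs, three text outputs.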
demo = gr.Interface(
    fn=generate,
    inputs=[gr.Textbox(label="Title"), gr.Textbox(label="Abstract")],
    outputs=[gr.Textbox(label="Keyphrases"), gr.Textbox(label="Keyphrase Elaboration"), gr.Textbox(label="Plain Language Summary")],
)
demo.launch()
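# demo.launch() serves the app locally; launch(share=True) would additionally
# create a temporary public link if the demo needs to be shared.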