Linsey Passarella (8lp) committed on
Commit
eca4d65
1 Parent(s): be4e89b

adding app

Files changed (1)
  1. app.py +150 -0
app.py ADDED
@@ -0,0 +1,150 @@
+ import gradio as gr
+ import json
+ from nltk.tokenize import sent_tokenize
+ import torch
+ import ujson as json
+ from transformers import AutoModelForCausalLM, LlamaTokenizer
+ from peft import PeftModel
+ from keybert import KeyBERT
+ from keyphrase_vectorizers import KeyphraseCountVectorizer
+ import nltk
+ nltk.download('punkt')
+
+ # Load the Guanaco 7B model - takes around 2-3 minutes - can be done separately
+ model_name = "llama-7b-hf"
+ adapters_name = 'guanaco-7b'
+ # print(f"Starting to load the model {model_name} into memory")
+ m = AutoModelForCausalLM.from_pretrained(
+     model_name,
+     # load_in_4bit=True,
+     torch_dtype=torch.bfloat16,
+     device_map='auto'
+ )
+ m = PeftModel.from_pretrained(m, adapters_name)
+ m = m.merge_and_unload()
+ tok = LlamaTokenizer.from_pretrained(model_name)
+ tok.bos_token_id = 1
+ stop_token_ids = [0]
+ # print(f"Successfully loaded the model {model_name} into memory")
+ print('Guanaco model loaded into memory.')
+
+
+ def generate(title, abstract):
+     print("Started running.")
+     '''
+     Take the Gradio inputs and write them to sample-data.jsonl in a form that classifier.py can read.
+     '''
+     newline = {}
+     text = abstract
+     # Strip a leading "abstract" or "abstract." (any case) from the abstract text
+     if text.lower()[0:9] == "abstract.":
+         text = text[9:]
+     elif text.lower()[0:8] == "abstract":
+         text = text[8:]
+     sentences = sent_tokenize(text)
+     newline["target"] = sentences
+     newline["title"] = title
+     first_file = open("data/sample-data.jsonl", "w")
+     first_file.write(json.dumps(newline))
+     first_file.close()
+     print(newline)
+     print("Tokenized abstract to sentences.")
+     '''
+     Main part
+     '''
+     '''
+     This is for summarization
+     '''
+     tooShortForKeyword = False
+     with open("data/sample-data.jsonl", "r") as f:
+         obj = [json.loads(l) for l in f]
+     doc = ""
+     if len(obj[0]["target"]) > 1:
+         doc += obj[0]["title"] + ". " + obj[0]["target"][0] + " " + obj[0]["target"][1]
+     elif len(obj[0]["target"]) == 1:
+         tooShortForKeyword = True
+         doc += obj[0]["title"] + ". " + obj[0]["target"][0]
+     else:
+         tooShortForKeyword = True
+         doc += obj[0]["title"]
+     text = doc
+     prompt = """
+     Can you explain the main idea of what is being studied in the following paragraph for someone who is not familiar with the topic. Comment on areas of application.:
+     """
+     formatted_prompt = (
+         f"A chat between a curious human and an artificial intelligence assistant. "
+         f"The assistant gives helpful, detailed, and polite answers to the user's questions.\n"
+         f"### Human: {prompt + doc} \n"
+         f"### Assistant:"
+     )
+     inputs = tok(formatted_prompt, return_tensors="pt").to("cuda:1")
+     outputs = m.generate(inputs=inputs.input_ids, max_new_tokens=300)
+     output = tok.decode(outputs[0], skip_special_tokens=True)
+     index_response = output.find("### Assistant: ") + 15
+     if output[index_response:index_response + 10] == "Certainly!":
+         index_response += 10
+     end_response = output.rfind('.') + 1
+     response = output[index_response:end_response]
+     with open("data/guanacoSummaryOutput.txt", "w") as f2:
+         f2.write(response)
+     print('Plain Language Summary Created.')
+
+     '''
+     Keyphrase extraction.
+     '''
+     # The document is the title plus the first two sentences of the abstract.
+
+     with open("data/sample-data.jsonl", "r") as f:
+         obj = [json.loads(l) for l in f]
+     doc = ""
+     if len(obj[0]["target"]) > 1:
+         doc += obj[0]["title"] + ". " + obj[0]["target"][0] + " " + obj[0]["target"][1]
+     kw_model = KeyBERT(model="all-MiniLM-L6-v2")
+     vectorizer = KeyphraseCountVectorizer()
+     top_n = 2
+     keywords = kw_model.extract_keywords(doc, stop_words="english", top_n=top_n, vectorizer=vectorizer, use_mmr=True)
+     my_keywords = []
+     for i in range(top_n):  # keep only keyphrases that are not contained in another extracted keyphrase
+         add = True
+         for j in range(top_n):
+             if i != j:
+                 if keywords[i][0] in keywords[j][0]:
+                     add = False
+         if add:
+             my_keywords.append(keywords[i][0])
+     for entry in my_keywords:
+         print(entry)
+     '''
+     This is for feeding the keyphrases into Guanaco.
+     '''
+     responseTwo = ""
+     keyword_string = ""
+     if not tooShortForKeyword:
+         separator = ', '
+         keyword_string = separator.join(my_keywords)
+         prompt = "What is the purpose of studying " + keyword_string + "? Comment on areas of application."
+
+         formatted_prompt = (
+             f"A chat between a curious human and an artificial intelligence assistant. "
+             f"The assistant gives helpful, detailed, and polite answers to the user's questions.\n"
+             f"### Human: {prompt} \n"
+             f"### Assistant:"
+         )
+         inputs = tok(formatted_prompt, return_tensors="pt").to("cuda:2")
+         outputs = m.generate(inputs=inputs.input_ids, max_new_tokens=300)
+         output = tok.decode(outputs[0], skip_special_tokens=True)
+         index_response = output.find("### Assistant: ") + 15
+         end_response = output.rfind('.') + 1
+         responseTwo = output[index_response:end_response]
+         with open("data/guanacoElaborationOutput.txt", "w") as f2:
+             f2.write(responseTwo)
+     print('Keyphrase elaboration ran.')
+     return keyword_string, responseTwo, response
+
+ demo = gr.Interface(
+     fn=generate,
+     inputs=[gr.Textbox(label="Title"), gr.Textbox(label="Abstract")],
+     outputs=[gr.Textbox(label="Keyphrases"), gr.Textbox(label="Keyphrase Elaboration"), gr.Textbox(label="Plain Language Summary")],
+ ).launch(share=True)
+
+ print('after launch')  # now executes