Spaces:

Linseypass
/

PLTS

Runtime error

App Files Files Community

Linsey Passarella (8lp) commited on Aug 17, 2023

Commit

eca4d65

1 Parent(s): be4e89b

adding app

Browse files

Files changed (1) hide show

app.py +150 -0

app.py ADDED Viewed

	@@ -0,0 +1,150 @@

+import gradio as gr
+import json
+from nltk.tokenize import sent_tokenize
+import torch
+import ujson as json
+from transformers import AutoModelForCausalLM,LlamaTokenizer
+from peft import PeftModel
+from keybert import KeyBERT
+from keyphrase_vectorizers import KeyphraseCountVectorizer
+import nltk
+nltk.download('punkt')
+# loads Guanaco 7B model - takes around 2-3 minutes - can do this separately
+model_name = "llama-7b-hf"
+adapters_name = 'guanaco-7b'
+# print(f"Starting to load the model {model_name} into memory")
+m = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    #load_in_4bit=True,
+    torch_dtype=torch.bfloat16,
+    device_map='auto'
+)
+m = PeftModel.from_pretrained(m, adapters_name)
+m = m.merge_and_unload()
+tok = LlamaTokenizer.from_pretrained(model_name)
+tok.bos_token_id = 1
+stop_token_ids = [0]
+# print(f"Successfully loaded the model {model_name} into memory")
+print('Guanaco model loaded into memory.')
+def generate(title, abstract):
+    print("Started running.")
+    '''
+    Take gradio input and output data to sample-data.jsonl in readable form for classifier.py to run.
+    '''
+    newline = {}
+    text = abstract
+    # eliminate word lowercase "abstract" or "abstract." at beginning of abstract text
+    if text.lower()[0:9] == "abstract.":
+        text = text[9:]
+    elif text.lower()[0:8] == "abstract":
+        text = text[8:]
+    sentences = sent_tokenize(text)
+    newline["target"] = sentences
+    newline["title"] = title
+    first_file = open("data/sample-data.jsonl", "w")
+    first_file.write(json.dumps(newline))
+    first_file.close()
+    print(newline)
+    print("Tokenized abstract to sentences.")
+    '''
+    Main part
+    '''
+    '''
+    This is for summarization
+    '''
+    tooShortForKeyword = False
+    with open("data/sample-data.jsonl", "r") as f:
+        obj = [json.loads(l) for l in f]
+        doc = ""
+        if len(obj[0]["target"]) > 1:
+            doc += obj[0]["title"] + ". " + obj[0]["target"][0] + " "  + obj[0]["target"][1]
+        elif len(obj[0]["target"]) == 1:
+            tooShortForKeyword = True
+            doc += obj[0]["title"] + ". " + obj[0]["target"][0]
+        else:
+            tooShortForKeyword = True
+            doc += obj[0]["title"]
+    text = doc
+    prompt = """
+    Can you explain the main idea of what is being studied in the following paragraph for someone who is not familiar with the topic. Comment on areas of application.:
+    """
+    formatted_prompt = (
+        f"A chat between a curious human and an artificial intelligence assistant."
+        f"The assistant gives helpful, detailed, and polite answers to the user's questions.\n"
+        f"### Human: {prompt + doc} \n"
+        f"### Assistant:"
+    )
+    inputs = tok(formatted_prompt, return_tensors="pt").to("cuda:1")
+    outputs = m.generate(inputs=inputs.input_ids, max_new_tokens=300)
+    output = tok.decode(outputs[0], skip_special_tokens=True)
+    index_response = output.find("### Assistant: ") + 15
+    if (output[index_response:index_response + 10] == "Certainly!"):
+        index_response += 10
+    end_response = output.rfind('.') + 1
+    response = output[index_response:end_response]
+    with open("data/guanacoSummaryOutput.txt", "w") as f2:
+        f2.write(response)
+    print('Plain Language Summary Created.')
+    '''
+    Keyphrase extraction.
+    '''
+    # the document is the title and first two sentences of the abstract.
+    with open("data/sample-data.jsonl", "r") as f:
+        obj = [json.loads(l) for l in f]
+        doc = ""
+        if len(obj[0]["target"]) > 1:
+            doc += obj[0]["title"] + ". " + obj[0]["target"][0] + " "  + obj[0]["target"][1]
+            kw_model = KeyBERT(model="all-MiniLM-L6-v2")
+            vectorizer = KeyphraseCountVectorizer()
+            top_n = 2
+            keywords = kw_model.extract_keywords(doc, stop_words="english", top_n = top_n, vectorizer=vectorizer, use_mmr=True)
+            my_keywords = []
+            for i in range(top_n):
+                add = True
+                for j in range(top_n):
+                    if i != j:
+                        if keywords[i][0] in keywords[j][0]:
+                            add = False
+                if add:
+                    my_keywords.append(keywords[i][0])
+            for entry in my_keywords:
+                print(entry)
+    '''
+    This is for feeding the keyphrases into Guanaco.
+    '''
+    responseTwo = ""
+    keyword_string = ""
+    if not tooShortForKeyword:
+        separator = ', '
+        keyword_string = separator.join(my_keywords)
+        prompt = "What is the purpose of studying " + keyword_string + "? Comment on areas of application."
+        formatted_prompt = (
+            f"A chat between a curious human and an artificial intelligence assistant."
+            f"The assistant gives helpful, detailed, and polite answers to the user's questions.\n"
+            f"### Human: {prompt} \n"
+            f"### Assistant:"
+        )
+        inputs = tok(formatted_prompt, return_tensors="pt").to("cuda:2")
+        outputs = m.generate(inputs=inputs.input_ids, max_new_tokens=300)
+        output = tok.decode(outputs[0], skip_special_tokens=True)
+        index_response = output.find("### Assistant: ") + 15
+        end_response = output.rfind('.') + 1
+        responseTwo = output[index_response:end_response]
+        with open("data/guanacoElaborationOutput.txt", "w") as f2:
+            f2.write(responseTwo)
+    print('Keyphrase elaboration ran.')
+    return keyword_string, responseTwo, response
+demo = gr.Interface(
+    fn=generate,
+    inputs=[gr.Textbox(label="Title"), gr.Textbox(label="Abstract")],
+    outputs=[gr.Textbox(label="Keyphrases"), gr.Textbox(label="Keyphrase Elaboration"), gr.Textbox(label="Plain Language Summary")],
+).launch(share = True)
+print('after launch') # now executes