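# Streamlit demo: loads Llama-2-13b-chat through Hugging Face Transformers,
# wraps it in a LangChain LLMChain, and answers yes/no/"not clear" questions
# about a conversation pasted into the UI.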
import os
import json
import textwrap

import streamlit as st

# LLaMA prep
import torch
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain import HuggingFacePipeline
from langchain import PromptTemplate, LLMChain

@st.cache_resource
def load_llm():
    """Load the Llama-2 chat model once and cache it across Streamlit reruns."""
    # Llama-2 weights are gated; the access token is read from the HF_TOKEN
    # environment variable instead of being hard-coded in the source.
    login(token=os.environ["HF_TOKEN"])

    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-13b-chat-hf",
                                              use_auth_token=True)

    model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-13b-chat-hf",
                                                 device_map="auto",
                                                 torch_dtype=torch.float16,
                                                 use_auth_token=True,
                                                 #  load_in_8bit=True,
                                                 #  load_in_4bit=True
                                                 )

    # Wrap the model in a text-generation pipeline; dtype and device placement
    # are inherited from the model loaded above.
    pipe = pipeline("text-generation",
                    model=model,
                    tokenizer=tokenizer,
                    max_new_tokens=512,
                    do_sample=True,
                    top_k=30,
                    num_return_sequences=1,
                    eos_token_id=tokenizer.eos_token_id)

    llm = HuggingFacePipeline(pipeline=pipe, model_kwargs={'temperature': 0})

    return tokenizer, model, pipe, llm

tokenizer, model, pipe, llm = load_llm()

B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
DEFAULT_SYSTEM_PROMPT = """\
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."""



def get_prompt(instruction, new_system_prompt=DEFAULT_SYSTEM_PROMPT):
    SYSTEM_PROMPT = B_SYS + new_system_prompt + E_SYS
    prompt_template = B_INST + SYSTEM_PROMPT + instruction + E_INST
    return prompt_template
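# For reference, get_prompt(instruction) produces a Llama-2 chat prompt of the
# shape:
#   [INST]<<SYS>>
#   ...system prompt...
#   <</SYS>>
#
#   ...instruction...[/INST]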

def cut_off_text(text, prompt):
    cutoff_phrase = prompt
    index = text.find(cutoff_phrase)
    if index != -1:
        return text[:index]
    else:
        return text

def remove_substring(string, substring):
    return string.replace(substring, "")



def generate(text):
    # Direct generation path (not used by the Streamlit UI below, which goes
    # through the LangChain LLMChain in answer()).
    prompt = get_prompt(text)
    with torch.autocast('cuda', dtype=torch.bfloat16):
        inputs = tokenizer(prompt, return_tensors="pt").to('cuda')
        outputs = model.generate(**inputs,
                                 max_new_tokens=512,
                                 eos_token_id=tokenizer.eos_token_id,
                                 pad_token_id=tokenizer.eos_token_id,
                                 )
        final_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
        final_outputs = cut_off_text(final_outputs, '</s>')
        final_outputs = remove_substring(final_outputs, prompt)

    return final_outputs

def parse_text(text):
    wrapped_text = textwrap.fill(text, width=100)
    print(wrapped_text + '\n\n')
    return wrapped_text



def answer(context, question):
    # Escape literal braces so user-supplied text is not mistaken for
    # PromptTemplate variables; only {text} below is a template variable.
    safe_context = context.replace("{", "{{").replace("}", "}}")
    instruction = (f"conversation: '''{safe_context}'''\n"
                   "Based on the conversation provided in triple quotes, answer the following question.\n"
                   "Question: {text}")

    system_prompt = ("You are an expert who answers questions based on a conversation. "
                     "Analyse the conversation in light of the question, then answer with "
                     "yes, no, or not clear only. You only output one or two words.")

    template = get_prompt(instruction, system_prompt)
    print(template)

    prompt = PromptTemplate(template=template, input_variables=["text"])
    llm_chain = LLMChain(prompt=prompt, llm=llm)
    output = llm_chain.run(question)

    return parse_text(output)
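# Illustrative (hypothetical) call, assuming the model follows the one- or
# two-word instruction:
#   answer("Alice: Can you join at 3pm? Bob: Yes, I can.",
#          "Can Bob join at 3pm?")   # expected to return something like "yes"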




question = st.sidebar.text_input('Question', 'Can she answer?')
context = st.text_area('Context', 'conversation')

if st.sidebar.button('Answer'):
    outputs = answer(context, question)
    st.sidebar.write(f"Answer is {outputs}")
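# To try the app locally (assuming this file is saved as app.py, an HF_TOKEN
# with Llama-2 access is set in the environment, and a GPU with enough memory
# for the 13B model in fp16 is available):
#   streamlit run app.py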