import time

import streamlit as st
import torch
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.vectorstores import Chroma
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    GenerationConfig,
    LlamaForCausalLM,
    LlamaTokenizer,
    pipeline,
)

from constants import EMBEDDING_MODEL_NAME, db_all, db_frankl, db_inst


@st.cache_resource(show_spinner=False)
def load_model(device_type, model_id):
    """Load the local LLM and wrap it in a LangChain HuggingFacePipeline.

    Cached with st.cache_resource so the model is loaded once per session
    instead of on every Streamlit rerun.
    """
    if device_type.lower() == "cuda":
        # On GPU: half-precision causal LM, sharded across available devices.
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            device_map="auto",
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True,
            trust_remote_code=True,
        )
        model.tie_weights()
    else:
        # On CPU: fall back to the Llama-specific classes in full precision.
        tokenizer = LlamaTokenizer.from_pretrained(model_id)
        model = LlamaForCausalLM.from_pretrained(model_id)

    generation_config = GenerationConfig.from_pretrained(model_id)

    # Low temperature and a repetition penalty keep answers focused.
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_length=2048,
        temperature=0.1,
        top_p=0.95,
        repetition_penalty=1.5,
        generation_config=generation_config,
    )

    return HuggingFacePipeline(pipeline=pipe)


@st.cache_resource(show_spinner=False)
def get_embeddings():
    """Load the instructor embedding model once per session.

    Cached as a resource (rather than data) so the model object is kept
    in memory and never pickled.
    """
    device_type = "cuda" if torch.cuda.is_available() else "cpu"

    return HuggingFaceInstructEmbeddings(
        model_name=EMBEDDING_MODEL_NAME,
        model_kwargs={"device": device_type},
    )


def get_llm():
    """Return the cached local LLM, using CUDA when it is available."""
    device_type = "cuda" if torch.cuda.is_available() else "cpu"
    model_id = "psmathur/orca_mini_3b"

    return load_model(device_type, model_id=model_id)


def load_qa(db_option):
    """Build a RetrievalQA chain over the Chroma collection selected in the UI.

    Any unrecognised option falls back to the journal collection (db_inst).
    """
    db_instance = db_inst

    match db_option:
        case "All":
            db_instance = db_all
            print("loading db_all")
        case "Frankl's Works":
            db_instance = db_frankl
            print("loading db_frankl")
        case "Journal of Search for Meaning":
            db_instance = db_inst
            print("loading db_inst")

    load_start = time.time()

    embeddings = get_embeddings()

    # Open the persisted Chroma store with the same embedding function that
    # was used to build it.
    db = Chroma(
        persist_directory=db_instance.get_directory(),
        embedding_function=embeddings,
        client_settings=db_instance.get_chroma_settings(),
    )

    retriever = db.as_retriever()

    llm = get_llm()

    # "stuff" concatenates the retrieved documents directly into the prompt.
    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True,
    )

    load_end = time.time()
    print(f"\n> Completed Initial Load (took {round(load_end - load_start, 2)} s.)")

    return qa
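

# Minimal usage sketch (assumption: this module is imported from a Streamlit
# page; the widget labels below are illustrative and not defined in this file).
#
#   option = st.selectbox(
#       "Collection",
#       ["All", "Frankl's Works", "Journal of Search for Meaning"],
#   )
#   qa = load_qa(option)
#   query = st.text_input("Ask a question")
#   if query:
#       result = qa(query)
#       st.write(result["result"])
#       for doc in result["source_documents"]:
#           st.caption(doc.metadata.get("source", ""))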