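"""Query a persisted llama-index store of openworm.ai documents,
using either OpenAI's GPT-4o or a local Ollama model as the LLM."""
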
from llama_index.core.vector_stores import SimpleVectorStore
from llama_index.core.storage.storage_context import StorageContext
from llama_index.core.storage.index_store import SimpleIndexStore
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.core import load_index_from_storage
from llama_index.core import PromptTemplate
from llama_index.core import get_response_synthesizer
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding

STORE_DIR = "openworm.ai_store"
SOURCE_DOCUMENT = "source document"  # metadata key holding a node's source file

LLM_GPT4o = "GPT4o"


def print_(text):
    """Print a message (kept as a wrapper so output handling stays in one place)."""
    print(text)


def load_index(model):
    """Reload a previously persisted llama-index index for the given model."""
    # Strip the "Ollama:" prefix to get the local model name; None means GPT-4o
    OLLAMA_MODEL = model.replace("Ollama:", "") if model != LLM_GPT4o else None

    print_("Creating a storage context for %s" % model)

    # Each Ollama model's store lives in its own subfolder (":" is not path-safe)
    STORE_SUBFOLDER = (
        "" if OLLAMA_MODEL is None else "/%s" % OLLAMA_MODEL.replace(":", "_")
    )

    storage_context = StorageContext.from_defaults(
        docstore=SimpleDocumentStore.from_persist_dir(
            persist_dir=STORE_DIR + STORE_SUBFOLDER
        ),
        vector_store=SimpleVectorStore.from_persist_dir(
            persist_dir=STORE_DIR + STORE_SUBFOLDER
        ),
        index_store=SimpleIndexStore.from_persist_dir(
            persist_dir=STORE_DIR + STORE_SUBFOLDER
        ),
    )
    print_("Reloading index for %s" % model)

    index_reloaded = load_index_from_storage(storage_context)

    return index_reloaded
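
# Note (assumption): load_index() expects an index previously built and
# persisted under STORE_DIR. A minimal sketch of how such a store could be
# created ("processed/markdown" is a hypothetical document folder):
#
#   from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
#   docs = SimpleDirectoryReader("processed/markdown").load_data()
#   index = VectorStoreIndex.from_documents(docs)
#   index.storage_context.persist(persist_dir=STORE_DIR)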


def get_query_engine(index_reloaded, model, similarity_top_k=4):
    """Build a query engine over the reloaded index for the given model."""
    OLLAMA_MODEL = model.replace("Ollama:", "") if model != LLM_GPT4o else None

    print_("Creating query engine for %s" % model)

    # Based on: https://docs.llamaindex.ai/en/stable/examples/customization/prompts/completion_prompts/

    text_qa_template_str = (
        "Context information is"
        " below.\n---------------------\n{context_str}\n---------------------\nUsing"
        " both the context information and also using your own knowledge, answer"
        " the question: {query_str}\nIf the context isn't helpful, you can also"
        " answer the question on your own.\n"
    )
    text_qa_template = PromptTemplate(text_qa_template_str)

    refine_template_str = (
        "The original question is as follows: {query_str}\nWe have provided an"
        " existing answer: {existing_answer}\nWe have the opportunity to refine"
        " the existing answer (only if needed) with some more context"
        " below.\n------------\n{context_msg}\n------------\nUsing both the new"
        " context and your own knowledge, update or repeat the existing answer.\n"
    )
    refine_template = PromptTemplate(refine_template_str)

    # create a query engine for the index
    if OLLAMA_MODEL is not None:
        llm = Ollama(model=OLLAMA_MODEL)

        ollama_embedding = OllamaEmbedding(
            model_name=OLLAMA_MODEL,
        )

        query_engine = index_reloaded.as_query_engine(
            llm=llm,
            text_qa_template=text_qa_template,
            refine_template=refine_template,
            embed_model=ollama_embedding,
            similarity_top_k=similarity_top_k,
        )

    else:  # use OpenAI...
        # configure retriever
        retriever = VectorIndexRetriever(
            index=index_reloaded,
            similarity_top_k=similarity_top_k,
        )

        # configure response synthesizer
        response_synthesizer = get_response_synthesizer(
            response_mode="refine",
            text_qa_template=text_qa_template,
            refine_template=refine_template,
        )

        query_engine = RetrieverQueryEngine(
            retriever=retriever,
            response_synthesizer=response_synthesizer,
        )

    return query_engine
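
# A sketch of querying with a local Ollama model instead of GPT-4o; the label
# "Ollama:llama3" is a hypothetical example and must name a model that has
# been pulled locally and used to build the matching store subfolder:
#
#   ollama_index = load_index("Ollama:llama3")
#   ollama_engine = get_query_engine(ollama_index, "Ollama:llama3")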


# Load the index and build the query engine once, at module import time
llm_ver = LLM_GPT4o
index_reloaded = load_index(llm_ver)
query_engine = get_query_engine(index_reloaded, llm_ver)


def process_query(query, model=llm_ver):
    """Run a query through the engine and print the response with its sources."""
    response = query_engine.query(query)

    response_text = str(response)

    # Strip any <think>...</think> reasoning trace, to give deepseek and
    # similar models a fighting chance
    if "<think>" in response_text:
        response_text = (
            response_text[: response_text.index("<think>")]
            + response_text[response_text.index("</think>") + len("</think>") :]
        )

    metadata = response.metadata

    # List source documents: keep the top hit, plus any others above the cutoff
    cutoff = 0.2
    files_used = []
    seen = set()
    for sn in response.source_nodes:
        sd = sn.metadata[SOURCE_DOCUMENT]
        if "et_al_" in sd:
            sd = sd.replace("WormAtlas Handbook:", "Paper: ")

        if sd not in seen and (len(files_used) == 0 or sn.score >= cutoff):
            seen.add(sd)
            files_used.append(f"{sd} (score: {sn.score})")

    file_info = ",\n   ".join(files_used)
    print_(f"""
===============================================================================
QUERY: {query}
MODEL: {model}
-------------------------------------------------------------------------------
RESPONSE: {response_text}
SOURCES:
   {file_info}
===============================================================================
""")

    return response_text, metadata


def run_query(query):
    """Answer a query and append the list of source documents used."""
    response_text, metadata = process_query(query, LLM_GPT4o)
    files_used = []

    # Collect the unique source documents cited in the response metadata
    for k in metadata:
        v = metadata[k]
        if SOURCE_DOCUMENT in v:
            sd = v[SOURCE_DOCUMENT]
            if "et_al_" in sd:
                sd = sd.replace("WormAtlas Handbook:", "Paper: ")
            if sd not in files_used:
                files_used.append(sd)

    srcs = "\n- ".join(files_used)
    answer = f"""{response_text}

SOURCES OF ANSWER:
 - {srcs}"""

    return answer


if __name__ == "__main__":
    print("Running queries")

    queries = [
        "What are the main types of neurons and muscles in the C. elegans pharynx?",
        "Tell me about the egg laying apparatus",
    ]

    for query in queries:
        print("-------------------")
        print("Q: %s" % query)
        answer = run_query(query)

        print("A: %s" % answer)