Spaces:
Runtime error
Runtime error
File size: 7,753 Bytes
b0bfa1a 1d0af05 b0bfa1a 1d0af05 b0bfa1a 1d0af05 b0bfa1a 1d0af05 b0bfa1a 1d0af05 b0bfa1a 1d0af05 b0bfa1a 1d0af05 b0bfa1a f11fe45 b0bfa1a 1d0af05 b0bfa1a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 |
import os
import streamlit as st
from PIL import Image
from llama_index import (
Document,
GPTVectorStoreIndex,
GPTListIndex,
LLMPredictor,
ServiceContext,
SimpleDirectoryReader,
PromptHelper,
StorageContext,
load_index_from_storage,
download_loader,
)
from llama_index.readers.file.base import DEFAULT_FILE_READER_CLS
from constants import DEFAULT_TERM_STR, DEFAULT_TERMS, REFINE_TEMPLATE, TEXT_QA_TEMPLATE
from utils import get_llm
# Seed the term dictionary shown in the "All Terms" tab so the app has
# content (the NYC Wikipedia terms) before the user extracts anything.
if "all_terms" not in st.session_state:
    st.session_state["all_terms"] = DEFAULT_TERMS
@st.cache_resource
def get_file_extractor():
    """Build the file-extension -> reader mapping used by SimpleDirectoryReader.

    Extends the llama_index defaults with an OCR-capable ImageReader for
    .jpg/.jpeg/.png files. Cached by Streamlit so the loader download only
    happens once per process.

    Returns:
        dict: mapping of file extension (e.g. ".png") to a reader instance/class.
    """
    ImageReader = download_loader("ImageReader")
    image_loader = ImageReader(text_type="plain_text")
    # Copy the library's default mapping instead of mutating it in place:
    # the original code updated DEFAULT_FILE_READER_CLS itself, silently
    # changing the shared module-level default for every other user of it.
    file_extractor = dict(DEFAULT_FILE_READER_CLS)
    file_extractor.update(
        {
            ".jpg": image_loader,
            ".png": image_loader,
            ".jpeg": image_loader,
        }
    )
    return file_extractor
file_extractor = get_file_extractor()
def extract_terms(documents, term_extract_str, llm_name, model_temperature, api_key):
    """Query *documents* with the term-extraction prompt and parse the answer.

    Builds a throwaway list index over the documents, runs the extraction
    query with tree-summarize, then scrapes "Term: ... Definition: ..." lines
    out of the raw LLM response.

    Returns:
        dict: extracted term -> definition (empty if nothing parsed).
    """
    llm = get_llm(llm_name, model_temperature, api_key, max_tokens=1024)
    context = ServiceContext.from_defaults(
        llm_predictor=LLMPredictor(llm=llm),
        prompt_helper=PromptHelper(
            max_input_size=4096, max_chunk_overlap=20, num_output=1024
        ),
        chunk_size_limit=1024,
    )
    temp_index = GPTListIndex.from_documents(documents, service_context=context)
    engine = temp_index.as_query_engine(response_mode="tree_summarize")
    raw_response = str(engine.query(term_extract_str))

    # Parse line-by-line; keep only lines that carry both markers.
    extracted = {}
    for line in raw_response.split("\n"):
        if not line or "Term:" not in line or "Definition:" not in line:
            continue
        before_definition = line.split("Definition:")[0]
        term = before_definition.split("Term:")[-1].strip()
        definition = line.split("Definition:")[-1].strip()
        extracted[term] = definition
    return extracted
def insert_terms(terms_to_definition):
    """Insert each term/definition pair into the session's llama index.

    Each pair becomes its own small Document so it can be retrieved
    independently at query time.
    """
    index = st.session_state["llama_index"]
    for term, definition in terms_to_definition.items():
        index.insert(Document(f"Term: {term}\nDefinition: {definition}"))
@st.cache_resource
def initialize_index(llm_name, model_temperature, api_key):
    """Load the pre-built index persisted under ./initial_index.

    The original docstring claimed this creates a GPTSQLStructStoreIndex;
    it actually restores a previously persisted index (pre-loaded with the
    NYC Wikipedia terms) via load_index_from_storage, wiring in a service
    context built from the user's LLM choices. Cached per argument tuple
    by Streamlit.

    Returns:
        The loaded llama_index index object.
    """
    llm = get_llm(llm_name, model_temperature, api_key)
    service_context = ServiceContext.from_defaults(llm_predictor=LLMPredictor(llm=llm))
    index = load_index_from_storage(
        StorageContext.from_defaults(persist_dir="./initial_index"),
        service_context=service_context,
    )
    return index
# --- Page chrome: title, intro blurb, and the four top-level tabs. ---
st.title("π¦ Llama Index Term Extractor π¦")
st.markdown(
    (
        "This demo allows you to upload your own documents (either a screenshot/image or the actual text) and extract terms and definitions, building a knowledge base!\n\n"
        "Powered by [Llama Index](https://gpt-index.readthedocs.io/en/latest/index.html) and OpenAI, you can augment the existing knowledge of an "
        "LLM using your own notes, documents, and images. Then, when you ask about a term or definition, it will use your data first! "
        "The app is currently pre-loaded with terms from the NYC Wikipedia page."
    )
)

setup_tab, terms_tab, upload_tab, query_tab = st.tabs(
    ["Setup", "All Terms", "Upload/Extract Terms", "Query Terms"]
)
with setup_tab:
    # Collect LLM configuration; these widget values are read by the other
    # tabs when the user initializes the index or extracts terms.
    st.subheader("LLM Setup")
    api_key = st.text_input("Enter your OpenAI API key here", type="password")
    llm_name = st.selectbox(
        "Which LLM?", ["text-davinci-003", "gpt-3.5-turbo", "gpt-4"]
    )
    model_temperature = st.slider(
        "LLM Temperature", min_value=0.0, max_value=1.0, step=0.1
    )
    # The extraction prompt is editable so users can tune the output format.
    term_extract_str = st.text_area(
        "The query to extract terms and definitions with.", value=DEFAULT_TERM_STR
    )
with terms_tab:
    # Read-only view of every term accumulated so far (defaults + inserted).
    st.subheader("Current Extracted Terms and Definitions")
    st.json(st.session_state["all_terms"])
with upload_tab:
    st.subheader("Extract and Query Definitions")
    # (Re)build the index and reset the displayed terms to the defaults.
    if st.button("Initialize Index and Reset Terms", key="init_index_1"):
        st.session_state["llama_index"] = initialize_index(
            llm_name, model_temperature, api_key
        )
        st.session_state["all_terms"] = DEFAULT_TERMS

    # The extraction UI only appears once an index exists in the session.
    if "llama_index" in st.session_state:
        st.markdown(
            "Either upload an image/screenshot of a document, or enter the text manually."
        )
        uploaded_file = st.file_uploader(
            "Upload an image/screenshot of a document:", type=["png", "jpg", "jpeg"]
        )
        document_text = st.text_area("Or enter raw text")
        if st.button("Extract Terms and Definitions") and (
            uploaded_file or document_text
        ):
            # Stage extraction results in "terms"; the user confirms
            # insertion into the index with a separate button below.
            st.session_state["terms"] = {}
            terms_docs = {}
            with st.spinner("Extracting (images may be slow)..."):
                if document_text:
                    terms_docs.update(
                        extract_terms(
                            [Document(document_text)],
                            term_extract_str,
                            llm_name,
                            model_temperature,
                            api_key,
                        )
                    )
                if uploaded_file:
                    # Round-trip the upload through a temp file so
                    # SimpleDirectoryReader + the ImageReader can OCR it.
                    # NOTE(review): the fixed "temp.png" path is race-prone
                    # if two sessions extract at once — confirm acceptable.
                    Image.open(uploaded_file).convert("RGB").save("temp.png")
                    img_reader = SimpleDirectoryReader(
                        input_files=["temp.png"], file_extractor=file_extractor
                    )
                    img_docs = img_reader.load_data()
                    os.remove("temp.png")
                    terms_docs.update(
                        extract_terms(
                            img_docs,
                            term_extract_str,
                            llm_name,
                            model_temperature,
                            api_key,
                        )
                    )
            st.session_state["terms"].update(terms_docs)

    # Show staged terms and let the user commit them to the index.
    if "terms" in st.session_state and st.session_state["terms"]:
        st.markdown("Extracted terms")
        st.json(st.session_state["terms"])
        if st.button("Insert terms?"):
            with st.spinner("Inserting terms"):
                insert_terms(st.session_state["terms"])
            # Merge into the "All Terms" view, clear the staging area, and
            # rerun so the UI reflects the new state immediately.
            st.session_state["all_terms"].update(st.session_state["terms"])
            st.session_state["terms"] = {}
            st.experimental_rerun()
with query_tab:
    st.subheader("Query for Terms/Definitions!")
    st.markdown(
        (
            # Fixed user-facing typos: possessive "it's" -> "its" (twice).
            "The LLM will attempt to answer your query, and augment its answers using the terms/definitions you've inserted. "
            "If a term is not in the index, it will answer using its internal knowledge."
        )
    )
    # Same init button as the upload tab, with a distinct widget key.
    if st.button("Initialize Index and Reset Terms", key="init_index_2"):
        st.session_state["llama_index"] = initialize_index(
            llm_name, model_temperature, api_key
        )
        st.session_state["all_terms"] = DEFAULT_TERMS

    # Querying requires an initialized index in the session.
    if "llama_index" in st.session_state:
        query_text = st.text_input("Ask about a term or definition:")
        if query_text:
            with st.spinner("Generating answer..."):
                # Retrieve the top-5 matches and answer with the custom
                # QA/refine templates so inserted terms take priority.
                response = (
                    st.session_state["llama_index"]
                    .as_query_engine(
                        similarity_top_k=5,
                        response_mode="compact",
                        text_qa_template=TEXT_QA_TEMPLATE,
                        refine_template=REFINE_TEMPLATE,
                    )
                    .query(query_text)
                )
            st.markdown(str(response))
|