# ask_my_thesis/app.py
# TODO: return all pages used to form answer
# TODO: question samples
# TEST: with and without GPU instance
# TODO: visual questions on page image (in same app)?
import torch
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import SimpleDirectoryReader
from llama_index.core import VectorStoreIndex, SummaryIndex
from llama_index.core.prompts import PromptTemplate
from llama_index.core import Settings
from PIL import Image
import gradio as gr
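# Map llama-index chat messages onto the Zephyr chat template
# (<|system|> / <|user|> / <|assistant|> turns, each terminated by </s>).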
def messages_to_prompt(messages):
prompt = ""
for message in messages:
        if message.role == "system":
            # Overwrite any incoming system message with a fixed expert persona.
            m = "You are an expert in the research field of document understanding, bayesian deep learning and neural networks."
            prompt += f"<|system|>\n{m}</s>\n"
elif message.role == "user":
prompt += f"<|user|>\n{message.content}</s>\n"
elif message.role == "assistant":
prompt += f"<|assistant|>\n{message.content}</s>\n"
# ensure we start with a system prompt, insert blank if needed
if not prompt.startswith("<|system|>\n"):
prompt = "<|system|>\n</s>\n" + prompt
# add final assistant prompt
prompt = prompt + "<|assistant|>\n"
return prompt
def load_RAG_pipeline():
    # LLM: 4-bit quantization on GPU, full precision on CPU.
    # On CPU, leave quantization_config as None (an empty dict can trip up transformers).
    quantization_config = None
if torch.cuda.is_available():
from transformers import BitsAndBytesConfig
quantization_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_compute_dtype=torch.float16,
bnb_4bit_quant_type="nf4",
bnb_4bit_use_double_quant=True,
)
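    # Zephyr-7B-alpha as the generator LLM; the query wrapper and messages_to_prompt
    # follow its chat template.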
llm = HuggingFaceLLM(
model_name="HuggingFaceH4/zephyr-7b-alpha",
tokenizer_name="HuggingFaceH4/zephyr-7b-alpha",
query_wrapper_prompt=PromptTemplate("<|system|>\n</s>\n<|user|>\n{query_str}</s>\n<|assistant|>\n"),
context_window=3900,
max_new_tokens=256,
model_kwargs={"quantization_config": quantization_config},
# tokenizer_kwargs={},
generate_kwargs={"temperature": 0.7, "top_k": 50, "top_p": 0.95},
messages_to_prompt=messages_to_prompt,
device_map="auto",
)
    # llama-index global settings: Zephyr as the LLM, bge-small-en-v1.5 for embeddings
Settings.llm = llm
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
# Settings.chunk_size = 512
# Settings.chunk_overlap = 50
# raw data
documents = SimpleDirectoryReader("assets/txts").load_data()
vector_index = VectorStoreIndex.from_documents(documents)
# vector_index.persist(persist_dir="vectors")
# https://docs.llamaindex.ai/en/v0.10.17/understanding/storing/storing.html
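    # Sketch (not called here): persisting would avoid re-embedding the thesis on every
    # startup. Assumes a local "vectors/" directory, per the storing docs linked above:
    #   from llama_index.core import StorageContext, load_index_from_storage
    #   vector_index.storage_context.persist(persist_dir="vectors")
    #   vector_index = load_index_from_storage(StorageContext.from_defaults(persist_dir="vectors"))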
# summary_index = SummaryIndex.from_documents(documents)
query_engine = vector_index.as_query_engine(response_mode="compact", similarity_top_k=3)
return query_engine
query_engine = load_RAG_pipeline()
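# Building the index embeds every thesis page, so this happens once at startup
# rather than per request.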
# Gradio callbacks wrapping the query engine defined above.
def get_answer(question, temperature, nucleus_sampling, max_tokens):
    # NOTE: the generation sliders (temperature, nucleus sampling, max tokens) are not
    # applied to the query yet; the LLM uses the defaults set in load_RAG_pipeline().
    # https://docs.llamaindex.ai/en/stable/module_guides/supporting_modules/settings/#setting-local-configurations
    response = query_engine.query(question)
    return response
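# Sketch (assumption, not wired in): inside get_answer the slider values could be
# applied per request by updating the LLM settings before querying, e.g.:
#   Settings.llm.generate_kwargs.update({"temperature": temperature, "top_p": nucleus_sampling})
#   Settings.llm.max_new_tokens = max_tokens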
def get_answer_page(response):
    # Use the highest-ranked source node to locate the page that supports the answer.
    best_match = response.source_nodes[0].metadata["file_path"]
    # Page files are expected to end in a zero-padded four-digit page number, e.g. "..._0042.txt".
    answer_page = int(best_match[-8:-4])
    # The rendered page image lives in the parallel png tree ("txt" -> "png" in the path).
    image = Image.open(best_match.replace("txt", "png"))
    return image, f"Navigate to page {answer_page}"
# Combine answer generation and page lookup for the Gradio interface.
def ask_my_thesis(question, temperature, nucleus_sampling, max_tokens):
answer = get_answer(question, temperature, nucleus_sampling, max_tokens)
image, answer_page = get_answer_page(answer)
return answer, image, answer_page
# Set up the interface inputs and outputs.
output_image = gr.Image(label="Answer Page")
# examples
iface = gr.Interface(
fn=ask_my_thesis,
inputs=[
gr.Textbox(label="Question", placeholder="Type your question here..."),
gr.Slider(0, 1, value=0.7, label="Temperature"),
gr.Slider(0, 1, value=0.9, label="Nucleus Sampling"),
        gr.Slider(1, 500, value=100, label="Max Number of Generated Tokens"),
],
outputs=[gr.Textbox(label="Answer"), output_image, gr.Label()],
title="Ask my thesis: Intelligent Automation for AI-Driven Document Understanding",
description=r"""Chat with the thesis manuscript: ask questions and receive answers with multimodal references (WIP).
    Spoiler: a RAG application with an LLM and an embedding vector store can be quite slow on a 290-page document ;D
""",
allow_flagging="never",
)
# https://github.com/gradio-app/gradio/issues/4309
# https://discuss.huggingface.co/t/add-background-image/16381/4 background image
# Start the application.
if __name__ == "__main__":
iface.launch()