# TODO: return all pages used to form answer
# TODO: question samples
# TEST: with and without GPU instance
# TODO: visual questions on page image (in same app)?
import torch
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import SimpleDirectoryReader
from llama_index.core import VectorStoreIndex, SummaryIndex
from llama_index.core.prompts import PromptTemplate
from llama_index.core import Settings
from PIL import Image
import gradio as gr


def messages_to_prompt(messages):
    prompt = ""
    for message in messages:
        if message.role == "system":
            m = "You are an expert in the research field of document understanding, bayesian deep learning and neural networks."
            prompt += f"<|system|>\n{m}</s>\n"
        elif message.role == "user":
            prompt += f"<|user|>\n{message.content}</s>\n"
        elif message.role == "assistant":
            prompt += f"<|assistant|>\n{message.content}</s>\n"

    # ensure we start with a system prompt, insert blank if needed
    if not prompt.startswith("<|system|>\n"):
        prompt = "<|system|>\n</s>\n" + prompt

    # add final assistant prompt
    prompt = prompt + "<|assistant|>\n"
    return prompt
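
# For illustration only: a single user message such as "What is Bayesian deep learning?"
# (a made-up example question) is rendered by messages_to_prompt in the Zephyr chat format as:
#
#   <|system|>
#   </s>
#   <|user|>
#   What is Bayesian deep learning?</s>
#   <|assistant|>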


def load_RAG_pipeline():
    # LLM
    quantization_config = {}  # dirty fix for CPU/GPU support
    if torch.cuda.is_available():
        from transformers import BitsAndBytesConfig

        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
        )
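    # Note (added remark): 4-bit NF4 weights with double quantization take roughly a
    # quarter of the fp16 memory footprint of the 7B model on GPU; on CPU the config
    # stays empty and the model is loaded without quantization.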
    llm = HuggingFaceLLM(
        model_name="HuggingFaceH4/zephyr-7b-alpha",
        tokenizer_name="HuggingFaceH4/zephyr-7b-alpha",
        query_wrapper_prompt=PromptTemplate("<|system|>\n</s>\n<|user|>\n{query_str}</s>\n<|assistant|>\n"),
        context_window=3900,
        max_new_tokens=256,
        model_kwargs={"quantization_config": quantization_config},
        # tokenizer_kwargs={},
        generate_kwargs={"temperature": 0.7, "top_k": 50, "top_p": 0.95},
        messages_to_prompt=messages_to_prompt,
        device_map="auto",
    )
    # Llama-index
    Settings.llm = llm
    Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
    # Settings.chunk_size = 512
    # Settings.chunk_overlap = 50

    # raw data
    documents = SimpleDirectoryReader("assets/txts").load_data()
    vector_index = VectorStoreIndex.from_documents(documents)
    # vector_index.persist(persist_dir="vectors")
    # https://docs.llamaindex.ai/en/v0.10.17/understanding/storing/storing.html
    # summary_index = SummaryIndex.from_documents(documents)

    query_engine = vector_index.as_query_engine(response_mode="compact", similarity_top_k=3)
    return query_engine
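

# A minimal sketch (not wired in) of persisting the index so the embeddings are not
# recomputed on every startup; the "vectors" directory name is an assumption:
#
#   from llama_index.core import StorageContext, load_index_from_storage
#   vector_index.storage_context.persist(persist_dir="vectors")            # after building
#   storage_context = StorageContext.from_defaults(persist_dir="vectors")
#   vector_index = load_index_from_storage(storage_context)                # on later runs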

query_engine = load_RAG_pipeline()


# Functions backing the Gradio interface: retrieve an answer and the supporting
# page for a question, using the query engine built above.
def get_answer(question, temperature, nucleus_sampling, max_tokens):
    # Answer the question with the RAG query engine.
    # https://docs.llamaindex.ai/en/stable/module_guides/supporting_modules/settings/#setting-local-configurations
    response = query_engine.query(question)
    return response
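
# Note (assumption, not wired in): the temperature, nucleus_sampling and max_tokens
# sliders are accepted but not yet applied per query. One possible way to honour them
# would be to update the LLM settings before querying, e.g.:
#
#   Settings.llm.generate_kwargs.update({"temperature": temperature, "top_p": nucleus_sampling})
#   Settings.llm.max_new_tokens = max_tokens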


def get_answer_page(response):
    # Retrieve the page number and an image of the page that best supports the answer,
    # taking the top-ranked source node as the best match.
    best_match = response.source_nodes[0].metadata["file_path"]
    answer_page = int(best_match[-8:-4])
    image = Image.open(best_match.replace("txt", "png"))
    return image, f"Navigate to page {answer_page}"
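
# File layout assumed by get_answer_page: one text file per thesis page under assets/txts,
# ending in a zero-padded four-digit page number (e.g. assets/txts/page_0042.txt, an
# illustrative name), plus a matching PNG render of that page. The .replace("txt", "png")
# call swaps every "txt" in the path for "png", so under that assumption the images would
# live in a parallel directory with a .png extension.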


# Function wired into gr.Interface.
def ask_my_thesis(question, temperature, nucleus_sampling, max_tokens):
    answer = get_answer(question, temperature, nucleus_sampling, max_tokens)
    image, answer_page = get_answer_page(answer)
    return answer, image, answer_page


# Set up the interface inputs and outputs.
output_image = gr.Image(label="Answer Page")

# examples
iface = gr.Interface(
    fn=ask_my_thesis,
    inputs=[
        gr.Textbox(label="Question", placeholder="Type your question here..."),
        gr.Slider(0, 1, value=0.7, label="Temperature"),
        gr.Slider(0, 1, value=0.9, label="Nucleus Sampling"),
        gr.Slider(1, 500, value=100, label="Max Number of Generated Tokens"),
    ],
    outputs=[gr.Textbox(label="Answer"), output_image, gr.Label()],
    title="Ask my thesis: Intelligent Automation for AI-Driven Document Understanding",
    description=r"""Chat with the thesis manuscript: ask questions and receive answers with multimodal references (WIP).
    Spoiler: a RAG application with an LLM and an embedding vector store can be quite slow on a 290-page document ;D
    """,
    allow_flagging="never",
)
# https://github.com/gradio-app/gradio/issues/4309
# https://discuss.huggingface.co/t/add-background-image/16381/4 background image

# Start the application.
if __name__ == "__main__":
    iface.launch()
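
# Optional (untested here): for long-running CPU inference it may help to enable
# Gradio's request queue before launching, e.g. iface.queue().launch().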