import gradio as gr
from huggingface_hub import InferenceClient

# Code under gr.NO_RELOAD runs only once, so the client is not
# re-created each time the file is re-executed in reload mode.
if gr.NO_RELOAD:
    client = InferenceClient()

system_message = {
    "role": "system",
    "content": """
You are an expert in understanding comma-separated (.csv) files that hold bank statement records of salary and expenses.
You will be given a question and a set of answers, along with a confidence score between 0 and 1 for each answer.
Your job is to turn this information from the .csv file into a short, coherent response.

For example:
Question: "In which category did I spend the most?", answer: {"answer": "Transportation", "confidence": 0.98}

You should respond with something like:
With a high degree of confidence, I can say Transportation is where you are spending the most money.

Question: "How much did I earn in the last year?", answer: [{"answer": "154.08", "confidence": 0.75}, {"answer": "155", "confidence": 0.25}]

You should respond with something like:
You have earned $154.08 in the last year.
"""}


def chat_fn(multimodal_message):
    question = multimodal_message["text"]
    image = multimodal_message["files"][0]

    # Extract candidate answers (with confidence scores) from the uploaded document image.
    answer = client.document_question_answering(image=image, question=question,
                                                model="impira/layoutlm-document-qa")
    answer = [{"answer": a.answer, "confidence": a.score} for a in answer]

    user_message = {"role": "user", "content": f"Question: {question}, answer: {answer}"}

    # Stream the LLM's natural-language summary token by token.
    message = ""
    for token in client.chat_completion(messages=[system_message, user_message],
                                        max_tokens=100,
                                        stream=True,
                                        model="HuggingFaceH4/zephyr-7b-beta"):
        if token.choices[0].finish_reason is not None:
            continue
        message += token.choices[0].delta.content
        yield message


with gr.Blocks() as demo:
    gr.Markdown("# 🔍 Xray with your recent transactions")
    response = gr.Textbox(lines=5, label="Response")
    chat = gr.MultimodalTextbox(file_types=["image"], interactive=True, show_label=False,
                                placeholder="Upload a document image by clicking '+' and ask a question about your records.")
    chat.submit(chat_fn, inputs=chat, outputs=response)

if __name__ == "__main__":
    demo.launch()
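
# Usage note (a sketch, assuming this file is saved as app.py; the filename is
# not given in the source): running `gradio app.py` starts the demo in reload
# mode, where edits to the file re-render the UI on save while the
# `if gr.NO_RELOAD:` block above is skipped on re-runs, so the InferenceClient
# is constructed only once. Running `python app.py` also works and simply
# executes the whole script top to bottom.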