trminhnam20082002 committed
Commit
55e492d
1 Parent(s): 59ae732

feat: add model

Files changed (3)
  1. .gitignore +2 -0
  2. app.py +150 -0
  3. utils.py +177 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
+ cache
+ **/__pycache__/
app.py ADDED
@@ -0,0 +1,150 @@
+ # -*- coding: utf-8 -*-
+
+ import streamlit as st
+ import torch
+
+ from utils import (
+     load_model,
+     load_tokenizer,
+     make_input_sentence_from_strings,
+     generate_description,
+ )
+
+ st.set_page_config(
+     page_title="Table-to-text generation",
+     page_icon="📝",
+     layout="wide",
+     initial_sidebar_state="auto",
+     menu_items={
+         "Get Help": "https://huggingface.co/transformers/master/index.html",
+         "Report a bug": "https://github.com",
+     },  # hide the "Made with Streamlit" footer
+ )
+
+ st.title("Table-to-text generation with multilingual pre-trained models")
+ st.markdown(
+     """
+     This is a demo of table-to-text generation with multilingual pre-trained models.
+     The models are trained on our custom dataset, which is sampled from the Viettel Report Template, with descriptions generated by ChatGPT.
+     """
+ )
+
+ st.sidebar.title("Settings")
+ model_name = st.sidebar.selectbox(
+     "Model name",
+     [
+         "vinai/bartpho-syllable",
+         "vinai/bartpho-syllable-base",
+         "google/byt5-base",
+         "google/byt5-small",
+         "facebook/mbart-large-50",
+     ],
+ )
+
+ if torch.cuda.is_available():
+     device = "cuda" if st.sidebar.checkbox("Use GPU", False) else "cpu"
+ else:
+     st.sidebar.checkbox("Use GPU", False, disabled=True)
+     device = "cpu"
+ max_len = st.sidebar.slider("Max length", 32, 512, 256, 32)
+ beam_size = st.sidebar.slider("Beam size", 1, 10, 3, 1)
+ tokenizer = load_tokenizer(model_name)
+ model = load_model(model_name, device)
+
+ # Create one input widget per column of the report table:
+ # CHỈ TIÊU (target), ĐƠN VỊ (unit), ĐIỀU KIỆN (condition), KPI mục tiêu tháng
+ # (monthly KPI target), the current month's actual value, Đánh giá (evaluation),
+ # and the previous-month / previous-year values with their deltas ("So sánh ... Tăng giảm").
+
+ objective_name = st.text_input("CHỈ TIÊU", "")
+ (unit_col, condition_col, kpi_target_col) = st.columns(3)
+ with unit_col:
+     unit = st.text_input("ĐƠN VỊ", "")
+ with condition_col:
+     condition = st.selectbox("ĐIỀU KIỆN", [">=", "<=", None])
+ with kpi_target_col:
+     kpi_target = st.text_input("KPI mục tiêu tháng", "")
+
+ current_date_col, real_value_col, evaluation_col = st.columns(3)
+ with current_date_col:
+     current_date = st.date_input("Thời gian báo cáo")
+     # current_time is [year, month], parsed from the ISO date string.
+     current_time = [int(x) for x in str(current_date).split("-")[:2]]
+ with real_value_col:
+     real_value = st.text_input(f"T{current_time[1]}.{current_time[0]} thực tế", "")
+ with evaluation_col:
+     evaluation_value = st.selectbox(
+         "Đánh giá",
+         ["Đạt", "Không đạt", "Theo dõi"],
+         index=2 if (kpi_target == "" or condition is None) else 0,
+     )
+
+ # Roll back one month, wrapping to December of the previous year if needed.
+ previous_month = (
+     [current_time[0], current_time[1] - 1]
+     if current_time[1] > 1
+     else [current_time[0] - 1, 12]
+ )
+
+ previous_year = [current_time[0] - 1, current_time[1]]
+
+ (
+     previous_month_value_col,
+     previous_month_compare_col,
+     previous_year_value_col,
+     previous_year_compare_col,
+ ) = st.columns(4)
+ with previous_month_value_col:
+     previous_month_value = st.text_input(
+         f"T{previous_month[1]}.{previous_month[0]}", ""
+     )
+ with previous_month_compare_col:
+     # The delta can only be pre-filled when both values are present.
+     previous_month_compare = st.text_input(
+         f"So sánh T{previous_month[1]}.{previous_month[0]} Tăng giảm",
+         float(real_value) - float(previous_month_value)
+         if real_value != "" and previous_month_value != ""
+         else "",
+     )
+ with previous_year_value_col:
+     previous_year_value = st.text_input(f"T{previous_year[1]}.{previous_year[0]}", "")
+ with previous_year_compare_col:
+     previous_year_compare = st.text_input(
+         f"So sánh T{previous_year[1]}.{previous_year[0]} Tăng giảm",
+         float(real_value) - float(previous_year_value)
+         if real_value != "" and previous_year_value != ""
+         else "",
+     )
+
+
+ data = {
+     "CHỈ TIÊU": objective_name,
+     "ĐƠN VỊ": unit,
+     "ĐIỀU KIỆN": condition,
+     "KPI mục tiêu tháng": kpi_target,
+     "Đánh giá": evaluation_value,
+     "Thời gian báo cáo": current_time,
+     f"T{current_time[1]}.{current_time[0]} thực tế": real_value,
+     "Previous month value key": f"T{previous_month[1]}.{previous_month[0]}",
+     f"T{previous_month[1]}.{previous_month[0]}": previous_month_value,
+     "Previous year value key": f"T{previous_year[1]}.{previous_year[0]}",
+     f"T{previous_year[1]}.{previous_year[0]}": previous_year_value,
+     "Previous month compare key": f"So sánh T{previous_month[1]}.{previous_month[0]} Tăng giảm",
+     f"So sánh T{previous_month[1]}.{previous_month[0]} Tăng giảm": previous_month_compare,
+     "Previous year compare key": f"So sánh T{previous_year[1]}.{previous_year[0]} Tăng giảm",
+     f"So sánh T{previous_year[1]}.{previous_year[0]} Tăng giảm": previous_year_compare,
+     "Previous month": previous_month,
+     "Previous year": previous_year,
+ }
+
+
+ if st.button("Generate"):
+     with st.spinner("Generating..."):
+         input_string = make_input_sentence_from_strings(data)
+         print(input_string)
+         descriptions = generate_description(
+             input_string, model, tokenizer, device, max_len, model_name, beam_size
+         )
+
+     st.success(descriptions)
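
With the fine-tuned checkpoint files in place under models/ (following the naming convention that load_model in utils.py expects), the demo should start like any Streamlit app via `streamlit run app.py`. Note that streamlit, torch, and transformers must be installed separately; no requirements file is part of this commit.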
utils.py ADDED
@@ -0,0 +1,177 @@
+ # -*- coding: utf-8 -*-
+
+ import os
+
+ import streamlit as st
+ import torch
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+
+
+ def get_model(args):
+     print(f"Using model {args.model_name}")
+     model = AutoModelForSeq2SeqLM.from_pretrained(args.model_name)
+     model.to(args.device)
+
+     if args.load_model_path:
+         print(f"Loading model from {args.load_model_path}")
+         model.load_state_dict(
+             torch.load(args.load_model_path, map_location=torch.device(args.device))
+         )
+
+     return model
+
+
+ @st.cache(allow_output_mutation=True)
+ def load_model(model_name, device):
+     print(f"Using model {model_name}")
+     os.makedirs("cache", exist_ok=True)
+     model = AutoModelForSeq2SeqLM.from_pretrained(model_name, cache_dir="cache")
+     model.to(device)
+
+     # Fine-tuned weights are stored as models/<base-name>-best_loss.bin.
+     model_name = model_name.split("/")[-1]
+     load_model_path = os.path.join("models", f"{model_name}-best_loss.bin")
+     print(f"Loading model from {load_model_path}")
+     model.load_state_dict(
+         torch.load(load_model_path, map_location=torch.device(device))
+     )
+
+     return model
+
+
+ @st.cache(allow_output_mutation=True)
+ def load_tokenizer(model_name):
+     print(f"Loading tokenizer {model_name}")
+     if "mbart" in model_name.lower():
+         # mBART-50 needs explicit source/target language codes for Vietnamese.
+         tokenizer = AutoTokenizer.from_pretrained(
+             model_name, src_lang="vi_VN", tgt_lang="vi_VN"
+         )
+     else:
+         tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+     return tokenizer
+
+
+ def prepare_batch_model_inputs(batch, tokenizer, max_len, is_train=False, device="cpu"):
+     inputs = tokenizer(
+         batch["src"],
+         text_target=batch["tgt"] if is_train else None,
+         padding="longest",
+         max_length=max_len,
+         truncation=True,
+         return_tensors="pt",
+     )
+
+     for k, v in inputs.items():
+         inputs[k] = v.to(device)
+
+     return inputs
+
+
+ def prepare_single_model_inputs(src, tokenizer, max_len, device="cpu"):
+     inputs = tokenizer(
+         src,
+         padding="longest",
+         max_length=max_len,
+         truncation=True,
+         return_tensors="pt",
+     )
+
+     for k, v in inputs.items():
+         inputs[k] = v.to(device)
+
+     return inputs
+
+
+ def make_input_sentence_from_strings(data):
+     # `data` is the dict built in app.py; see the `data` literal there for
+     # the full set of expected keys.
+     previous_month_value_key = data["Previous month value key"]
+     previous_year_value_key = data["Previous year value key"]
+     objective_name = data["CHỈ TIÊU"]
+     unit = data["ĐƠN VỊ"]
+     condition = data["ĐIỀU KIỆN"]
+     kpi_target = data["KPI mục tiêu tháng"]
+     current_time = data["Thời gian báo cáo"]
+     real_value = data[f"T{current_time[1]}.{current_time[0]} thực tế"]
+     evaluation_value = data["Đánh giá"]
+     previous_month_value = data[previous_month_value_key]
+     previous_year_value = data[previous_year_value_key]
+     previous_month_compare_key = data["Previous month compare key"]
+     previous_year_compare_key = data["Previous year compare key"]
+     previous_month_compare = data[previous_month_compare_key]
+     previous_year_compare = data[previous_year_compare_key]
+     previous_month = data["Previous month"]
+     previous_year = data["Previous year"]
+
+     # Fill a template string shaped like the training examples, e.g.:
+     # """{"CHỈ TIÊU": "Tỷ lệ kết nối thành công đến tổng đài - KHCN_Di động Vip", "ĐƠN VỊ": "%", "ĐIỀU KIỆN": ">=", "KPI mục tiêu tháng": 95.0, "Tháng 9.2022": 97.5, "Đánh giá": "Đạt", "T8.2022": 96.6, "So sánh T8.2022 Tăng giảm": 1.0, "T9.2021": 96.8, "So sánh T9.2021 Tăng giảm": 0.8}"""
+     template_str = '"CHỈ TIÊU": "{}", "ĐƠN VỊ": "{}", "ĐIỀU KIỆN": "{}", "KPI mục tiêu tháng": {}, "Tháng {}.{}": {}, "Đánh giá": "{}", "T{}.{}": {}, "So sánh T{}.{} Tăng giảm": {}, "T{}.{}": {}, "So sánh T{}.{} Tăng giảm": {}'
+     return template_str.format(
+         objective_name,
+         unit,
+         condition,
+         kpi_target,
+         current_time[1],
+         current_time[0],
+         real_value,
+         evaluation_value,
+         previous_month[1],
+         previous_month[0],
+         previous_month_value,
+         previous_month[1],
+         previous_month[0],
+         previous_month_compare,
+         previous_year[1],
+         previous_year[0],
+         previous_year_value,
+         previous_year[1],
+         previous_year[0],
+         previous_year_compare,
+     )
+
+
+ @torch.no_grad()
+ def generate_description(
+     input_string, model, tokenizer, device, max_len, model_name, beam_size
+ ):
+     inputs = prepare_single_model_inputs(
+         input_string, tokenizer, max_len=max_len, device=device
+     )
+     if "mbart" in model_name.lower():
+         # Force Vietnamese as the generation language for mBART-50.
+         inputs["forced_bos_token_id"] = tokenizer.lang_code_to_id["vi_VN"]
+     outputs = model.generate(
+         **inputs,
+         max_length=max_len,
+         num_beams=beam_size,
+     )
+     return tokenizer.batch_decode(
+         outputs, skip_special_tokens=True, clean_up_tokenization_spaces=True
+     )
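
For reference, a minimal sketch of how the new helpers chain together for a single table row, outside the Streamlit UI. This is an illustration, not part of the commit: the field values are invented (they mirror the example in the template comment above), the checkpoint models/bartpho-syllable-best_loss.bin must already exist as load_model expects, and the st.cache-wrapped helpers should still execute outside `streamlit run`, albeit with cache warnings.

# Hypothetical smoke test for the helpers in utils.py (not part of this commit).
from utils import (
    load_tokenizer,
    load_model,
    make_input_sentence_from_strings,
    generate_description,
)

model_name = "vinai/bartpho-syllable"
tokenizer = load_tokenizer(model_name)
model = load_model(model_name, "cpu")  # needs models/bartpho-syllable-best_loss.bin

# Invented sample row, shaped exactly like the `data` dict built in app.py.
data = {
    "CHỈ TIÊU": "Tỷ lệ kết nối thành công đến tổng đài",
    "ĐƠN VỊ": "%",
    "ĐIỀU KIỆN": ">=",
    "KPI mục tiêu tháng": "95.0",
    "Đánh giá": "Đạt",
    "Thời gian báo cáo": [2022, 9],
    "T9.2022 thực tế": "97.5",
    "Previous month value key": "T8.2022",
    "T8.2022": "96.6",
    "Previous year value key": "T9.2021",
    "T9.2021": "96.8",
    "Previous month compare key": "So sánh T8.2022 Tăng giảm",
    "So sánh T8.2022 Tăng giảm": "1.0",
    "Previous year compare key": "So sánh T9.2021 Tăng giảm",
    "So sánh T9.2021 Tăng giảm": "0.8",
    "Previous month": [2022, 8],
    "Previous year": [2021, 9],
}

input_string = make_input_sentence_from_strings(data)
descriptions = generate_description(
    input_string, model, tokenizer, "cpu", 256, model_name, 3
)
print(descriptions[0])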