vaishali committed on
Commit
66a54f9
·
verified ·
1 Parent(s): 4a8a385

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +98 -3
README.md CHANGED
@@ -1,3 +1,98 @@
1
- ---
2
- license: mit
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ ---
4
+
5
+ # Usage
6
+ ```python
7
+ import pandas as pd
8
+ from datasets import load_dataset
9
+ from transformers import AutoTokenizer, MBartForConditionalGeneration
10
+ model = MBartForConditionalGeneration.from_pretrained("vaishali/HiTQA-BnTQA")
11
+ tokenizer = AutoTokenizer.from_pretrained(args.pretrained_model_name, src_lang="hi_IN", tgt_lang="hi_IN")
12
+ forced_bos_id = forced_bos_token_id = tokenizer.lang_code_to_id["hi_IN"]
13
+
14
+
15
+ # linearize table
16
+ def process_header(headers: List):
17
+ return "<कलाम> " + " | ".join(headers)
18
+
19
+ def process_row(row: List, row_index: int):
20
+ hi2enDigits = {'०': '0', '१': '1', '२': '2', '३': '3', '४': '4', '५': '5', '६': '6', '७': '7', '८': '8',
21
+ '९': '9', '.': '.'}
22
+ en2hiDigits = {v:k for k, v in hi2enDigits.items()}
23
+ row_str = ""
24
+ row_cell_values = []
25
+ for cell_value in row:
26
+ if isinstance(cell_value, int) or isinstance(cell_value, float):
27
+ cell_value = "".join(en2hiDigits.get(c, c) for c in str(cell_value))
28
+ row_cell_values.append(str(cell_value))
29
+ else:
30
+ row_cell_values.append(cell_value)
31
+ row_str += " | ".join(row_cell_values)
32
+ hi_row_index = []
33
+ for c in str(row_index):
34
+ hi_row_index.append(en2hiDigits[c])
35
+ return "<रो " + "".join(hi_row_index) + "> " + row_str
36
+
37
+ def process_table(table_content: Dict):
38
+ table_str = process_header(table_content["header"]) + " "
39
+ for i, row_example in enumerate(table_content["rows"]):
40
+ table_str += process_row(row_example, row_index=i + 1) + " "
41
+ return table_str.strip()
42
+
43
+ # load the dataset
44
+ hinditableQA = load_dataset("vaishali/hindiTabQA")
45
+
46
+ for sample in hinditableQA['train']:
47
+ question = sample['question']
48
+ input_table = pd.read_json(sample['table'], orient='split')
49
+ answer = pd.read_json(sample['answer'], orient='split')
50
+
51
+ # create the input sequence: query + linearized input table
52
+ table_content = {"header": list(input_table.columns)[1:], "rows": [list(row.values)[1:] for i, row in input_table.iterrows()]}
53
+ linearized_inp_table = process_table(table_content)
54
+ linearized_output_table = process_table({"name": None, "header": [self.translate_column(col) for col in list(answer.columns)],
55
+ "rows": [list(row.values) for i, row in answer.iterrows()]})
56
+ source = question + " " + linearized_inp_table
57
+ target = linearized_output_table
58
+ input = tokenizer(source,
59
+ return_tensors="pt",
60
+ padding="max_length",
61
+ truncation="longest_first",
62
+ max_length=1024,
63
+ add_special_tokens=True)
64
+
65
+ with tokenizer.as_target_tokenizer():
66
+ labels = tokenizer(target,
67
+ return_tensors="pt",
68
+ padding="max_length",
69
+ truncation="longest_first",
70
+ max_length=1024,
71
+ add_special_tokens=True).input_ids
72
+
73
+ # inference
74
+ out = model.generate(input["input_ids"].to("cuda"), num_beams=5, return_dict_in_generate=True,
75
+ output_scores=True, max_length=1024)
76
+ ```
77
+ # BibTeX entry and citation info
78
+ ```
79
+ @inproceedings{pal-etal-2024-table,
80
+ title = "Table Question Answering for Low-resourced {I}ndic Languages",
81
+ author = "Pal, Vaishali and
82
+ Kanoulas, Evangelos and
83
+ Yates, Andrew and
84
+ de Rijke, Maarten",
85
+ editor = "Al-Onaizan, Yaser and
86
+ Bansal, Mohit and
87
+ Chen, Yun-Nung",
88
+ booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
89
+ month = nov,
90
+ year = "2024",
91
+ address = "Miami, Florida, USA",
92
+ publisher = "Association for Computational Linguistics",
93
+ url = "https://aclanthology.org/2024.emnlp-main.5",
94
+ pages = "75--92",
95
+ abstract = "TableQA is the task of answering questions over tables of structured information, returning individual cells or tables as output. TableQA research has focused primarily on high-resource languages, leaving medium- and low-resource languages with little progress due to scarcity of annotated data and neural models. We address this gap by introducing a fully automatic large-scale tableQA data generation process for low-resource languages with limited budget. We incorporate our data generation method on two Indic languages, Bengali and Hindi, which have no tableQA datasets or models. TableQA models trained on our large-scale datasets outperform state-of-the-art LLMs. We further study the trained models on different aspects, including mathematical reasoning capabilities and zero-shot cross-lingual transfer. Our work is the first on low-resource tableQA focusing on scalable data generation and evaluation procedures. Our proposed data generation method can be applied to any low-resource language with a web presence. We release datasets, models, and code (https://github.com/kolk/Low-Resource-TableQA-Indic-languages).",
96
+ }
97
+
98
+ ```