Elron committed on
Commit
9b31eb9
1 Parent(s): 7ba0534

Upload serializers.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. serializers.py +130 -0
serializers.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import ABC, abstractmethod
2
+ from copy import deepcopy
3
+ from typing import (
4
+ Any,
5
+ Dict,
6
+ List,
7
+ )
8
+
9
+ from .operators import FieldOperator
10
+
11
+ """
12
+ TableSerializer converts a given table into a flat sequence with special symbols.
13
+ Input table format must be:
14
+ {"header": ["col1", "col2"], "rows": [["row11", "row12"], ["row21", "row22"], ["row31", "row32"]]}
15
+ Output format varies depending on the chosen serializer. Abstract class at the top defines structure of a typical table serializer that any concrete implementation should follow.
16
+ """
17
+
18
+
19
class TableSerializer(ABC, FieldOperator):
    """Abstract interface for serializers that turn a table into a string.

    Concrete serializers must implement every method declared here.
    """

    @abstractmethod
    def serialize_table(self, table_content: Dict) -> str:
        """Serialize the whole table given in ``table_content`` into a string."""

    @abstractmethod
    def process_header(self, header: List):
        """Process the table header."""

    @abstractmethod
    def process_row(self, row: List, row_index: int):
        """Process a single table row; ``row_index`` identifies that row."""
34
+
35
+
36
+ # Concrete classes implementing table serializers follow..
37
+ """
38
+ Indexed Row Major Table Serializer.
39
+ Commonly used row major serialization format.
40
+ Format: col : col1 | col2 | col3 row 1 : val1 | val2 | val3 row 2 : val1 | ...
41
+ """
42
+
43
+
44
class IndexedRowMajorTableSerializer(TableSerializer):
    """Serialize a table row by row, labelling every row with its 1-based index.

    Output shape: ``col : c1 | c2 row 1 : v11 | v12 row 2 : v21 | v22``.
    """

    def process_value(self, table: Any) -> Any:
        """Serialize ``table``, operating on a deep copy of it."""
        table_input = deepcopy(table)
        return self.serialize_table(table_content=table_input)

    def serialize_table(self, table_content: Dict) -> str:
        """Return the flat, single-line string form of ``table_content``.

        ``table_content`` must be in the prescribed input format: a dict with
        a non-empty ``"header"`` list and a non-empty ``"rows"`` list of lists.

        Raises:
            AssertionError: if the header or the rows are missing or empty.
        """
        header = table_content.get("header", [])
        rows = table_content.get("rows", [])

        # Raised explicitly rather than through ``assert`` so the validation
        # still runs under ``python -O``; the exception type is kept so that
        # existing callers catching AssertionError keep working.
        if not (header and rows):
            raise AssertionError("Incorrect input table format")

        # Header first, then the rows numbered from 1, all separated by spaces.
        parts = [self.process_header(header)]
        parts.extend(
            self.process_row(row, row_index=i) for i, row in enumerate(rows, start=1)
        )
        return " ".join(parts).strip()

    def process_header(self, header: List):
        """Return ``"col : "`` followed by the column names joined with ``" | "``."""
        # str() keeps non-string column names (e.g. ints) from breaking join().
        return "col : " + " | ".join(str(name) for name in header)

    def process_row(self, row: List, row_index: int):
        """Return ``"row <row_index> : "`` followed by the cells joined with ``" | "``."""
        # Every cell is stringified, not only ints/floats, so values such as
        # None do not raise TypeError inside join().
        cells = " | ".join(str(value) for value in row)
        return f"row {row_index} : {cells}"
82
+
83
+
84
+ """
85
+ Markdown Table Serializer.
86
+ Markdown table format is used in GitHub code primarily.
87
+ Format:
88
+ |col1|col2|col3|
89
+ |---|---|---|
90
+ |A|4|1|
91
+ |I|2|1|
92
+ ...
93
+ """
94
+
95
+
96
class MarkdownTableSerializer(TableSerializer):
    """Serialize a table as a Markdown table.

    Output shape: a ``|col1|col2|`` header line, a ``|---|---|`` separator
    line, then one ``|v1|v2|`` line per row.
    """

    def process_value(self, table: Any) -> Any:
        """Serialize ``table``, operating on a deep copy of it."""
        table_input = deepcopy(table)
        return self.serialize_table(table_content=table_input)

    def serialize_table(self, table_content: Dict) -> str:
        """Return the Markdown string form of ``table_content``.

        ``table_content`` must be in the prescribed input format: a dict with
        a non-empty ``"header"`` list and a non-empty ``"rows"`` list of lists.

        Raises:
            AssertionError: if the header or the rows are missing or empty.
        """
        header = table_content.get("header", [])
        rows = table_content.get("rows", [])

        # Raised explicitly rather than through ``assert`` so the validation
        # still runs under ``python -O``; the exception type is kept so that
        # existing callers catching AssertionError keep working.
        if not (header and rows):
            raise AssertionError("Incorrect input table format")

        # Each piece already ends with a newline, so plain concatenation is
        # enough; the final strip() removes the trailing newline.
        parts = [self.process_header(header)]
        parts.extend(
            self.process_row(row, row_index=i) for i, row in enumerate(rows, start=1)
        )
        return "".join(parts).strip()

    def process_header(self, header: List):
        """Return the header line and the ``---`` separator line, each newline-terminated."""
        # str() mirrors process_row so non-string column names do not break join().
        names = "|".join(str(name) for name in header)
        separator = "|".join(["---"] * len(header))
        return f"|{names}|\n|{separator}|\n"

    def process_row(self, row: List, row_index: int):
        """Return one newline-terminated ``|v1|v2|`` line; ``row_index`` is not used."""
        cells = "|".join(str(cell) for cell in row)
        return f"|{cells}|\n"