kilachei committed on
Commit
01cf742
·
verified ·
1 Parent(s): be63fd4

Upload 11 files

Browse files
Files changed (11) hide show
  1. README.md +4 -5
  2. all_datasets.py +129 -0
  3. app.py +198 -0
  4. gitattributes +36 -0
  5. imports.py +31 -0
  6. model.py +54 -0
  7. packages.txt +1 -0
  8. parse_info.py +112 -0
  9. requirements.txt +19 -0
  10. skills.csv +0 -0
  11. utils.py +113 -0
README.md CHANGED
@@ -1,13 +1,12 @@
1
  ---
2
- title: Cv Parser
3
- emoji: 🚀
4
  colorFrom: blue
5
- colorTo: gray
6
  sdk: gradio
7
- sdk_version: 4.21.0
8
  app_file: app.py
9
  pinned: false
10
- license: mit
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Demo
3
+ emoji: 🌖
4
  colorFrom: blue
5
+ colorTo: red
6
  sdk: gradio
7
+ sdk_version: 3.35.2
8
  app_file: app.py
9
  pinned: false
 
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
all_datasets.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from imports import *
2
+ from utils import normalize, replace_all
3
+
4
class NerFeatures(object):
    """Hold the tensors that describe one tokenized NER example.

    Every constructor argument is converted with ``torch.as_tensor`` to a
    ``torch.long`` tensor and stored under the attribute of the same name.
    """

    def __init__(self, input_ids, token_type_ids, attention_mask, valid_ids, labels, label_masks):
        def as_long(values):
            return torch.as_tensor(values, dtype=torch.long)

        # The assignment order fixes the order of ``__dict__``.
        self.input_ids = as_long(input_ids)
        self.labels = as_long(labels)
        self.token_type_ids = as_long(token_type_ids)
        self.attention_mask = as_long(attention_mask)
        self.valid_ids = as_long(valid_ids)
        self.label_masks = as_long(label_masks)
12
+
13
class NerOutput(OrderedDict):
    """Ordered mapping of model outputs whose entries are also attributes.

    An entry can be read by key (``out["tags"]``), by position (``out[0]``)
    or as an attribute (``out.tags``).  Reading an attribute that was never
    set on the instance falls back to the class-level defaults below.
    """

    # Class-level defaults.  The list defaults are shared objects, so they
    # must be replaced by assignment rather than mutated in place.
    loss: Optional[torch.FloatTensor] = torch.FloatTensor([0.0])
    tags: Optional[List[int]] = []
    cls_metrics: Optional[List[int]] = []

    def __getitem__(self, k):
        """Return the entry for a string key, or the k-th value otherwise."""
        if isinstance(k, str):
            # Direct lookup; no need to copy the whole mapping per access.
            return super().__getitem__(k)
        return self.to_tuple()[k]

    def __setattr__(self, name, value):
        """Set the attribute and keep an existing entry of that name in sync."""
        if name in self.keys() and value is not None:
            super().__setitem__(name, value)
        super().__setattr__(name, value)

    def __setitem__(self, key, value):
        """Store the entry and mirror it as an attribute."""
        super().__setitem__(key, value)
        super().__setattr__(key, value)

    def to_tuple(self) -> Tuple[Any, ...]:
        """Return all values in insertion order."""
        return tuple(self.values())
32
+
33
class NerDataset(Dataset):
    """Map-style dataset over a list of pre-computed feature objects.

    Each item is a dict built from the attributes of the stored feature
    object, with every value moved to ``device`` via ``.to``.
    """

    def __init__(self, features: List[NerFeatures], device: str = 'cpu'):
        self.device = device
        self.examples = features

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, index):
        example = self.examples[index]
        item = {}
        for name, tensor in vars(example).items():
            item[name] = tensor.to(self.device)
        return item
43
+
44
# Build the tokenized sentiment train/test splits in torch format.
def sentiment_dataset(path_folder, train_file_name, test_file_name):
    """Load two sentiment CSV files and return them tokenized.

    Each CSV must provide a ``text`` column and the score columns
    ``negative``, ``positive`` and ``neutral``; rows with missing values are
    dropped.  The label of a row is the position (0, 1 or 2) of its largest
    score in that column order.

    Args:
        path_folder: directory containing both CSV files.
        train_file_name: file name of the training split.
        test_file_name: file name of the test split.

    Returns:
        A ``datasets.DatasetDict`` with ``train`` and ``test`` splits,
        tokenized and formatted as torch tensors for the columns
        ``input_ids``, ``token_type_ids``, ``attention_mask`` and ``label``.
    """
    score_columns = ["negative", "positive", "neutral"]

    def load_split(csv_path):
        frame = pd.read_csv(os.path.join(csv_path), encoding="utf-8").dropna()
        scores = frame[score_columns].values.astype(float)
        targets = [np.argmax(row) for row in scores]
        return frame["text"], targets

    def to_hf_dataset(texts, targets):
        table = pd.DataFrame(data=zip(texts, targets), columns=['text', 'label'])
        return datasets.Dataset.from_pandas(table)

    x_train, y_train = load_split(os.path.join(path_folder, train_file_name))
    x_test, y_test = load_split(os.path.join(path_folder, test_file_name))
    train_set = to_hf_dataset(x_train, y_train)
    test_set = to_hf_dataset(x_test, y_test)
    splits = datasets.DatasetDict({'train': train_set, 'test': test_set})

    tokenizer = AutoTokenizer.from_pretrained('wonrax/phobert-base-vietnamese-sentiment', use_fast=False)

    def tokenize(batch):
        return tokenizer(list(batch['text']), padding=True, truncation=True)

    # batch_size=None makes ``map`` hand the whole split to ``tokenize`` at once.
    tokenized = splits.map(tokenize, batched=True, batch_size=None)
    tokenized.set_format('torch', columns=["input_ids", 'token_type_ids', "attention_mask", "label"])
    return tokenized
63
+
64
# Build fixed-length NER features for every tagged sentence.
def feature_for_phobert(data, tokenizer, max_seq_len: int = 256, use_crf: bool = False) -> List[NerFeatures]:
    """Convert tagged sentences into ``NerFeatures`` of length ``max_seq_len``.

    Args:
        data: iterable of sentences; each sentence is a list of
            ``(word, tag)`` pairs whose tag is one of the names in
            ``idx2tag`` below.  Empty sentences (``[]``) are skipped.
        tokenizer: object that is callable as
            ``tokenizer(sentence, padding='max_length', truncation=True,
            max_length=...)`` and also provides ``tokenize(sentence)``.
            A subword ending in ``"@@"`` is treated as continued by the
            next subword.
        max_seq_len: length every produced array must have.
        use_crf: selects the layout of the produced arrays.  When true,
            ``valid_ids[k]`` holds the encoded position of the first subword
            of word ``k``, ``labels`` are the per-word tag ids and
            ``label_masks`` marks the real words.  When false, ``valid_ids``
            is a 0/1 mask over encoded positions, ``labels`` carries the tag
            id at word-start positions and ``-100`` elsewhere, and
            ``label_masks`` equals ``valid_ids``.

    Returns:
        One ``NerFeatures`` per processed sentence.

    Raises:
        KeyError: a tag is not present in ``idx2tag``.
        AssertionError: a produced array does not have ``max_seq_len`` items.
    """
    features = []

    idx2tag = {0: 'B-chỗ để xe', 1: 'B-con người', 2: 'B-công việc', 3: 'B-cơ sở vật chất', 4: 'B-dự án', 5: 'B-lương', 6: 'B-môi trường làm việc', 7: 'B-ot/thời gian', 8: 'B-văn phòng', 9: 'B-đãi ngộ', 10: 'I-chỗ để xe', 11: 'I-con người', 12: 'I-công việc', 13: 'I-cơ sở vật chất', 14: 'I-dự án', 15: 'I-lương', 16: 'I-môi trường làm việc', 17: 'I-ot/thời gian', 18: 'I-văn phòng', 19: 'I-đãi ngộ', 20: 'O'}
    tag2idx = {tag: idx for idx, tag in idx2tag.items()}

    # Subwords beyond this bound cannot be addressed in the fixed-size arrays
    # (position idx + 1 must stay below max_seq_len).  Presumably the two
    # reserved slots are the tokenizer's begin/end special tokens -- confirm.
    max_subwords = max(max_seq_len - 2, 0)

    for tokens in data:
        if tokens == []:
            continue
        tag_ids = [tag2idx[tok[1]] for tok in tokens]
        seq_len = len(tokens)
        sentence = ' '.join([tok[0] for tok in tokens])
        encoding = tokenizer(sentence, padding='max_length', truncation=True, max_length=max_seq_len)
        subwords = tokenizer.tokenize(sentence)
        valid_ids = np.zeros(len(encoding.input_ids), dtype=int)
        label_marks = np.zeros(len(encoding.input_ids), dtype=int)
        valid_labels = np.ones(len(encoding.input_ids), dtype=int) * -100

        word_no = 1  # 1-based index of the word the current subword starts
        for idx, _ in enumerate(subwords[:max_subwords]):
            # A subword following an "@@" piece continues the previous word.
            if idx != 0 and subwords[idx - 1].endswith("@@"):
                continue
            if use_crf:
                valid_ids[word_no - 1] = idx + 1
            else:
                valid_ids[idx + 1] = 1
                valid_labels[idx + 1] = tag_ids[word_no - 1]
            word_no += 1

        if max_seq_len >= seq_len:
            label_padding_size = (max_seq_len - seq_len)
            label_marks[:seq_len] = [1] * seq_len
            tag_ids.extend([0] * label_padding_size)
        else:
            tag_ids = tag_ids[:max_seq_len]
            label_marks[:-2] = [1] * (max_seq_len - 2)
            tag_ids[-2:] = [0] * 2

        if use_crf and label_marks[0] == 0:
            # Report the sample and stop processing the remaining sentences
            # (same outcome as before, without raising a str to get here).
            print(f"{sentence} - {tag_ids} have mark == 0 at index 0!")
            break

        items = dict(encoding.items())
        items['labels'] = tag_ids if use_crf else valid_labels
        items['valid_ids'] = valid_ids
        items['label_masks'] = label_marks if use_crf else valid_ids
        features.append(NerFeatures(**items))
        for k, v in items.items():
            assert len(v) == max_seq_len, f"Expected length of {k} is {max_seq_len} but got {len(v)}"
    return features
117
+
118
# Create the train/test NER datasets.
def topic_dataset(path_folder, file_name, tokenizer, use_crf=True):
    """Read the tagged CSV, split it 80/20 and wrap both parts as datasets.

    Args:
        path_folder: directory containing the CSV file.
        file_name: name of the CSV file.
        tokenizer: tokenizer forwarded to ``feature_for_phobert``.
        use_crf: feature layout flag forwarded to ``feature_for_phobert``.

    Returns:
        ``(train_set, test_set)`` as two ``NerDataset`` objects.  The split
        uses ``random_state=42`` and is therefore reproducible.
    """
    csv_path = os.path.join(path_folder, file_name)
    # NOTE(review): read_csv_to_ner_data is not defined in the code visible
    # here -- confirm it is in scope before relying on this function.
    samples = read_csv_to_ner_data(csv_path)
    train_data, test_data = train_test_split(samples, test_size=0.2, random_state=42)

    def build(split):
        return NerDataset(feature_for_phobert(split, tokenizer=tokenizer, use_crf=use_crf))

    train_set = build(train_data)
    test_set = build(test_data)
    return train_set, test_set
128
+
129
+
app.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from imports import *
3
+ from parse_info import *
4
+ #os.system("apt-get install poppler-utils")
5
+ token = os.environ.get("HF_TOKEN")
6
+ login(token=token)
7
+
8
+
9
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
10
+ dict_ = {
11
+ 0: "negative",
12
+ 1: "positive",
13
+ 2: "neutral"}
14
+ tokenizer_sent = AutoTokenizer.from_pretrained("nam194/sentiment", use_fast=False)
15
+ model_sent = AutoModelForSequenceClassification.from_pretrained("nam194/sentiment", num_labels=3, use_auth_token=True).to(device)
16
def cvt2cls(data):
    """Collapse NER tag ids into topic class ids.

    The distinct ids of ``data`` are taken, id 20 is dropped and every id
    of 10 or more is shifted down by 10.

    Args:
        data: iterable of hashable integer tag ids.

    Returns:
        List of class ids.  A class can appear twice when both its id and
        its id + 10 were present in ``data``.
    """
    unique_ids = set(data)
    unique_ids.discard(20)  # no error when 20 is absent
    return [tag_id - 10 if tag_id >= 10 else tag_id for tag_id in unique_ids]
28
+ ner_tags = {0: 'B-chỗ để xe', 1: 'B-con người', 2: 'B-công việc', 3: 'B-cơ sở vật chất', 4: 'B-dự án', 5: 'B-lương', 6: 'B-môi trường làm việc', 7: 'B-ot/thời gian', 8: 'B-văn phòng', 9: 'B-đãi ngộ', 10: 'I-chỗ để xe', 11: 'I-con người', 12: 'I-công việc', 13: 'I-cơ sở vật chất', 14: 'I-dự án', 15: 'I-lương', 16: 'I-môi trường làm việc', 17: 'I-ot/thời gian', 18: 'I-văn phòng', 19: 'I-đãi ngộ', 20: 'O'}
29
+ topic_tags = {0: 'chỗ để xe', 1: 'con người', 2: 'công việc', 3: 'cơ sở vật chất', 4: 'dự án', 5: 'lương', 6: 'môi trường làm việc', 7: 'ot/thời gian', 8: 'văn phòng', 9: 'đãi ngộ'}
30
+ config = RobertaConfig.from_pretrained("nam194/ner", num_labels=21)
31
+ tokenizer_topic = AutoTokenizer.from_pretrained("nam194/ner", use_fast=False)
32
+ model_topic = PhoBertLstmCrf.from_pretrained("nam194/ner", config=config, from_tf=False).to(device)
33
+ model_topic.resize_token_embeddings(len(tokenizer_topic))
34
+
35
+
36
def sentiment(sent: str):
    """Predict the sentiment of a review and the topics it mentions.

    The review is logged to stdout, classified with ``model_sent`` and then
    tagged with ``model_topic``; the tag ids are reduced to topic names.

    Args:
        sent: raw review text.

    Returns:
        A two-line string: ``"Sentiment: <label>"`` followed by
        ``"Topic in sentence: <topics>"``.  The topic part is empty when the
        review yields no tokens to tag.
    """
    banner = "\n--------------------------------------------------------------------------------------------------------------------------\n"
    print(banner)
    print("New review inference at: ", datetime.utcnow())
    print("review: ", sent)
    print(banner)

    # Sentence-level sentiment.
    sent_ = normalize(text=sent)
    input_sent = torch.tensor([tokenizer_sent.encode(sent_)]).to(device)
    with torch.no_grad():
        out_sent = model_sent(input_sent)
        logits_sent = out_sent.logits.softmax(dim=-1).tolist()[0]
        pred_sent = dict_[int(np.argmax(logits_sent))]

    # Word-level topic tagging: every word gets the placeholder tag 'O'.
    sent = replace_all(text=sent)
    words = []
    for segment in sent.split("."):
        words.extend(underthesea.word_tokenize(segment.strip(), format="text").split())
    dump = [[word, 'O'] for word in words]
    dump_set = NerDataset(feature_for_phobert([dump], tokenizer=tokenizer_topic, use_crf=True))
    dump_iter = DataLoader(dump_set, batch_size=1)

    # Stays empty when the dataset is empty (e.g. blank input); previously
    # the name was left unbound in that case.
    pred_topic = []
    with torch.no_grad():
        for batch in dump_iter:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model_topic(**batch)
            pred_topic = list(set([topic_tags[i] for i in cvt2cls(outputs["tags"][0])]))
    return "Sentiment: " + pred_sent + "\n" + "Topic in sentence: " + ". ".join([i.capitalize() for i in pred_topic])
62
+
63
+
64
+ processor = transformers.AutoProcessor.from_pretrained("nam194/resume_parsing_layoutlmv3_large_custom_label", use_auth_token=True, apply_ocr=False)
65
+ model = transformers.LayoutLMv3ForTokenClassification.from_pretrained("nam194/resume_parsing_layoutlmv3_large_custom_label").to(device)
66
+ # model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8).to(device)
67
+ label_list = ['person_name', 'dob_key', 'dob_value', 'gender_key', 'gender_value', 'phonenumber_key', 'phonenumber_value', 'email_key', 'email_value',
68
+ 'address_key', 'address_value', 'socical_address_value', 'education', 'education_name', 'education_time', 'experience', 'experience_name',
69
+ 'experience_time', 'information', 'undefined', 'designation_key', 'designation_value', 'degree_key', 'degree_value', 'skill_key', 'skill_value']
70
+ id2label = {0: 'person_name', 1: 'dob_key', 2: 'dob_value', 3: 'gender_key', 4: 'gender_value', 5: 'phonenumber_key', 6: 'phonenumber_value',
71
+ 7: 'email_key', 8: 'email_value', 9: 'address_key', 10: 'address_value', 11: 'socical_address_value', 12: 'education', 13: 'education_name',
72
+ 14: 'education_time', 15: 'experience', 16: 'experience_name', 17: 'experience_time', 18: 'information', 19: 'undefined', 20: 'designation_key',
73
+ 21: 'designation_value', 22: 'degree_key', 23: 'degree_value', 24: 'skill_key', 25: 'skill_value'}
74
+ key_list = ["person_name","dob_value","gender_value","phonenumber_value","email_value","address_value",
75
+ "socical_address_value","education_name","education_time","experience_name","experience_time",
76
+ "designation_value","degree_value","skill_value"]
77
+ label2id = {v: k for k, v in id2label.items()}
78
def pred_resume(pdf_path) -> dict:
    """Extract resume fields from a PDF with the LayoutLMv3 token classifier.

    Every page is read twice: as text spans with bounding boxes (``fitz``)
    and as an image (``pdf2image``).  The spans of a page are classified and
    the texts whose predicted label is listed in ``key_list`` are collected.

    Args:
        pdf_path: uploaded file object; only its ``name`` attribute (the
            path on disk) is used.

    Returns:
        Dict mapping every label of ``key_list`` to the de-duplicated list
        of span texts predicted with that label (order not defined).
    """
    result = {key: [] for key in key_list}
    # Factor applied to the PDF span coordinates before normalising them
    # against the rendered image size.
    # NOTE(review): value kept from the original; pdf2image renders at
    # 200 dpi by default and PDF units are 1/72 inch, so 200/72 may have
    # been intended -- confirm.
    DPI = 200/77
    # Overlap between consecutive 512-token chunks of a long page.
    stride = 256

    # Read all text blocks first so the document handle can be released.
    doc = fitz.open(pdf_path.name)
    try:
        block_dict = {page_num: page.get_text('dict')['blocks']
                      for page_num, page in enumerate(doc, start=1)}
    finally:
        doc.close()
    images = pdf2image.convert_from_path(pdf_path.name)

    for page_num, blocks in block_dict.items():
        bboxes, words = [], []  # bounding boxes and texts of this page
        image = images[page_num-1]
        for block in blocks:
            if block['type'] != 0:  # only these blocks are read for 'lines'
                continue
            for line in block['lines']:
                for span in line['spans']:
                    xmin, ymin, xmax, ymax = [int(i)*DPI for i in list(span['bbox'])]
                    text = span['text'].strip()
                    if text.replace(" ","") != "":
                        bboxes.append(normalize_bbox([xmin, ymin, xmax, ymax], image.size))
                        words.append(decontracted(text))
        if not words:
            continue  # page without text: nothing to classify

        # Predictions are mapped back to their text through the box.
        text_reverse = {str(box): word for box, word in zip(bboxes, words)}
        fake_label = ["O"] * len(words)
        encoding = processor(image, words, boxes=bboxes, word_labels=fake_label, truncation=True, stride=stride,
                             padding="max_length", max_length=512, return_overflowing_tokens=True, return_offsets_mapping=True)
        labels = encoding["labels"]
        key_box = encoding["bbox"]
        encoding.pop('offset_mapping')
        encoding.pop('overflow_to_sample_mapping')
        model_inputs = {k: torch.tensor(v).to(device) for k, v in encoding.items() if k != "labels"}

        # forward to model
        with torch.no_grad():
            outputs = model(**model_inputs)

        # Keep the chunk dimension (no squeeze) so that single-chunk and
        # multi-chunk pages line up with ``labels`` / ``key_box`` alike.
        predictions = outputs["logits"].argmax(-1).tolist()
        flat_preds, flat_boxes, flat_labels = [], [], []
        for i, (chunk_preds, chunk_boxes, chunk_labels) in enumerate(zip(predictions, key_box, labels)):
            # Later chunks repeat the tail of the previous one; skip it.
            start = stride if i > 0 else 0
            flat_preds.extend(chunk_preds[start:])
            flat_boxes.extend(chunk_boxes[start:])
            flat_labels.extend(chunk_labels[start:])

        for pred, box, label in zip(flat_preds, flat_boxes, flat_labels):
            if label == -100:  # position without a word-level label
                continue
            field = id2label[pred]
            if field in key_list:
                result[field].append(text_reverse[str(box)])

    result = {k: list(set(v)) for k, v in result.items()}
    print("\n--------------------------------------------------------------------------------------------------------------------------\n")
    print("New resume inference at: ", datetime.utcnow())
    print("Pdf name: ", pdf_path.name)
    print("Result: ", result)
    print("\n--------------------------------------------------------------------------------------------------------------------------\n")
    return result
155
def norm(result: dict) -> str:
    """Normalise extracted resume fields and format them as a report.

    Args:
        result: the field dictionary in its literal text form; it is parsed
            with ``ast.literal_eval`` before use.

    Returns:
        A multi-line string with one labelled line per field.
    """
    fields = ast.literal_eval(result)

    name_words = " ".join(fields["person_name"]).split()
    fields["person_name"] = " ".join([parse_string(word).capitalize() for word in name_words])
    fields["email_value"] = parse_email(fields["email_value"])
    phone_chars = "".join(fields["phonenumber_value"])
    fields["phonenumber_value"] = "".join([ch for ch in phone_chars if ch.isdigit()])

    # Remaining rule-based parsers, applied in this fixed order.
    field_parsers = (
        ("address_value", parse_address),
        ("designation_value", parse_designation),
        ("experience_time", parse_time),
        ("gender_value", parse_gender),
        ("skill_value", parse_skill),
        ("education_name", parse_designation),
        ("experience_name", parse_designation),
    )
    for key, parse_fn in field_parsers:
        fields[key] = parse_fn(fields[key])

    # Whatever is still a list or a number becomes plain text.
    for key in fields:
        value = fields[key]
        if isinstance(value, list):
            fields[key] = ". ".join(value)
        elif isinstance(value, (int, float)):
            fields[key] = str(value)

    layout = (
        ("Tên: ", "person_name"),
        ("Ngày sinh: ", "dob_value"),
        ("Giới tính: ", "gender_value"),
        ("Chức danh: ", "designation_value"),
        ("Số điện thoại: ", "phonenumber_value"),
        ("Email: ", "email_value"),
        ("Địa chỉ: ", "address_value"),
        ("Tên công ty/công việc: ", "experience_name"),
        ("Tên trường học: ", "education_name"),
        ("Kỹ năng: ", "skill_value"),
        ("Năm kinh nghiệm: ", "experience_time"),
    )
    return "\n".join([label + fields[key] for label, key in layout])
173
+
174
+
175
+ with gr.Blocks() as demo:
176
+ gr.Markdown("DEMO PROJECTS: REVIEW ANALYSIS AND EXTRACT INFOMATION FROM RESUME")
177
+ with gr.Tab("Review analysis"):
178
+ text_input = gr.Textbox(label="Input sentence (ex: Sếp tốt, bảo hiểm đóng full lương bảo hiểm cho nhân viên. Hàng năm tăng lương ổn OT không trả thêm tiền, chỉ cho ngày nghỉ và hỗ trợ ăn tối.):", placeholder="input here...")
179
+ text_output = gr.Textbox(label="Result:")
180
+ text_button = gr.Button("Predict")
181
+ with gr.Tab("Extract infomation from resume"):
182
+ with gr.Column():
183
+ file_input = gr.File(label="Upload pdf", file_types=[".pdf"])
184
+ with gr.Column():
185
+ cv_output = gr.Textbox(label="Information fields")
186
+ resume_button = gr.Button("Extract")
187
+ with gr.Column():
188
+ normalize_output = gr.Textbox(label="Normalize by rule-based:")
189
+ normalize_button = gr.Button("Normailze")
190
+
191
+ # with gr.Accordion("Open for More!"):
192
+ # gr.Markdown("Look at me...")
193
+
194
+ text_button.click(sentiment, inputs=text_input, outputs=text_output)
195
+ resume_button.click(pred_resume, inputs=file_input, outputs=cv_output)
196
+ normalize_button.click(norm, inputs=cv_output, outputs=normalize_output)
197
+
198
+ demo.launch()
gitattributes ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ vncorenlp_segmenter/VnCoreNLP-1.1.1.jar filter=lfs diff=lfs merge=lfs -text
imports.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pandas as pd
3
+ import seaborn as sns
4
+ from typing import Optional, List, Tuple, Any
5
+ from collections import OrderedDict
6
+ import os, ast, re, string, torch, transformers, datasets, chardet, gdown
7
+ from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
8
+ from torch.utils.data import Dataset, DataLoader
9
+ from sklearn.model_selection import train_test_split
10
+ from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, Trainer, TrainingArguments, logging, RobertaForTokenClassification, RobertaConfig, AutoConfig
11
+ from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
12
+ from torchcrf import CRF
13
+ from accelerate import Accelerator
14
+ import torch.nn as nn
15
+ import torch.nn.functional as F
16
+ import underthesea
17
+ from utils import *
18
+ from all_datasets import *
19
+ from model import *
20
+
21
+ from huggingface_hub import login
22
+ import PIL, fitz, pdf2image, re, unicodedata
23
+ from transformers import AutoProcessor, LayoutLMv3ForTokenClassification
24
+ from unidecode import unidecode
25
+
26
+ from pathlib import Path
27
+ from nltk import everygrams
28
+ from collections import Counter
29
+ from typing import List, Optional
30
+ from datetime import datetime
31
+ from dateutil import parser, relativedelta
model.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from imports import *
2
+ from all_datasets import *
3
+
4
class PhoBertLstmCrf(RobertaForTokenClassification):
    """RoBERTa token classifier extended with a BiLSTM and a CRF layer."""

    def __init__(self, config):
        super().__init__(config=config)
        self.num_labels = config.num_labels
        # Bidirectional, so both directions together give hidden_size features.
        self.lstm = nn.LSTM(
            input_size=config.hidden_size,
            hidden_size=config.hidden_size // 2,
            num_layers=1,
            batch_first=True,
            bidirectional=True,
        )
        self.crf = CRF(config.num_labels, batch_first=True)

    @staticmethod
    def sort_batch(src_tensor, lengths):
        """Order a batch by sequence length, longest first.

        Returns the reordered tensor, the sorted lengths and the index that
        restores the original order, so the result can be fed to
        ``pack_padded_sequence``.
        """
        sorted_lengths, order = lengths.sort(0, descending=True)
        _, restore_idx = order.sort(0, descending=False)
        return src_tensor[order], sorted_lengths, restore_idx

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, valid_ids=None,
                label_masks=None):
        """Decode the tag sequence and, when ``labels`` are given, the loss.

        Returns a ``NerOutput`` with ``tags`` (CRF decoding).  With
        ``labels`` it also holds ``loss`` (the negated CRF log-likelihood)
        and ``cls_metrics`` (the same decoded tags).
        """
        hidden = self.roberta(
            input_ids=input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
            head_mask=None,
        )[0]

        batch_size, max_len, _ = hidden.shape
        lengths = torch.sum(label_masks, dim=-1)
        # Per row, pick the hidden states at the positions listed in valid_ids.
        rows = torch.arange(0, batch_size, dtype=torch.long, device=hidden.device).unsqueeze(1)
        word_states = hidden[rows, valid_ids]

        sorted_states, sorted_lengths, restore_idx = self.sort_batch(src_tensor=word_states, lengths=lengths)
        packed = pack_padded_sequence(sorted_states, sorted_lengths.cpu(), True)
        lstm_out, _ = self.lstm(packed)
        lstm_out, _ = pad_packed_sequence(lstm_out, batch_first=True, total_length=max_len)
        word_states = lstm_out[restore_idx]  # back to the original batch order

        logits = self.classifier(self.dropout(word_states))
        seq_tags = self.crf.decode(logits, mask=label_masks != 0)

        if labels is None:
            return NerOutput(tags=seq_tags)
        log_likelihood = self.crf(logits, labels, mask=label_masks.type(torch.uint8))
        return NerOutput(loss=-1.0 * log_likelihood, tags=seq_tags, cls_metrics=seq_tags)
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ poppler-utils
parse_info.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from imports import *
2
+
3
punc = list(string.punctuation)
def parse_string(inp: str, rep=" ", punc=punc, excp=()) -> str:
    """Lower-case ``inp``, expand contractions and replace punctuation.

    Args:
        inp: text to clean.
        rep: replacement for every punctuation character.
        punc: punctuation characters to replace (defaults to
            ``string.punctuation``).  The sequence is never modified.
        excp: punctuation characters to keep; a list, tuple or string of
            characters.  Entries that are not in ``punc`` are ignored.

    Returns:
        The cleaned text with runs of whitespace collapsed to one space and
        no leading or trailing whitespace.
    """
    # Work on a filtered copy: removing entries from ``punc`` itself would
    # change the shared default for every later call.
    kept = set(excp)
    active_punc = [mark for mark in punc if mark not in kept]

    inp = inp.lower()
    contractions = (
        (r"won\'t", "will not"),
        (r"can\'t", "can not"),
        (r"n\'t", " not"),  # "don't" -> "do not" (not "don not")
        (r"\'re", " are"),
        (r"\'s", " of"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, expansion in contractions:
        inp = re.sub(pattern, expansion, inp)
    for mark in active_punc:
        inp = inp.replace(mark, rep)
    return " ".join(inp.split())
23
+
24
def parse_time(inp: List) -> str:
    """Estimate the total experience, in years, from free-text date ranges.

    Each entry is expected to look like ``"<start> - <end>"``.  Range words
    ("đến", " to ", "–") are turned into a dash and open-end words ("now",
    "present", ...) into today's UTC date.  Entries that do not split into
    exactly two parts on "-" or cannot be parsed as dates are skipped.

    Args:
        inp: list of raw range strings; the list itself is not modified.

    Returns:
        ``"<years> năm"`` with one decimal, where the years are the absolute
        value of the summed day differences divided by 365.
    """
    # Year-first so that dateutil cannot swap day and month when this
    # inserted date is parsed back below.
    now = datetime.utcnow().strftime("%Y/%m/%d")
    range_words = ["đến", " to ", "–"]  # words separating the two time points
    open_end_words = ["now", "hiện tại", " nay", " đến nay", "present"]  # end time point

    duration = 0
    for raw in inp:
        text = raw.lower()
        for word in range_words:
            text = text.replace(word, " - ")
        for word in open_end_words:
            text = text.replace(word, now)
        # Drop every remaining letter; only digits and separators matter.
        text = "".join(ch for ch in text if not ch.isalpha())
        text = parse_string(text, rep="", excp=["/", "-"])

        time_point = text.split("-")  # split into the two time points
        if len(time_point) != 2:
            continue
        try:
            start = parser.parse(time_point[0]).date()
            end = parser.parse(time_point[1]).date()
        except (ValueError, OverflowError):
            # Not a recognisable date: ignore this range.
            continue
        duration += (end - start).days
    return "{:.1f} năm".format(abs(duration/365))
50
+
51
+ filename = "./skills.csv"
52
+ detected = chardet.detect(Path(filename).read_bytes()) # "ISO-8859-1"
53
+ skill_list = pd.read_csv(filename, encoding=detected["encoding"])
54
+ skill_list = [i.replace("\n","") for i in skill_list["Skill"].to_list()]
55
def parse_skill(inp: List) -> str:
    """Return the known skills mentioned in the extracted skill texts.

    The texts (plus their comma-separated parts) are cleaned with
    ``parse_string`` and joined; every 1- to 3-word n-gram of the result is
    looked up in ``skill_list``.

    Args:
        inp: list of raw skill strings; the list itself is not modified.

    Returns:
        The matched skills, capitalised and joined with ``". "``, in order
        of first appearance and without duplicates.
    """
    pieces = list(inp)
    for item in inp:
        if "," in item:
            pieces.extend(part.strip() for part in item.split(","))
    cleaned = [parse_string(piece) for piece in pieces]
    tokens = " ".join(cleaned).split()

    known_skills = set(skill_list)  # one pass to build, O(1) per lookup
    found = []
    seen = set()
    for gram in map(' '.join, everygrams(tokens, 1, 3)):
        if gram in known_skills and gram not in seen:
            seen.add(gram)
            found.append(gram)
    return ". ".join([skill.capitalize() for skill in found])
66
+
67
def parse_gender(inp: List) -> str:
    """Return the first known gender term contained in the cleaned input.

    The entries are cleaned with ``parse_string`` and joined with spaces;
    the candidate terms are tried in their listed order using a substring
    test.  An empty string is returned when none of them occurs.
    """
    text = " ".join([parse_string(part) for part in inp])
    candidates = ("nam", "nữ", "female", "male", "bisexual", "asexual", "heterosexual", "homosexual", "lgbt")
    return next((term for term in candidates if term in text), "")
74
+
75
def parse_address(inp: List) -> str:
    """Clean each address fragment (commas kept) and title-case its words.

    Returns the fragments joined with ``". "``.
    """
    cleaned = [parse_string(part, excp=",") for part in inp]
    fragments = []
    for fragment in cleaned:
        fragments.append(" ".join([word.capitalize() for word in fragment.split()]))
    return ". ".join(fragments)
80
+
81
def parse_designation(inp: List) -> str:
    """Clean, de-duplicate and title-case the given titles.

    Returns the distinct cleaned titles joined with ``". "``; their order
    follows set iteration and is therefore not defined.
    """
    distinct = set([parse_string(part) for part in inp])
    titled = []
    for title in distinct:
        titled.append(" ".join([word.capitalize() for word in title.split()]))
    return ". ".join(titled)
86
+
87
def parse_email(inp: List) -> str:
    """Strip punctuation other than "@" and "." from each entry.

    Returns the distinct cleaned entries joined with a space; their order
    follows set iteration and is therefore not defined.
    """
    keep = ["@", "."]
    distinct = set([parse_string(part, rep="", excp=keep) for part in inp])
    return " ".join(distinct)
90
+
91
+ def decontracted(phrase) -> str:
92
+ phrase = re.sub(r"â€|™|“|”|;|ü|\xad|\xa0|\u200b|·|∙|�|●|�|§|•|!|▪|©|\?|\]|\[|\)|\(", "", phrase)
93
+ phrase = phrase.strip()
94
+ phrase = unicodedata.normalize("NFC", phrase)
95
+ if " " in phrase or " " in phrase: # check space character
96
+ phrase = phrase.replace(" ","_").replace(" ","_").replace(" ","").replace("_"," ")
97
+ tmp = phrase.split(" ")
98
+ check_parse = True
99
+ for i in tmp:
100
+ if len(i) > 1:
101
+ check_parse = False
102
+ break
103
+ if check_parse:
104
+ phrase = phrase.replace(" ","")
105
+ # phrase = phrase.replace(" "," ").replace(" "," ")
106
+ return phrase.replace("\n"," ")
107
+
108
def normalize_bbox(bbox, size):  # must normalize bbox to [0;1000]
    """Scale a box to thousandths of the given size.

    Args:
        bbox: indexable with four coordinates; items 0 and 2 are divided by
            ``size[0]``, items 1 and 3 by ``size[1]``.
        size: indexable with the two reference lengths.

    Returns:
        List of four ints, each ``int(1000 * coordinate / length)``.
    """
    width, height = size[0], size[1]
    left, top, right, bottom = bbox[0], bbox[1], bbox[2], bbox[3]
    return [
        int(1000 * left / width),
        int(1000 * top / height),
        int(1000 * right / width),
        int(1000 * bottom / height),
    ]
requirements.txt ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ torch
2
+ transformers
3
+ huggingface_hub
4
+ gdown
5
+ pymupdf
6
+ unidecode
7
+ pdf2image
8
+ chardet
9
+ python-dateutil
10
+ datasets
11
+ underthesea
12
+ accelerate
13
+ pytorch-crf==0.7.2
14
+ sklearn-crfsuite
15
+ scikit-learn
16
+ numpy
17
+ pandas
18
+ install-jdk
19
+ seaborn
skills.csv ADDED
The diff for this file is too large to render. See raw diff
 
utils.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from imports import *
2
+ import unicodedata
3
+ dict_map = {
4
+ "òa": "oà",
5
+ "Òa": "Oà",
6
+ "ÒA": "OÀ",
7
+ "óa": "oá",
8
+ "Óa": "Oá",
9
+ "ÓA": "OÁ",
10
+ "ỏa": "oả",
11
+ "Ỏa": "Oả",
12
+ "ỎA": "OẢ",
13
+ "õa": "oã",
14
+ "Õa": "Oã",
15
+ "ÕA": "OÃ",
16
+ "ọa": "oạ",
17
+ "Ọa": "Oạ",
18
+ "ỌA": "OẠ",
19
+ "òe": "oè",
20
+ "Òe": "Oè",
21
+ "ÒE": "OÈ",
22
+ "óe": "oé",
23
+ "Óe": "Oé",
24
+ "ÓE": "OÉ",
25
+ "ỏe": "oẻ",
26
+ "Ỏe": "Oẻ",
27
+ "ỎE": "OẺ",
28
+ "õe": "oẽ",
29
+ "Õe": "Oẽ",
30
+ "ÕE": "OẼ",
31
+ "ọe": "oẹ",
32
+ "Ọe": "Oẹ",
33
+ "ỌE": "OẸ",
34
+ "ùy": "uỳ",
35
+ "Ùy": "Uỳ",
36
+ "ÙY": "UỲ",
37
+ "úy": "uý",
38
+ "Úy": "Uý",
39
+ "ÚY": "UÝ",
40
+ "ủy": "uỷ",
41
+ "Ủy": "Uỷ",
42
+ "ỦY": "UỶ",
43
+ "ũy": "uỹ",
44
+ "Ũy": "Uỹ",
45
+ "ŨY": "UỸ",
46
+ "ụy": "uỵ",
47
+ "Ụy": "Uỵ",
48
+ "ỤY": "UỴ",
49
+ }
50
+
51
+ ### Normalize functions ###
52
def replace_all(text, dict_map=dict_map):
    """Apply every ``old -> new`` replacement of ``dict_map`` to ``text``.

    Before each replacement the current text is converted to ``str`` and
    NFC-normalised.  With an empty ``dict_map`` the input is returned as is.
    """
    for old, new in dict_map.items():
        composed = unicodedata.normalize('NFC', str(text))
        text = composed.replace(old, new)
    return text
56
def normalize(text, segment=True):
    """Apply the ``dict_map`` replacements and optionally word-segment.

    Args:
        text: input text.
        segment: when true, the text is split on ".", each part is
            word-tokenized with ``underthesea`` (text format) and the parts
            are joined again with ". ".

    Returns:
        The normalised (and optionally segmented) text.
    """
    text = replace_all(text, dict_map)
    if not segment:
        return text
    parts = text.split(".")
    return ". ".join([underthesea.word_tokenize(part, format="text") for part in parts])
62
+ def text_preprocess(document):
63
+ punc = [i for i in ["\"", "-", ".", ":"]]#string.punctuation.replace(",","")]
64
+ stopword = [" thì ", " được ", " có ", " là "]
65
+ acronyms = {" wfh": " làm việc tại nhà ", " ot": " làm tăng ca ", " team": " nhóm ", " pm": " quản lý dự án ", " flexible": " linh động ",
66
+ " office": " văn phòng ", " feedback": " phản hồi ", " cty": " công ty ", " hr": " tuyển dụng ", " effective": " hiệu quả ",
67
+ " suggest": " gợi ý ", " hong": " không ", " ko": " không ", " vp": " văn phòng ", " plan ": " kế hoạch ", " planning": " lên kế hoạch ",
68
+ " family": " gia đình ", " leaders": " trưởng nhóm ", " leader": " trưởng nhóm ", ",": " , "}
69
+
70
+ document = re.sub(r"\n"," . ", document)
71
+ document = re.sub(r"\t"," ", document)
72
+ document = re.sub(r"\r","", document)
73
+ for p in punc:
74
+ document = document.replace(p," ")
75
+ for acr in acronyms:
76
+ tmp = [acr, acr.upper(), acr[0].upper()+acr[1:]]
77
+ for j in tmp:
78
+ document = re.sub(j, acronyms[acr], document)
79
+ #document = re.sub(j, acr.upper(), document)
80
+ for sw in stopword:
81
+ document = re.sub(sw, " ", document)
82
+
83
+ document = re.sub(" ", " ", document)
84
+ document = re.sub(" ", " ", document)
85
+ try:
86
+ document = document.split(".")
87
+ document = ". ".join([underthesea.word_tokenize(i, format="text") for i in document])
88
+ except:
89
+ pass
90
+ return document.lower()
91
+
92
+ ### Compute metrics for multiclass classification problem
93
def compute_metrics(pred):
    """Compute accuracy and weighted F1 for a classification prediction.

    Args:
        pred: object exposing ``label_ids`` (true labels) and
            ``predictions`` (per-class scores; the arg-max over the last
            axis is taken as the predicted class).

    Returns:
        ``{"accuracy": ..., "f1": ...}``.
    """
    # Imported here because neither function is imported by the visible
    # project modules, so calling them would otherwise fail on lookup.
    from sklearn.metrics import accuracy_score, f1_score

    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}
99
+
100
+ ### Make multilabel result from Ner result
101
+ # mb and cls_class just a dictionary map id to class name, see train.py
102
def convert2cls(data, mb, cls_class):
    """Turn NER tag ids into a multi-label vector.

    The distinct ids of ``data`` are taken, id 20 is dropped, ids of 10 or
    more are shifted down by 10 and each id is mapped to its class name.

    Args:
        data: iterable of hashable integer tag ids.
        mb: object whose ``transform([names])`` returns one encoded row per
            input row.
        cls_class: mapping from class id to class name.

    Returns:
        The first row of ``mb.transform`` for the class names, as a list.
    """
    unique_ids = set(data)
    unique_ids.discard(20)  # no error when 20 is absent
    names = [cls_class[tag_id - 10 if tag_id >= 10 else tag_id] for tag_id in unique_ids]
    return list(mb.transform([names])[0])