import re
from itertools import cycle

import torch
from transformers import AutoModelForTokenClassification, RobertaTokenizerFast

# add_prefix_space=True lets RobertaTokenizerFast accept pre-split words
# (is_split_into_words=True), which chunked_inference below relies on.
tokenizer = RobertaTokenizerFast.from_pretrained(
    "mrfirdauss/robert-base-finetuned-cv", add_prefix_space=True
)
model = AutoModelForTokenClassification.from_pretrained("mrfirdauss/robert-base-finetuned-cv")
model.eval()  # inference only: disable dropout

# Label map for the token-classification head (BIO scheme).
id2label = {
    0: 'O',
    1: 'B-NAME', 2: 'I-NAME',
    3: 'B-NATION', 4: 'I-NATION',
    5: 'B-EMAIL', 6: 'I-EMAIL',
    7: 'B-URL', 8: 'I-URL',
    9: 'B-CAMPUS', 10: 'I-CAMPUS',
    11: 'B-MAJOR', 12: 'I-MAJOR',
    13: 'B-COMPANY', 14: 'I-COMPANY',
    15: 'B-DESIGNATION', 16: 'I-DESIGNATION',
    17: 'B-GPA', 18: 'I-GPA',
    19: 'B-PHONE NUMBER', 20: 'I-PHONE NUMBER',
    21: 'B-ACHIEVEMENT', 22: 'I-ACHIEVEMENT',
    23: 'B-EXPERIENCES DESC', 24: 'I-EXPERIENCES DESC',
    25: 'B-SKILLS', 26: 'I-SKILLS',
    27: 'B-PROJECTS', 28: 'I-PROJECTS',
}


def merge_subwords(tokens, labels):
    """Merge RoBERTa BPE pieces back into whole words, keeping one label per word."""
    merged_tokens = []
    merged_labels = []

    current_token = ""
    current_label = ""

    for token, label in zip(tokens, labels):
        if token.startswith("Ġ"):
            # "Ġ" marks the start of a new word: flush the word built so far.
            if current_token:
                merged_tokens.append(current_token)
                merged_labels.append(current_label)
            current_token = token[1:]
            current_label = label
        else:
            # Continuation piece (or a leading token without "Ġ"): extend the word.
            if not current_token:
                current_label = label
            current_token += token

    if current_token:
        merged_tokens.append(current_token)
        merged_labels.append(current_label)

    return merged_tokens, merged_labels
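
# Illustration (hypothetical token/label sequence): ["ĠJohn", "ĠSm", "ith"] with
# labels ["B-NAME", "I-NAME", "I-NAME"] merges to (["John", "Smith"], ["B-NAME", "I-NAME"]).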


def chunked_inference(text, tokenizer, model, max_length=512):
    """Run the NER model over arbitrarily long text by splitting it into
    chunks of at most max_length tokens (leaving room for the special tokens)."""
    # Rough word/punctuation split so the tokenizer can align subwords to words.
    words = re.findall(r'\w+|[^\w\s]', text, re.UNICODE)
    tokens = tokenizer.tokenize(words, is_split_into_words=True)

    # Build chunks of at most max_length - 2 subword tokens, then add <s> and </s>.
    input_ids_chunks = []
    for i in range(0, len(tokens), max_length - 2):
        chunk = tokens[i:i + max_length - 2]
        chunk_ids = tokenizer.convert_tokens_to_ids(chunk)
        chunk_ids = tokenizer.build_inputs_with_special_tokens(chunk_ids)
        input_ids_chunks.append(chunk_ids)

    input_ids_chunks = [torch.tensor(chunk_ids).unsqueeze(0) for chunk_ids in input_ids_chunks]

    predictions = []
    with torch.no_grad():
        for input_ids in input_ids_chunks:
            attention_mask = torch.ones_like(input_ids)
            output = model(input_ids, attention_mask=attention_mask)
            logits = output[0] if isinstance(output, tuple) else output.logits
            predictions_chunk = torch.argmax(logits, dim=-1).squeeze(0)
            # Drop the predictions for the special tokens at both ends.
            predictions.append(predictions_chunk[1:-1])

    # Empty input: nothing to predict.
    if not predictions:
        return [], []

    predictions = torch.cat(predictions, dim=0)
    predicted_labels = [id2label[pred.item()] for pred in predictions]
    return merge_subwords(tokens, predicted_labels)
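
# chunked_inference returns parallel lists of merged words and predicted labels,
# e.g. (hypothetical output): (["John", "Smith", "john@mail.com"],
#                              ["B-NAME", "I-NAME", "B-EMAIL"]).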


def process_tokens(tokens, tag_prefix):
    """Group (token, tag) pairs into entities whose tags end with tag_prefix,
    following the BIO scheme (a B- tag starts an entity, I- tags continue it)."""
    entities = []
    current_entity = {}
    for token, tag in tokens:
        if tag.startswith('B-') and tag.endswith(tag_prefix):
            # A new entity starts: flush the previous one, if any.
            if current_entity:
                entities.append(current_entity)
                current_entity = {}
            current_entity['text'] = token
            current_entity['type'] = tag
        elif tag.startswith('I-') and tag.endswith(tag_prefix) and current_entity:
            # GPA and URL fragments are glued back together without spaces.
            separator = '' if tag_prefix in ('GPA', 'URL') else ' '
            current_entity['text'] += separator + token

    if current_entity:
        entities.append(current_entity)
    return entities
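
# Example (hypothetical input): process_tokens([("Google", "B-COMPANY"),
# ("Indonesia", "I-COMPANY")], "COMPANY") yields
# [{"text": "Google Indonesia", "type": "B-COMPANY"}].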


def predict(text):
    """Turn raw CV text into a structured profile dict."""
    tokens, predictions = chunked_inference(text, tokenizer, model)
    data = list(zip(tokens, predictions))
    profile = {
        "name": "",
        "links": [],
        "skills": [],
        "experiences": [],
        "educations": []
    }
    profile['name'] = ' '.join([t for t, p in data if p.endswith('NAME')])

    for skill in process_tokens(data, 'SKILLS'):
        profile['skills'].append(skill['text'])

    for link in process_tokens(data, 'URL'):
        profile['links'].append(link['text'])

    # Pair designations, companies and experience descriptions. The shorter lists
    # are cycled so every item of the longest list gets a (possibly repeated) match.
    exp = process_tokens(data, 'EXPERIENCES DESC')
    designations = process_tokens(data, 'DESIGNATION')
    companies = process_tokens(data, 'COMPANY')
    if len(exp) >= len(designations) and len(exp) >= len(companies):
        workzip = zip(cycle(designations), cycle(companies), exp)
    elif len(designations) >= len(companies):
        workzip = zip(designations, cycle(companies), cycle(exp))
    else:
        workzip = zip(cycle(designations), companies, cycle(exp))
    for designation, company, experience_desc in workzip:
        profile['experiences'].append({
            "start": None,
            "end": None,
            "designation": designation['text'],
            "company": company['text'],
            "experience_description": experience_desc['text']
        })

    # Educations are paired positionally; unmatched majors/GPAs/campuses are dropped.
    for major, gpa, campus in zip(process_tokens(data, 'MAJOR'), process_tokens(data, 'GPA'), process_tokens(data, 'CAMPUS')):
        profile['educations'].append({
            "start": None,
            "end": None,
            "major": major['text'],
            "campus": campus['text'],
            "GPA": gpa['text']
        })

    return profile
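

# Minimal usage sketch. The CV snippet below is hypothetical; any text containing
# names, links, skills, job titles, companies and education details should work.
if __name__ == "__main__":
    sample_cv = (
        "John Smith. john.smith@mail.com. https://github.com/johnsmith. "
        "Software Engineer at Acme Corp. Built data pipelines in Python. "
        "B.Sc. Computer Science, State University, GPA 3.8. Skills: Python, SQL."
    )
    profile = predict(sample_cv)
    print(profile["name"])
    print(profile["skills"])
    print(profile["experiences"])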