from transformers import RobertaTokenizerFast, AutoModelForTokenClassification
import re
import torch
from itertools import cycle

# Load the fine-tuned RoBERTa token-classification checkpoint and its tokenizer
tokenizer = RobertaTokenizerFast.from_pretrained("mrfirdauss/robert-base-finetuned-cv")
model = AutoModelForTokenClassification.from_pretrained("mrfirdauss/robert-base-finetuned-cv")
model.eval()  # inference only: disable dropout
# Map model output ids to BIO entity labels
id2label = {
    0: 'O',
    1: 'B-NAME',              2: 'I-NAME',
    3: 'B-NATION',            4: 'I-NATION',
    5: 'B-EMAIL',             6: 'I-EMAIL',
    7: 'B-URL',               8: 'I-URL',
    9: 'B-CAMPUS',            10: 'I-CAMPUS',
    11: 'B-MAJOR',            12: 'I-MAJOR',
    13: 'B-COMPANY',          14: 'I-COMPANY',
    15: 'B-DESIGNATION',      16: 'I-DESIGNATION',
    17: 'B-GPA',              18: 'I-GPA',
    19: 'B-PHONE NUMBER',     20: 'I-PHONE NUMBER',
    21: 'B-ACHIEVEMENT',      22: 'I-ACHIEVEMENT',
    23: 'B-EXPERIENCES DESC', 24: 'I-EXPERIENCES DESC',
    25: 'B-SKILLS',           26: 'I-SKILLS',
    27: 'B-PROJECTS',         28: 'I-PROJECTS',
}

def merge_subwords(tokens, labels):
    """Collapse BPE subword pieces back into whole words, keeping each word's first-piece label."""
    merged_tokens = []
    merged_labels = []
    current_token = ""
    current_label = ""
    for token, label in zip(tokens, labels):
        if token.startswith("Ġ"):
            # 'Ġ' marks the start of a new word in the RoBERTa BPE vocabulary
            if current_token:
                # Flush the accumulated word and its label
                merged_tokens.append(current_token)
                merged_labels.append(current_label)
            # Start a new word; drop the leading 'Ġ'
            current_token = token[1:]
            current_label = label
        else:
            # Continuation piece (or the very first word, which carries no 'Ġ')
            current_token += token
            if not current_label:
                # Take the label of the first word here, since it never enters the 'Ġ' branch
                current_label = label
    # Flush the final word
    if current_token:
        merged_tokens.append(current_token)
        merged_labels.append(current_label)
    return merged_tokens, merged_labels
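
# Illustrative example (derived from the logic above): the subword sequence
#   ["John", "ĠD", "oe"] with labels ["B-NAME", "I-NAME", "I-NAME"]
# merges into (["John", "Doe"], ["B-NAME", "I-NAME"]); each merged word keeps
# the label of its first subword piece.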

def chunked_inference(text, tokenizer, model, max_length=512):
    """Run token classification over text of arbitrary length by splitting it into model-sized chunks."""
    # Pre-split the text into words/punctuation, then tokenize the full list without truncation
    words = re.findall(r'\w+|[^\w\s]', text, re.UNICODE)
    tokens = tokenizer.tokenize(words, is_split_into_words=True)
    # Build chunks of tokens that fit within the model's maximum input size
    input_ids_chunks = []
    for i in range(0, len(tokens), max_length - 2):  # -2 leaves room for the special tokens at both ends
        chunk = tokens[i:i + max_length - 2]
        # Encode the chunk and add the special tokens via the tokenizer
        chunk_ids = tokenizer.convert_tokens_to_ids(chunk)
        chunk_ids = tokenizer.build_inputs_with_special_tokens(chunk_ids)
        input_ids_chunks.append(chunk_ids)
    # Convert each list of token ids into a batch-of-one tensor
    input_ids_chunks = [torch.tensor(chunk_ids).unsqueeze(0) for chunk_ids in input_ids_chunks]
    # Run the model on each chunk and collect per-token predictions
    predictions = []
    for input_ids in input_ids_chunks:
        attention_mask = torch.ones_like(input_ids)  # every position is a real token
        with torch.no_grad():
            output = model(input_ids, attention_mask=attention_mask)
        logits = output[0] if isinstance(output, tuple) else output.logits
        predictions_chunk = torch.argmax(logits, dim=-1).squeeze(0)
        # Drop the predictions for the special tokens at both ends
        predictions.append(predictions_chunk[1:-1])
    # Flatten the per-chunk predictions into one tensor and map ids to labels
    predictions = torch.cat(predictions, dim=0)
    predicted_labels = [id2label[pred.item()] for pred in predictions]
    return merge_subwords(tokens, predicted_labels)

def process_tokens(tokens, tag_prefix):
    """Group (token, tag) pairs into entities of the given type, following the B-/I- scheme."""
    entities = []
    current_entity = {}
    for token, tag in tokens:
        if tag.startswith('B-') and tag.endswith(tag_prefix):
            # A 'B-' tag starts a new entity; flush the previous one first
            if current_entity:
                entities.append(current_entity)
                current_entity = {}
            current_entity['text'] = token
            current_entity['type'] = tag
        elif tag.startswith('I-') and tag.endswith(tag_prefix) and current_entity:
            if tag_prefix in ('GPA', 'URL'):
                # GPA and URL continuations are joined without spaces
                current_entity['text'] += token
            else:
                # Other entity types are joined with spaces
                current_entity['text'] += ' ' + token
    # Flush the last entity if there is one
    if current_entity:
        entities.append(current_entity)
    return entities
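
# Illustrative example (derived from the logic above): with tag_prefix='CAMPUS',
#   [("Universitas", "B-CAMPUS"), ("Indonesia", "I-CAMPUS")]
# yields [{'text': 'Universitas Indonesia', 'type': 'B-CAMPUS'}], while with
# tag_prefix='GPA' the pieces [("3", "B-GPA"), (".", "I-GPA"), ("75", "I-GPA")]
# are joined without spaces into [{'text': '3.75', 'type': 'B-GPA'}].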

def predict(text):
    """Run NER over a resume text and assemble the extracted entities into a profile dict."""
    tokens, predictions = chunked_inference(text, tokenizer, model)
    data = list(zip(tokens, predictions))
    profile = {
        "name": "",
        "links": [],
        "skills": [],
        "experiences": [],
        "educations": []
    }
    # Name: concatenate every token tagged as part of a NAME entity
    profile['name'] = ' '.join([t for t, p in data if p.endswith('NAME')])
    # Skills
    for skills in process_tokens(data, 'SKILLS'):
        profile['skills'].append(skills['text'])
    # Links
    for links in process_tokens(data, 'URL'):
        profile['links'].append(links['text'])
    # Experiences: pair designations, companies and descriptions, cycling the
    # shorter lists so that none of the extracted entries are dropped
    exp = process_tokens(data, 'EXPERIENCES DESC')
    designation = process_tokens(data, 'DESIGNATION')
    comp = process_tokens(data, 'COMPANY')
    if len(exp) >= len(designation) and len(exp) >= len(comp):
        workzip = zip(cycle(designation), cycle(comp), exp)
    elif len(designation) >= len(comp):
        workzip = zip(designation, cycle(comp), cycle(exp))
    else:
        workzip = zip(cycle(designation), comp, cycle(exp))
    for designation, company, experience_desc in workzip:
        profile['experiences'].append({
            "start": None,
            "end": None,
            "designation": designation['text'],
            "company": company['text'],
            "experience_description": experience_desc['text']
        })
    # Education: pair majors, GPAs and campuses positionally
    for major, gpa, campus in zip(process_tokens(data, 'MAJOR'), process_tokens(data, 'GPA'), process_tokens(data, 'CAMPUS')):
        profile['educations'].append({
            "start": None,
            "end": None,
            "major": major['text'],
            "campus": campus['text'],
            "GPA": gpa['text']
        })
    return profile
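
# A minimal, hypothetical usage sketch: the resume snippet below is invented for
# illustration only. Running it assumes the checkpoint referenced above can be
# downloaded from the Hugging Face Hub.
if __name__ == "__main__":
    sample_resume = (
        "John Doe john.doe@example.com https://github.com/johndoe "
        "Software Engineer at Acme Corp. Built internal tooling in Python. "
        "B.Sc. Computer Science, Example University, GPA 3.75. "
        "Skills: Python, PyTorch, SQL."
    )
    profile = predict(sample_resume)
    print(profile)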