Spaces:

panduwana
/

extractor

Sleeping

File size: 6,376 Bytes

from transformers import RobertaTokenizerFast, AutoModelForTokenClassification
import re
import torch
from itertools import cycle

# The tokenizer and the token-classification model are both loaded from the
# same checkpoint when this module is imported.
_CHECKPOINT = "mrfirdauss/robert-base-finetuned-cv"

tokenizer = RobertaTokenizerFast.from_pretrained(_CHECKPOINT)
model = AutoModelForTokenClassification.from_pretrained(_CHECKPOINT)

# Entity types in id order. Each type owns two consecutive ids: an odd id for
# its "B-" (begin) tag and the following even id for its "I-" (inside) tag.
_ENTITY_TYPES = (
    'NAME',
    'NATION',
    'EMAIL',
    'URL',
    'CAMPUS',
    'MAJOR',
    'COMPANY',
    'DESIGNATION',
    'GPA',
    'PHONE NUMBER',
    'ACHIEVEMENT',
    'EXPERIENCES DESC',
    'SKILLS',
    'PROJECTS',
)

# Maps a predicted class id to its BIO label; id 0 is the "outside" tag.
# Insertion order: 'O', then every B- tag, then every I- tag.
id2label = {
    0: 'O',
    **{2 * i + 1: 'B-' + name for i, name in enumerate(_ENTITY_TYPES)},
    **{2 * i + 2: 'I-' + name for i, name in enumerate(_ENTITY_TYPES)},
}

def merge_subwords(tokens, labels):
    """Collapse sub-word tokens (and their labels) back into whole words.

    A token that starts with "Ġ" begins a new word; any other token is glued
    onto the word currently being built. Each merged word is given the label
    of its first sub-token.

    Args:
        tokens: Sequence of sub-word token strings.
        labels: Sequence of labels, parallel to ``tokens``.

    Returns:
        A ``(merged_tokens, merged_labels)`` tuple of two equal-length lists.
        Words that end up empty (a lone "Ġ" with no continuation) are dropped.
    """
    merged_tokens = []
    merged_labels = []

    current_token = ""
    current_label = ""
    # True once the word being built has received its label. Tracked
    # separately from ``current_token`` because a lone "Ġ" token opens a word
    # whose text is still empty.
    word_open = False

    for token, label in zip(tokens, labels):
        if token.startswith("Ġ"):
            if current_token:
                # Flush the word accumulated so far.
                merged_tokens.append(current_token)
                merged_labels.append(current_label)
            # Start a new word, dropping the "Ġ" marker.
            current_token = token[1:]
            current_label = label
            word_open = True
        else:
            if not word_open:
                # The sequence starts with an unmarked token: it is still the
                # first piece of a word, so it must supply the word's label
                # instead of leaving it as "".
                current_label = label
                word_open = True
            current_token += token

    # Flush the trailing word.
    if current_token:
        merged_tokens.append(current_token)
        merged_labels.append(current_label)

    return merged_tokens, merged_labels

def chunked_inference(text, tokenizer, model, max_length=512):
    """Label every word of ``text`` by running ``model`` over fixed-size chunks.

    The text is split into words and single punctuation marks with a regex,
    tokenized into sub-word tokens, and fed to ``model`` in consecutive chunks
    of at most ``max_length`` ids (counting the special tokens the tokenizer
    wraps around each chunk). Per-token predictions are mapped through
    ``id2label`` and collapsed to word level by ``merge_subwords``.

    Chunks are cut at fixed token offsets, so a word's sub-tokens may be split
    across two chunks, and each chunk is classified without seeing its
    neighbours.

    Args:
        text: Raw input string.
        tokenizer: Object providing ``tokenize(words, is_split_into_words=True)``,
            ``convert_tokens_to_ids`` and ``build_inputs_with_special_tokens``.
        model: Callable invoked as ``model(input_ids, attention_mask=...)`` that
            returns either a tuple whose first item is the logits or an object
            with a ``.logits`` attribute.
        max_length: Maximum number of ids per forward pass, special tokens
            included. Must be greater than 2.

    Returns:
        A ``(words, labels)`` tuple of two equal-length lists. Both are empty
        when ``text`` contains no words or punctuation.

    Raises:
        ValueError: If ``max_length`` leaves no room for content tokens.
    """
    if max_length <= 2:
        raise ValueError(
            "max_length must be greater than 2 to leave room for the special tokens"
        )

    words = re.findall(r'\w+|[^\w\s]', text, re.UNICODE)
    if not words:
        # Nothing to classify; torch.cat() below would fail on an empty list.
        return [], []

    tokens = tokenizer.tokenize(words, is_split_into_words=True)
    if not tokens:
        return [], []

    # Two slots per chunk are reserved for the leading and trailing special
    # tokens added by build_inputs_with_special_tokens.
    window = max_length - 2
    predictions = []

    # Inference only: skip autograd bookkeeping for the forward passes.
    with torch.no_grad():
        for start in range(0, len(tokens), window):
            chunk = tokens[start:start + window]
            chunk_ids = tokenizer.convert_tokens_to_ids(chunk)
            chunk_ids = tokenizer.build_inputs_with_special_tokens(chunk_ids)

            input_ids = torch.tensor(chunk_ids).unsqueeze(0)  # add a batch dimension
            attention_mask = torch.ones_like(input_ids)  # no padding, attend to everything
            output = model(input_ids, attention_mask=attention_mask)
            logits = output[0] if isinstance(output, tuple) else output.logits

            chunk_predictions = torch.argmax(logits, dim=-1).squeeze(0)
            # Drop the predictions for the first and last position, which are
            # assumed to be the special tokens added around the chunk.
            predictions.append(chunk_predictions[1:-1])

    # One flat sequence of class ids, aligned with ``tokens``.
    predicted_labels = [id2label[class_id] for class_id in torch.cat(predictions, dim=0).tolist()]
    return merge_subwords(tokens, predicted_labels)

def process_tokens(tokens, tag_prefix):
    """Group labelled words into entities of one type.

    Args:
        tokens: Iterable of ``(word, tag)`` pairs with BIO-style tags such as
            ``'B-SKILLS'`` / ``'I-SKILLS'``.
        tag_prefix: Entity type to collect; a tag belongs to it when the tag
            ends with this string (e.g. ``'SKILLS'``).

    Returns:
        A list of ``{'text': ..., 'type': ...}`` dicts, one per ``B-`` tag of
        the requested type, in order of appearance. ``'type'`` is the ``B-``
        tag itself. Matching ``I-`` words are appended to the most recent
        entity even when other tags sit in between; ``I-`` words seen before
        any ``B-`` tag are ignored.
    """
    # GPA and URL pieces are glued back together without spaces; every other
    # entity type is rebuilt as space-separated words.
    joiner = '' if tag_prefix in ('GPA', 'URL') else ' '

    found = []
    active = {}
    for word, label in tokens:
        if not label.endswith(tag_prefix):
            continue
        if label.startswith('B-'):
            # Close the entity in progress, if any, and open a new one.
            if active:
                found.append(active)
            active = {'text': word, 'type': label}
        elif label.startswith('I-') and active:
            active['text'] += joiner + word

    if active:
        found.append(active)
    return found

def predict(text):
    """Extract a structured profile from ``text``.

    Runs ``chunked_inference`` with the module-level ``tokenizer`` and
    ``model``, then groups the labelled words into entities with
    ``process_tokens``.

    Args:
        text: Raw input string.

    Returns:
        A dict with the keys:
          * ``"name"``: all words tagged ``NAME`` joined by spaces.
          * ``"links"``: list of URL strings.
          * ``"skills"``: list of skill strings.
          * ``"experiences"``: list of dicts with ``start``, ``end`` (always
            ``None``), ``designation``, ``company``, ``experience_description``.
          * ``"educations"``: list of dicts with ``start``, ``end`` (always
            ``None``), ``major``, ``campus``, ``GPA``.
        A field that was not detected for a record is the empty string.
    """
    # Local import: the module only imports ``cycle`` from itertools.
    from itertools import zip_longest

    tokens, predictions = chunked_inference(text, tokenizer, model)
    data = list(zip(tokens, predictions))

    def texts(entity_type):
        # Text of every detected entity of the given type, in document order.
        return [entity['text'] for entity in process_tokens(data, entity_type)]

    profile = {
        "name": ' '.join(t for t, p in data if p.endswith('NAME')),
        "links": texts('URL'),
        "skills": texts('SKILLS'),
        "experiences": [],
        "educations": []
    }

    # Experiences: emit one record per item of the longest list and cycle the
    # shorter lists to fill the remaining fields. An empty list is replaced by
    # a single '' placeholder; cycling an empty list would yield nothing and
    # silently drop every record.
    descriptions = texts('EXPERIENCES DESC')
    designations = texts('DESIGNATION')
    companies = texts('COMPANY')
    record_count = max(len(descriptions), len(designations), len(companies))
    experience_rows = zip(
        range(record_count),
        cycle(designations or ['']),
        cycle(companies or ['']),
        cycle(descriptions or ['']),
    )
    for _, designation, company, description in experience_rows:
        profile['experiences'].append({
            "start": None,
            "end": None,
            "designation": designation,
            "company": company,
            "experience_description": description
        })

    # Educations: pair entities positionally and pad missing fields with ''
    # so that, for example, a text without any GPA still yields its majors
    # and campuses.
    education_rows = zip_longest(
        texts('MAJOR'), texts('GPA'), texts('CAMPUS'), fillvalue=''
    )
    for major, gpa, campus in education_rows:
        profile['educations'].append({
            "start": None,
            "end": None,
            "major": major,
            "campus": campus,
            "GPA": gpa
        })

    return profile