from transformers import AutoModelForMaskedLM, AutoTokenizer
import spacy
import pytextrank
from nlp_entities import *
import torch
import streamlit as st
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict

model_checkpoint = "vives/distilbert-base-uncased-finetuned-cvent-2019_2022"
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint, output_hidden_states=True)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
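# output_hidden_states=True exposes the hidden layers; the last one is mean-pooled into
# phrase/tag embeddings below.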




# Streamlit inputs: tags, text to classify, and how many key phrases to extract
tags = st.text_input("Input tags separated by commas")
text = st.text_input("Input text to classify")
topkp = st.slider("Number of key phrases to extract from text", 10,30,20)
# Methods for tag processing
def pool_embeddings(out, tok):
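  # Mean-pool the last hidden layer over real (non-padding) tokens: the attention mask
  # zeroes out padding positions before averaging.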
  embeddings = out["hidden_states"][-1]
  attention_mask = tok['attention_mask']
  mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
  masked_embeddings = embeddings * mask
  summed = torch.sum(masked_embeddings, 1)
  summed_mask = torch.clamp(mask.sum(1), min=1e-9)
  mean_pooled = summed / summed_mask
  return mean_pooled

def get_transcript(file):
    # Load a transcript JSON file and return its transcript text, lower-cased.
    data = pd.read_json(file)
    transcript = data['results'].values[1][0]['transcript']
    return transcript.lower()
    
def concat_tokens_tags(sentences):
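  # Tokenize each tag into fixed-length (64-token) padded tensors; the raw strings are kept
  # under "KPS" alongside the stacked input_ids / attention_mask tensors.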
  tokens = {'input_ids': [], 'attention_mask': [], 'KPS': []}
  for sentence in sentences:
      # encode each sentence and append to dictionary
      new_tokens = tokenizer.encode_plus(sentence, max_length=64,
                                         truncation=True, padding='max_length',
                                         return_tensors='pt')
      tokens['input_ids'].append(new_tokens['input_ids'][0])
      tokens['attention_mask'].append(new_tokens['attention_mask'][0])
      tokens['KPS'].append(sentence)
  # reformat list of tensors into single tensor
  tokens['input_ids'] = torch.stack(tokens['input_ids'])
  tokens['attention_mask'] = torch.stack(tokens['attention_mask'])
  return tokens

# Process tags
  
if tags:
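  # Embed every tag once: pool the model's last hidden layer for each tag and cache the
  # resulting vector in token_dict (tag -> embedding).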
  tags = [x.lower().strip() for x in tags.split(",")]
  tags_tokens = concat_tokens_tags(tags)
  tags_tokens.pop("KPS")
  with torch.no_grad():
    outputs_tags = model(**tags_tokens)
  pools_tags = pool_embeddings(outputs_tags, tags_tokens).detach().numpy()
  token_dict = {}
  for tag,embedding in zip(tags,pools_tags):
    token_dict[tag] = embedding

# Code for processing the text, extracting key phrases (KPs), and computing their similarity to each tag

def concat_tokens(sentences):
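  # Like concat_tokens_tags, but `sentences` maps each key phrase to {'weight': ...};
  # the weight is kept under "KPS" so similarities can later be scaled by phrase importance.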
  tokens = {'input_ids': [], 'attention_mask': [], 'KPS': {}}
  for sentence, values in sentences.items():
      weight = values['weight']
      # encode each sentence and append to dictionary
      new_tokens = tokenizer.encode_plus(sentence, max_length=64,
                                         truncation=True, padding='max_length',
                                         return_tensors='pt')
      tokens['input_ids'].append(new_tokens['input_ids'][0])
      tokens['attention_mask'].append(new_tokens['attention_mask'][0])
      tokens['KPS'][sentence] = weight
  # reformat list of tensors into single tensor
  tokens['input_ids'] = torch.stack(tokens['input_ids'])
  tokens['attention_mask'] = torch.stack(tokens['attention_mask'])
  return tokens
 
def calculate_weighted_embed_dist(out, tokens, weight, text, kp_dict, idx, exclude_text=False, exclude_words=False):
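  # Score the idx-th pooled embedding against every reference embedding in kp_dict using
  # cosine similarity, scaled by `weight`. exclude_text / exclude_words skip keys that
  # contain the phrase itself (or any of its individual words).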
  sim_dict = {}
  pools = pool_embeddings_count(out, tokens, idx).detach().numpy()
  for key in kp_dict.keys():
    if exclude_text and text in key:
      continue
    if exclude_words and True in [x in key for x in text.split(" ")]:
      continue

    sim_dict[key] = cosine_similarity(
        pools,
        [kp_dict[key]] 
    )[0][0] * weight
  return sim_dict

def pool_embeddings_count(out, tok, idx):
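  # Same mean pooling as pool_embeddings, restricted to the single sequence at index idx.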
  embeddings = out["hidden_states"][-1][idx:idx+1,:,:]
  attention_mask = tok['attention_mask'][idx]
  mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
  masked_embeddings = embeddings * mask
  summed = torch.sum(masked_embeddings, 1)
  summed_mask = torch.clamp(mask.sum(1), min=1e-9)
  mean_pooled = summed / summed_mask
  return mean_pooled

def extract_tokens(text, top_kp=30):
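  # Extract key phrases (with weights) from the text using return_ners_and_kp from nlp_entities.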
  kps = return_ners_and_kp([text], ret_ne=True)['KP']
  # only keep the top_kp highest-weighted key phrases
  kps = sorted(kps.items(), key=lambda x: x[1]['weight'], reverse=True)[:top_kp]
  kps = {x: y for x, y in kps}
  return concat_tokens(kps)

# Process the input text and classify it: each extracted key phrase contributes its
# weighted cosine similarity to every tag, and the tags are shown ranked by total score.
if text and tags:
  text = text.lower()
  t1_tokens = extract_tokens(text, topkp)
  t1_kps = t1_tokens.pop("KPS")
  with torch.no_grad():
    outputs = model(**t1_tokens)
  tag_distance = None
  for i, kp in enumerate(t1_kps):
    curr = calculate_weighted_embed_dist(outputs, t1_tokens, t1_kps[kp], kp, token_dict, i,
                                         exclude_text=False, exclude_words=False)
    if tag_distance is None:
      tag_distance = curr
    else:
      tag_distance = {x: tag_distance[x] + curr[x] for x in tag_distance.keys()}
  tag_distance = sorted(tag_distance.items(), key=lambda x: x[1], reverse=True)
  tag_distance = {x: y for x, y in tag_distance}
  st.json(tag_distance)