# GitHub file-view residue (commented out so the module parses):
# Michael-Geis
# changing naming convention
# 6fbaa28
# raw
# history blame
# 3.15 kB
import json
import os
import re

import pandas as pd
import regex
import sentence_transformers.util
def main(raw_metadata_df, path_to_embeddings):
    """Build a cleaned metadata DataFrame from raw arXiv metadata.

    Parameters
    ----------
    raw_metadata_df : DataFrame with columns 'title', 'summary', 'authors',
        'categories' (each category entry is a list of category strings).
    path_to_embeddings : currently unused; kept for backward compatibility
        with existing callers.

    Returns
    -------
    DataFrame with columns 'sentences' (cleansed title + abstract),
    'authors', 'msc_tags', and a placeholder 'msc_cos_sim' column.
    """
    clean_title = raw_metadata_df.title.apply(cleanse)
    clean_abstract = raw_metadata_df.summary.apply(cleanse)
    # Build the frame in one shot from the computed series. The previous
    # approach assigned each series onto an empty DataFrame(columns=...),
    # which aligns against the empty index and silently drops every row.
    clean_metadata_df = pd.DataFrame({
        'sentences': clean_title + ' ' + clean_abstract,
        'authors': raw_metadata_df.authors,
        'msc_tags': raw_metadata_df.categories.apply(cats_to_msc),
        'msc_cos_sim': None,  # placeholder, filled downstream
    })
    return clean_metadata_df
##
## 1. Latin-ize latex accents enclosed in brackets
def remove_latex_accents(string):
    """Replace brace-enclosed LaTeX accent commands (e.g. ``\\'{e}``) with the bare letter.

    The pattern uses no `regex`-module extensions, so the stdlib `re`
    module is sufficient here.
    """
    # \\ + one accent command char + {letter}  ->  the captured letter
    accent = r'\\[\'\"\^\`H\~ckl=bdruvtoi]\{([a-z])\}'
    replacement = r'\1'
    string = re.sub(accent, replacement, string)
    return string
## 2. Remove latex environments
def remove_env(string):
    """Remove simple LaTeX commands of the form ``\\cmd{...}`` (command name
    of two or more letters), including the brace contents.

    The pattern uses no `regex`-module extensions, so the stdlib `re`
    module is sufficient here.
    """
    env = r'\\[a-z]{2,}{[^{}]+?}'
    string = re.sub(env, '', string)
    return string
## 3. Latin-ize non-{} enclosed latex accents:
def remove_accents(string):
    """Replace non-brace LaTeX accent commands (e.g. ``\\'e``) with the bare letter.

    The pattern uses no `regex`-module extensions, so the stdlib `re`
    module is sufficient here.
    """
    # \\ + one accent command char + letter  ->  the captured letter
    accent = r'\\[\'\"\^\`H\~ckl=bdruvtoi]([a-z])'
    replacement = r'\1'
    string = re.sub(accent, replacement, string)
    return string
## 4. ONLY remove latex'd math that is separated as a 'word' i.e. has space characters on either side of it.
def remove_latex(string):
    """Replace whitespace-separated inline/display math (``$...$`` or ``$$...$$``)
    with the literal token ' LATEX '.

    Only math preceded by whitespace is matched; trailing non-space characters
    attached to the closing delimiter are consumed too. The pattern uses no
    `regex`-module extensions, so the stdlib `re` module is sufficient here.
    """
    latex = r'\s(\$\$?)[^\$]*?\1\S*'
    string = re.sub(latex, ' LATEX ', string)
    return string
def cleanse(string):
    """Normalize a raw title/abstract: flatten newlines, then strip LaTeX markup.

    Applies, in order: brace-enclosed accent removal, environment removal,
    bare accent removal, and math replacement.
    """
    text = string.replace('\n', ' ')
    pipeline = (remove_latex_accents, remove_env, remove_accents, remove_latex)
    for step in pipeline:
        text = step(text)
    return text
##
def find_msc(cat_list):
    """Extract 5-character MSC codes (two digits + three alphanumerics) from
    each category string in ``cat_list``.

    Returns a flat list of all matches, in order of appearance.
    """
    pattern = r'\b\d{2}[0-9a-zA-Z]{3}\b'
    # Flatten all per-category matches into one list (replaces the former
    # nested append loop; stdlib `re` suffices for this pattern).
    return [tag for cat in cat_list for tag in re.findall(pattern, cat)]
def msc_tags():
    """Load the MSC code -> description mapping from ./data/msc.json.

    Re-reads the file on every call; callers in hot paths should load it
    once and reuse the dict.
    """
    with open('./data/msc.json', 'r') as file:
        # json.load reads and parses in one step (was read() + json.loads).
        return json.load(file)
def cats_to_msc(cat_list):
    """Map arXiv category strings to MSC tag descriptions.

    Returns a list of descriptions for every recognized MSC code found in
    ``cat_list``, or None when no known code is present (preserving the
    original None-for-empty contract).
    """
    # Load the mapping once; the previous version re-opened and re-parsed
    # the JSON file twice per tag inside the loop.
    tag_lookup = msc_tags()
    out = [tag_lookup[tag] for tag in find_msc(cat_list) if tag in tag_lookup]
    return out if out else None
##
def msc_encoded_dict():
    """Map each MSC tag description to its precomputed embedding vector.

    Embeddings are read from ./data/msc_mini_embeddings.parquet and paired
    with the tag descriptions from msc_tags() in order.
    """
    vectors = pd.read_parquet('./data/msc_mini_embeddings.parquet').to_numpy()
    return dict(zip(msc_tags().values(), vectors))
def doc_encoded_dict():
    """Map each document string to its precomputed embedding vector.

    Reads ./data/APSP_mini_vec.parquet, pairing the 'docs' column with the
    'vecs' column in order.
    """
    frame = pd.read_parquet('./data/APSP_mini_vec.parquet')
    return dict(zip(frame.docs.to_list(), frame.vecs.to_numpy()))
def score_tags(processed_arxiv_row):
    """Rank a processed row's MSC tags by similarity to its document embedding.

    Returns None when the row has no MSC tags; otherwise the hit list for the
    single query returned by sentence_transformers.util.semantic_search.
    """
    tag_list = processed_arxiv_row.msc_tags
    # NOTE(review): this reads a 'docs' attribute/column, but main() builds a
    # column named 'sentences' — likely the "naming convention" change in
    # flight; confirm which name is current before relying on this path.
    title_plus_abstract = processed_arxiv_row.docs
    if tag_list is None:
        return None
    # Look up the precomputed embedding for each of this row's tags.
    # NOTE(review): msc_encoded_dict()/doc_encoded_dict() each re-read a
    # parquet file on every call to this function.
    embedded_msc_tags = [msc_encoded_dict()[tag] for tag in tag_list]
    return sentence_transformers.util.semantic_search(
        query_embeddings=doc_encoded_dict()[title_plus_abstract],
        corpus_embeddings=embedded_msc_tags,
    )[0]