Spaces:
Runtime error
Runtime error
File size: 3,153 Bytes
6fbaa28 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 |
import json
import os
import re

import pandas as pd
import regex
import sentence_transformers.util
def main(raw_metadata_df, path_to_embeddings):
    """Clean raw arXiv metadata into the frame used for MSC-tag scoring.

    Parameters
    ----------
    raw_metadata_df : pandas.DataFrame
        Must provide ``title``, ``summary``, ``authors`` and ``categories``
        columns.
    path_to_embeddings : str
        Currently unused — kept for interface compatibility.
        TODO(review): wire this through to the embedding lookups or drop it.

    Returns
    -------
    pandas.DataFrame
        Columns ``['sentences', 'authors', 'msc_tags', 'msc_cos_sim']``;
        ``msc_cos_sim`` is created here but left unpopulated (NaN).
    """
    clean_metadata_df = pd.DataFrame(
        columns=['sentences', 'authors', 'msc_tags', 'msc_cos_sim']
    )
    # Title and abstract are cleansed of LaTeX and fused into one text field.
    clean_title = raw_metadata_df.title.apply(cleanse)
    clean_abstract = raw_metadata_df.summary.apply(cleanse)
    # Bracket assignment is the supported way to set DataFrame columns;
    # attribute assignment can silently create an instance attribute instead.
    clean_metadata_df['sentences'] = clean_title + ' ' + clean_abstract
    clean_metadata_df['authors'] = raw_metadata_df.authors
    clean_metadata_df['msc_tags'] = raw_metadata_df.categories.apply(cats_to_msc)
    return clean_metadata_df
##
## 1. Latin-ize latex accents enclosed in brackets
def remove_latex_accents(string):
    r"""Latin-ize brace-enclosed LaTeX accents, e.g. ``\'{e}`` -> ``e``.

    The stdlib ``re`` module fully supports this pattern, so the third-party
    ``regex`` package is not needed here.
    """
    accent = r'\\[\'\"\^\`H\~ckl=bdruvtoi]\{([a-z])\}'
    # Keep only the accented letter itself (the capture group).
    return re.sub(accent, r'\1', string)
## 2. Remove latex environments
def remove_env(string):
    r"""Remove simple one-argument LaTeX commands, e.g. ``\textbf{bold}``.

    Matches a backslash, a command name of at least two letters, and one
    brace-delimited argument; the whole thing (argument included) is dropped.
    Plain ``re`` suffices for this pattern.
    """
    env = r'\\[a-z]{2,}{[^{}]+?}'
    return re.sub(env, '', string)
## 3. Latin-ize non-{} enclosed latex accents:
def remove_accents(string):
    r"""Latin-ize LaTeX accents NOT enclosed in braces, e.g. ``\'e`` -> ``e``.

    Same accent alphabet as remove_latex_accents, but without the ``{}``
    wrapper.  Plain ``re`` suffices for this pattern.
    """
    accent = r'\\[\'\"\^\`H\~ckl=bdruvtoi]([a-z])'
    return re.sub(accent, r'\1', string)
## 4. ONLY remove latex'd math that is separated as a 'word' i.e. has space characters on either side of it.
def remove_latex(string):
    r"""Replace whitespace-separated ``$...$`` / ``$$...$$`` math with ' LATEX '.

    Only math that stands alone as a 'word' (preceded by whitespace) is
    replaced; the trailing ``\S*`` also swallows any suffix glued to the
    closing delimiter.  Plain ``re`` suffices for this pattern.
    """
    latex = r'\s(\$\$?)[^\$]*?\1\S*'
    return re.sub(latex, ' LATEX ', string)
def cleanse(string):
    """Run the full LaTeX-stripping pipeline over *string* and return it.

    Newlines are flattened to spaces first, then each cleaning pass is
    applied in order; the order matters (braced accents must go before
    generic environments, etc.).
    """
    pipeline = (
        remove_latex_accents,
        remove_env,
        remove_accents,
        remove_latex,
    )
    cleaned = string.replace('\n', ' ')
    for step in pipeline:
        cleaned = step(cleaned)
    return cleaned
##
def find_msc(cat_list):
    """Extract all MSC codes (2 digits + 3 alphanumerics) from category strings.

    Parameters
    ----------
    cat_list : iterable of str

    Returns
    -------
    list of str, in order of appearance (possibly empty).
    """
    pattern = r'\b\d{2}[0-9a-zA-Z]{3}\b'
    # Flattened comprehension replaces the original nested append loops.
    return [tag for cat in cat_list for tag in re.findall(pattern, cat)]
def msc_tags():
    """Load the MSC code -> description mapping from ``./data/msc.json``.

    NOTE(review): this re-reads and re-parses the file on every call;
    callers should load it once rather than calling this inside a loop.

    Returns
    -------
    dict parsed from the JSON file.
    """
    # json.load reads the stream directly; no intermediate string needed.
    with open('./data/msc.json', 'r', encoding='utf-8') as file:
        return json.load(file)
def cats_to_msc(cat_list):
    """Map arXiv category strings to the MSC descriptions they contain.

    Parameters
    ----------
    cat_list : iterable of str

    Returns
    -------
    list of MSC descriptions, or None when nothing matched — the None
    sentinel (rather than []) is what downstream code (score_tags) checks.
    """
    # Load the tag table ONCE.  The original called msc_tags() — a file
    # read + JSON parse — up to twice per tag inside the loop.
    tag_table = msc_tags()
    out = [tag_table[tag] for tag in find_msc(cat_list) if tag in tag_table]
    return out or None
##
def msc_encoded_dict():
    """Map each MSC tag description to its precomputed embedding vector.

    Pairs the values of the msc.json table with the rows of the embeddings
    parquet file, relying on the two files sharing the same ordering.
    """
    encoded_tags = pd.read_parquet('./data/msc_mini_embeddings.parquet').to_numpy()
    return dict(zip(msc_tags().values(), encoded_tags))
def doc_encoded_dict():
    """Map each library document text to its precomputed embedding vector.

    Reads the parquet file once; ``docs`` holds the texts and ``vecs`` the
    corresponding embeddings, row-aligned.
    """
    library = pd.read_parquet('./data/APSP_mini_vec.parquet')
    return dict(zip(library.docs.to_list(), library.vecs.to_numpy()))
def score_tags(processed_arxiv_row):
    """Score a processed row's MSC tags against its document embedding.

    Parameters
    ----------
    processed_arxiv_row : row object with ``msc_tags`` and ``docs``
        attributes — presumably a DataFrame row; verify against the caller.

    Returns
    -------
    None when the row has no MSC tags, otherwise the first result list from
    sentence_transformers' semantic_search.
    """
    tag_list = processed_arxiv_row.msc_tags
    # Guard first: no tags means nothing to score (cats_to_msc returns None).
    if tag_list is None:
        return None
    # Build the tag-embedding table ONCE.  The original called
    # msc_encoded_dict() — a parquet read — once per tag inside the
    # comprehension.
    tag_embeddings = msc_encoded_dict()
    embedded_msc_tags = [tag_embeddings[tag] for tag in tag_list]
    return sentence_transformers.util.semantic_search(
        query_embeddings=doc_encoded_dict()[processed_arxiv_row.docs],
        corpus_embeddings=embedded_msc_tags,
    )[0]
|