# Clean raw arXiv metadata (titles/abstracts/categories) and score MSC
# subject tags against precomputed document embeddings.
import functools
import json
import os
import re

import pandas as pd
import regex
import sentence_transformers.util
def main(raw_metadata_df, path_to_embeddings):
    """Build a cleaned metadata frame from raw arXiv metadata.

    Produces columns 'sentences' (cleansed title + abstract), 'authors',
    'msc_tags' (MSC descriptions parsed from categories), and a
    'msc_cos_sim' placeholder column that is left unfilled here.
    Note: ``path_to_embeddings`` is currently unused by this function.
    """
    frame = pd.DataFrame(
        columns=['sentences', 'authors', 'msc_tags', 'msc_cos_sim']
    )
    title = raw_metadata_df.title.apply(cleanse)
    abstract = raw_metadata_df.summary.apply(cleanse)
    frame.sentences = title + ' ' + abstract
    frame.authors = raw_metadata_df.authors
    frame.msc_tags = raw_metadata_df.categories.apply(cats_to_msc)
    return frame
## | |
## 1. Latin-ize latex accents enclosed in brackets | |
def remove_latex_accents(string):
    """Latin-ize brace-enclosed LaTeX accents, e.g. ``\\'{e}`` -> ``e``.

    The accent command character set covers the common LaTeX accent
    macros (', ", ^, `, H, ~, c, k, l, =, b, d, r, u, v, t, o, i).
    Uses the stdlib ``re`` module — the pattern needs no features from
    the third-party ``regex`` package.
    """
    accent = r'\\[\'\"\^\`H\~ckl=bdruvtoi]\{([a-z])\}'
    # Keep only the captured base letter, dropping the accent command.
    return re.sub(accent, r'\1', string)
## 2. Remove latex environments | |
def remove_env(string):
    """Strip simple LaTeX commands of the form ``\\cmd{...}``.

    Matches a backslash, two-or-more lowercase letters, and one
    brace-enclosed argument with no nested braces; the whole match is
    removed. Stdlib ``re`` suffices for this pattern.
    """
    env = r'\\[a-z]{2,}{[^{}]+?}'
    return re.sub(env, '', string)
## 3. Latin-ize non-{} enclosed latex accents: | |
def remove_accents(string):
    """Latin-ize non-brace LaTeX accents, e.g. ``\\'e`` -> ``e``.

    Companion to :func:`remove_latex_accents` for the unbracketed form.
    Uses the stdlib ``re`` module — no ``regex``-only features needed.
    """
    accent = r'\\[\'\"\^\`H\~ckl=bdruvtoi]([a-z])'
    # Keep only the captured base letter, dropping the accent command.
    return re.sub(accent, r'\1', string)
## 4. ONLY remove latex'd math that is separated as a 'word' i.e. has space characters on either side of it. | |
def remove_latex(string):
    """Replace a space-delimited ``$...$`` or ``$$...$$`` chunk with ' LATEX '.

    Only math that stands alone as a 'word' (preceded by whitespace) is
    replaced; the trailing ``\\S*`` also consumes any suffix glued onto
    the closing dollar sign. Stdlib ``re`` suffices for this pattern.
    """
    latex = r'\s(\$\$?)[^\$]*?\1\S*'
    return re.sub(latex, ' LATEX ', string)
def cleanse(string):
    """Normalize a metadata string: flatten newlines, then strip LaTeX.

    Applies, in order: brace-enclosed accent removal, environment/command
    removal, bare accent removal, and math-chunk replacement.
    """
    string = string.replace('\n', ' ')
    pipeline = (remove_latex_accents, remove_env, remove_accents, remove_latex)
    for transform in pipeline:
        string = transform(string)
    return string
## | |
def find_msc(cat_list):
    """Extract all 5-character MSC codes (e.g. '14J60') from category strings.

    An MSC code is two digits followed by three alphanumerics, matched as
    a whole word. Returns codes in order of appearance across the list.
    Stdlib ``re`` suffices; the nested append loop is collapsed with
    ``list.extend``.
    """
    pattern = r'\b\d{2}[0-9a-zA-Z]{3}\b'
    out = []
    for cat in cat_list:
        out.extend(re.findall(pattern, cat))
    return out
@functools.lru_cache(maxsize=None)
def msc_tags():
    """Load the MSC code -> description mapping from ./data/msc.json.

    Cached: the original re-read and re-parsed the file on every call,
    and callers invoke this inside per-tag loops. The returned dict is
    shared across calls and must be treated as read-only.
    """
    with open('./data/msc.json', 'r', encoding='utf-8') as file:
        return json.load(file)
def cats_to_msc(cat_list):
    """Map category strings to known MSC tag descriptions.

    Returns the list of descriptions for every recognized MSC code found
    in ``cat_list``, or None when no code is recognized (matching the
    original contract). The tag table is fetched once — the original
    called ``msc_tags()`` twice per tag inside the loop.
    """
    tag_table = msc_tags()
    out = [tag_table[tag] for tag in find_msc(cat_list) if tag in tag_table]
    # Preserve the original None-for-empty convention.
    return out or None
## | |
def msc_encoded_dict():
    """Map each MSC tag description to its precomputed embedding vector.

    Reads the embedding matrix from ./data/msc_mini_embeddings.parquet;
    rows are assumed to align with ``msc_tags()`` value order — TODO
    confirm against the file that produced the parquet.
    """
    vectors = pd.read_parquet('./data/msc_mini_embeddings.parquet').to_numpy()
    return dict(zip(msc_tags().values(), vectors))
def doc_encoded_dict():
    """Map each document string to its precomputed embedding vector.

    Reads ./data/APSP_mini_vec.parquet, which carries parallel 'docs'
    and 'vecs' columns.
    """
    library = pd.read_parquet('./data/APSP_mini_vec.parquet')
    return dict(zip(library.docs.to_list(), library.vecs.to_numpy()))
def score_tags(processed_arxiv_row):
    """Score a row's MSC tags against its document embedding.

    Returns the first semantic-search hit list (query = the row's
    title+abstract embedding, corpus = its tag embeddings), or None when
    the row has no MSC tags. The tag-embedding table is fetched once —
    the original re-evaluated ``msc_encoded_dict()`` (a full parquet
    read) for every tag inside the comprehension.
    """
    tag_list = processed_arxiv_row.msc_tags
    if tag_list is None:
        return None
    tag_vectors = msc_encoded_dict()
    embedded_msc_tags = [tag_vectors[tag] for tag in tag_list]
    return sentence_transformers.util.semantic_search(
        query_embeddings=doc_encoded_dict()[processed_arxiv_row.docs],
        corpus_embeddings=embedded_msc_tags,
    )[0]