Spaces:

mlgeis
/

ArXivRecommenderSystem

Runtime error

App Files Files Community

Michael-Geis commited on Jul 3, 2023

Commit

cbdef5e

•

1 Parent(s): 3af1705

created methods for splitting categories into arxiv and msc tags

Browse files

Files changed (2) hide show

data_cleaning.py +239 -199
data_storage.py +29 -27

data_cleaning.py CHANGED Viewed

@@ -4,21 +4,24 @@ import json
 import sentence_transformers.util
 import os
 def main(raw_metadata_df, path_to_embeddings):
     clean_metadata_df = pd.DataFrame(
-        columns=['sentences','authors','msc_tags','msc_cos_sim']
-        )
     clean_title = raw_metadata_df.title.apply(cleanse)
     clean_abstract = raw_metadata_df.summary.apply(cleanse)
-    clean_metadata_df.sentences = clean_title + ' ' + clean_abstract
     clean_metadata_df.authors = raw_metadata_df.authors
     clean_metadata_df.msc_tags = raw_metadata_df.categories.apply(cats_to_msc)
     return clean_metadata_df
 ##
 def category_map():
     """Maps arXiv subject categories to their full english names.
@@ -27,214 +30,253 @@ def category_map():
         Note that the list is not exhaustive in the sense that many categories have aliases that
         are not included. (Some are, e.g. math.MP and math-ph).
     """
-    return {'astro-ph': 'Astrophysics',
-    'astro-ph.CO': 'Cosmology and Nongalactic Astrophysics',
-    'astro-ph.EP': 'Earth and Planetary Astrophysics',
-    'astro-ph.GA': 'Astrophysics of Galaxies',
-    'astro-ph.HE': 'High Energy Astrophysical Phenomena',
-    'astro-ph.IM': 'Instrumentation and Methods for Astrophysics',
-    'astro-ph.SR': 'Solar and Stellar Astrophysics',
-    'cond-mat.dis-nn': 'Disordered Systems and Neural Networks',
-    'cond-mat.mes-hall': 'Mesoscale and Nanoscale Physics',
-    'cond-mat.mtrl-sci': 'Materials Science',
-    'cond-mat.other': 'Other Condensed Matter',
-    'cond-mat.quant-gas': 'Quantum Gases',
-    'cond-mat.soft': 'Soft Condensed Matter',
-    'cond-mat.stat-mech': 'Statistical Mechanics',
-    'cond-mat.str-el': 'Strongly Correlated Electrons',
-    'cond-mat.supr-con': 'Superconductivity',
-    'cond-mat': 'Condensed Matter',
-    'cs.AI': 'Artificial Intelligence',
-    'cs.AR': 'Hardware Architecture',
-    'cs.CC': 'Computational Complexity',
-    'cs.CE': 'Computational Engineering, Finance, and Science',
-    'cs.CG': 'Computational Geometry',
-    'cs.CL': 'Computation and Language',
-    'cs.CR': 'Cryptography and Security',
-    'cs.CV': 'Computer Vision and Pattern Recognition',
-    'cs.CY': 'Computers and Society',
-    'cs.DB': 'Databases',
-    'cs.DC': 'Distributed, Parallel, and Cluster Computing',
-    'cs.DL': 'Digital Libraries',
-    'cs.DM': 'Discrete Mathematics',
-    'cs.DS': 'Data Structures and Algorithms',
-    'cs.ET': 'Emerging Technologies',
-    'cs.FL': 'Formal Languages and Automata Theory',
-    'cs.GL': 'General Literature',
-    'cs.GR': 'Graphics',
-    'cs.GT': 'Computer Science and Game Theory',
-    'cs.HC': 'Human-Computer Interaction',
-    'cs.IR': 'Information Retrieval',
-    'cs.IT': 'Information Theory',
-    'cs.LG': 'Machine Learning',
-    'cs.LO': 'Logic in Computer Science',
-    'cs.MA': 'Multiagent Systems',
-    'cs.MM': 'Multimedia',
-    'cs.MS': 'Mathematical Software',
-    'cs.NA': 'Numerical Analysis',
-    'cs.NE': 'Neural and Evolutionary Computing',
-    'cs.NI': 'Networking and Internet Architecture',
-    'cs.OH': 'Other Computer Science',
-    'cs.OS': 'Operating Systems',
-    'cs.PF': 'Performance',
-    'cs.PL': 'Programming Languages',
-    'cs.RO': 'Robotics',
-    'cs.SC': 'Symbolic Computation',
-    'cs.SD': 'Sound',
-    'cs.SE': 'Software Engineering',
-    'cs.SI': 'Social and Information Networks',
-    'cs.SY': 'Systems and Control',
-    'econ.EM': 'Econometrics',
-    'econ.GN': 'General Economics',
-    'econ.TH': 'Theoretical Economics',
-    'eess.AS': 'Audio and Speech Processing',
-    'eess.IV': 'Image and Video Processing',
-    'eess.SP': 'Signal Processing',
-    'eess.SY': 'Systems and Control',
-    'dg-ga': 'Differential Geometry',
-    'gr-qc': 'General Relativity and Quantum Cosmology',
-    'hep-ex': 'High Energy Physics - Experiment',
-    'hep-lat': 'High Energy Physics - Lattice',
-    'hep-ph': 'High Energy Physics - Phenomenology',
-    'hep-th': 'High Energy Physics - Theory',
-    'math.AC': 'Commutative Algebra',
-    'math.AG': 'Algebraic Geometry',
-    'math.AP': 'Analysis of PDEs',
-    'math.AT': 'Algebraic Topology',
-    'math.CA': 'Classical Analysis and ODEs',
-    'math.CO': 'Combinatorics',
-    'math.CT': 'Category Theory',
-    'math.CV': 'Complex Variables',
-    'math.DG': 'Differential Geometry',
-    'math.DS': 'Dynamical Systems',
-    'math.FA': 'Functional Analysis',
-    'math.GM': 'General Mathematics',
-    'math.GN': 'General Topology',
-    'math.GR': 'Group Theory',
-    'math.GT': 'Geometric Topology',
-    'math.HO': 'History and Overview',
-    'math.IT': 'Information Theory',
-    'math.KT': 'K-Theory and Homology',
-    'math.LO': 'Logic',
-    'math.MG': 'Metric Geometry',
-    'math.MP': 'Mathematical Physics',
-    'math.NA': 'Numerical Analysis',
-    'math.NT': 'Number Theory',
-    'math.OA': 'Operator Algebras',
-    'math.OC': 'Optimization and Control',
-    'math.PR': 'Probability',
-    'math.QA': 'Quantum Algebra',
-    'math.RA': 'Rings and Algebras',
-    'math.RT': 'Representation Theory',
-    'math.SG': 'Symplectic Geometry',
-    'math.SP': 'Spectral Theory',
-    'math.ST': 'Statistics Theory',
-    'math-ph': 'Mathematical Physics',
-    'funct-an': 'Functional Analysis',
-    'alg-geom': 'Algebraic Geometry',
-    'nlin.AO': 'Adaptation and Self-Organizing Systems',
-    'chao-dyn': 'Chaotic Dynamics',
-    'nlin.CD': 'Chaotic Dynamics',
-    'nlin.CG': 'Cellular Automata and Lattice Gases',
-    'nlin.PS': 'Pattern Formation and Solitons',
-    'nlin.SI': 'Exactly Solvable and Integrable Systems',
-    'nucl-ex': 'Nuclear Experiment',
-    'nucl-th': 'Nuclear Theory',
-    'physics.acc-ph': 'Accelerator Physics',
-    'physics.ao-ph': 'Atmospheric and Oceanic Physics',
-    'physics.app-ph': 'Applied Physics',
-    'physics.atm-clus': 'Atomic and Molecular Clusters',
-    'physics.atom-ph': 'Atomic Physics',
-    'physics.bio-ph': 'Biological Physics',
-    'physics.chem-ph': 'Chemical Physics',
-    'physics.class-ph': 'Classical Physics',
-    'physics.comp-ph': 'Computational Physics',
-    'physics.data-an': 'Data Analysis, Statistics and Probability',
-    'physics.ed-ph': 'Physics Education',
-    'physics.flu-dyn': 'Fluid Dynamics',
-    'physics.gen-ph': 'General Physics',
-    'physics.geo-ph': 'Geophysics',
-    'physics.hist-ph': 'History and Philosophy of Physics',
-    'physics.ins-det': 'Instrumentation and Detectors',
-    'physics.med-ph': 'Medical Physics',
-    'physics.optics': 'Optics',
-    'physics.plasm-ph': 'Plasma Physics',
-    'physics.pop-ph': 'Popular Physics',
-    'physics.soc-ph': 'Physics and Society',
-    'physics.space-ph': 'Space Physics',
-    'q-bio.BM': 'Biomolecules',
-    'q-bio.CB': 'Cell Behavior',
-    'q-bio.GN': 'Genomics',
-    'q-bio.MN': 'Molecular Networks',
-    'q-bio.NC': 'Neurons and Cognition',
-    'q-bio.OT': 'Other Quantitative Biology',
-    'q-bio.PE': 'Populations and Evolution',
-    'q-bio.QM': 'Quantitative Methods',
-    'q-bio.SC': 'Subcellular Processes',
-    'q-bio.TO': 'Tissues and Organs',
-    'q-fin.CP': 'Computational Finance',
-    'q-fin.EC': 'Economics',
-    'q-fin.GN': 'General Finance',
-    'q-fin.MF': 'Mathematical Finance',
-    'q-fin.PM': 'Portfolio Management',
-    'q-fin.PR': 'Pricing of Securities',
-    'q-fin.RM': 'Risk Management',
-    'q-fin.ST': 'Statistical Finance',
-    'q-fin.TR': 'Trading and Market Microstructure',
-    'quant-ph': 'Quantum Physics',
-    'q-alg' : 'Quantum Algebra',
-    'stat.AP': 'Applications',
-    'stat.CO': 'Computation',
-    'stat.ME': 'Methodology',
-    'stat.ML': 'Machine Learning',
-    'stat.OT': 'Other Statistics',
-    'stat.TH': 'Statistics Theory'}
 ## 1. Latin-ize latex accents enclosed in brackets
 def remove_latex_accents(string):
-    accent = r'\\[\'\"\^\`H\~ckl=bdruvtoi]\{([a-z])\}'
-    replacement = r'\1'
-    string = regex.sub(accent,replacement, string)
     return string
 ## 2. Remove latex environments
 def remove_env(string):
-    env = r'\\[a-z]{2,}{[^{}]+?}'
-    string = regex.sub(env,'',string)
     return string
 ## 3. Latin-ize non-{} enclosed latex accents:
 def remove_accents(string):
-    accent = r'\\[\'\"\^\`H\~ckl=bdruvtoi]([a-z])'
-    replacement = r'\1'
-    string = regex.sub(accent,replacement,string)
-    return string
 ## 4. ONLY remove latex'd math that is separated as a 'word' i.e. has space characters on either side of it.
 def remove_latex(string):
-    latex = r'\s(\$\$?)[^\$]*?\1\S*'
-    string = regex.sub(latex,' LATEX ',string)
-    return string
 def cleanse(string):
-    string = string.replace('\n',' ')
     string = remove_latex_accents(string)
     string = remove_env(string)
     string = remove_accents(string)
     string = remove_latex(string)
     return string
-##
 def find_hyph(text):
-    pattern = r'(?<!-)\b(?:\w+)(?=-)(?:-(?=\w)\w+)+(?!-)\b'
-    keywords = regex.findall(pattern,text)
     if keywords == []:
         return None
@@ -242,17 +284,14 @@ def find_hyph(text):
         return list(set(keywords))
-def find_msc(cat_list):
-    pattern = r'\b\d{2}[0-9a-zA-Z]{3}\b'
-    out = []
-    for cat in cat_list:
-        tags = regex.findall(pattern,cat)
-        for tag in tags:
-            out.append(tag)
-    return out
 def msc_tags():
-    with open('./data/msc.json','r') as file:
         text = file.read()
         return json.loads(text)
@@ -268,33 +307,34 @@ def cats_to_msc(cat_list):
         return None
     else:
         return out
 ##
 def msc_encoded_dict():
-    encoded_tags = pd.read_parquet('./data/msc_mini_embeddings.parquet').to_numpy()
-    return {k : v for (k,v) in zip(msc_tags().values(), encoded_tags)}
 def doc_encoded_dict():
-    library_embeddings = pd.read_parquet('./data/APSP_mini_vec.parquet')
     docs = library_embeddings.docs.to_list()
     encoded_docs = library_embeddings.vecs.to_numpy()
-    return {k : v for (k,v) in zip(docs , encoded_docs)}
-def score_tags(processed_arxiv_row):
     tag_list = processed_arxiv_row.msc_tags
     title_plus_abstract = processed_arxiv_row.docs
     if tag_list is None:
         return None
     embedded_msc_tags = [msc_encoded_dict()[tag] for tag in tag_list]
     return sentence_transformers.util.semantic_search(
         query_embeddings=doc_encoded_dict()[title_plus_abstract],
         corpus_embeddings=embedded_msc_tags,
-        )[0]

 import sentence_transformers.util
 import os
 def main(raw_metadata_df, path_to_embeddings):
     clean_metadata_df = pd.DataFrame(
+        columns=["sentences", "authors", "msc_tags", "msc_cos_sim"]
+    )
     clean_title = raw_metadata_df.title.apply(cleanse)
     clean_abstract = raw_metadata_df.summary.apply(cleanse)
+    clean_metadata_df.sentences = clean_title + " " + clean_abstract
     clean_metadata_df.authors = raw_metadata_df.authors
     clean_metadata_df.msc_tags = raw_metadata_df.categories.apply(cats_to_msc)
     return clean_metadata_df
 ##
 def category_map():
     """Maps arXiv subject categories to their full english names.
         Note that the list is not exhaustive in the sense that many categories have aliases that
         are not included. (Some are, e.g. math.MP and math-ph).
     """
+    return {
+        "astro-ph": "Astrophysics",
+        "astro-ph.CO": "Cosmology and Nongalactic Astrophysics",
+        "astro-ph.EP": "Earth and Planetary Astrophysics",
+        "astro-ph.GA": "Astrophysics of Galaxies",
+        "astro-ph.HE": "High Energy Astrophysical Phenomena",
+        "astro-ph.IM": "Instrumentation and Methods for Astrophysics",
+        "astro-ph.SR": "Solar and Stellar Astrophysics",
+        "cond-mat.dis-nn": "Disordered Systems and Neural Networks",
+        "cond-mat.mes-hall": "Mesoscale and Nanoscale Physics",
+        "cond-mat.mtrl-sci": "Materials Science",
+        "cond-mat.other": "Other Condensed Matter",
+        "cond-mat.quant-gas": "Quantum Gases",
+        "cond-mat.soft": "Soft Condensed Matter",
+        "cond-mat.stat-mech": "Statistical Mechanics",
+        "cond-mat.str-el": "Strongly Correlated Electrons",
+        "cond-mat.supr-con": "Superconductivity",
+        "cond-mat": "Condensed Matter",
+        "cs.AI": "Artificial Intelligence",
+        "cs.AR": "Hardware Architecture",
+        "cs.CC": "Computational Complexity",
+        "cs.CE": "Computational Engineering, Finance, and Science",
+        "cs.CG": "Computational Geometry",
+        "cs.CL": "Computation and Language",
+        "cs.CR": "Cryptography and Security",
+        "cs.CV": "Computer Vision and Pattern Recognition",
+        "cs.CY": "Computers and Society",
+        "cs.DB": "Databases",
+        "cs.DC": "Distributed, Parallel, and Cluster Computing",
+        "cs.DL": "Digital Libraries",
+        "cs.DM": "Discrete Mathematics",
+        "cs.DS": "Data Structures and Algorithms",
+        "cs.ET": "Emerging Technologies",
+        "cs.FL": "Formal Languages and Automata Theory",
+        "cs.GL": "General Literature",
+        "cs.GR": "Graphics",
+        "cs.GT": "Computer Science and Game Theory",
+        "cs.HC": "Human-Computer Interaction",
+        "cs.IR": "Information Retrieval",
+        "cs.IT": "Information Theory",
+        "cs.LG": "Machine Learning",
+        "cs.LO": "Logic in Computer Science",
+        "cs.MA": "Multiagent Systems",
+        "cs.MM": "Multimedia",
+        "cs.MS": "Mathematical Software",
+        "cs.NA": "Numerical Analysis",
+        "cs.NE": "Neural and Evolutionary Computing",
+        "cs.NI": "Networking and Internet Architecture",
+        "cs.OH": "Other Computer Science",
+        "cs.OS": "Operating Systems",
+        "cs.PF": "Performance",
+        "cs.PL": "Programming Languages",
+        "cs.RO": "Robotics",
+        "cs.SC": "Symbolic Computation",
+        "cs.SD": "Sound",
+        "cs.SE": "Software Engineering",
+        "cs.SI": "Social and Information Networks",
+        "cs.SY": "Systems and Control",
+        "econ.EM": "Econometrics",
+        "econ.GN": "General Economics",
+        "econ.TH": "Theoretical Economics",
+        "eess.AS": "Audio and Speech Processing",
+        "eess.IV": "Image and Video Processing",
+        "eess.SP": "Signal Processing",
+        "eess.SY": "Systems and Control",
+        "dg-ga": "Differential Geometry",
+        "gr-qc": "General Relativity and Quantum Cosmology",
+        "hep-ex": "High Energy Physics - Experiment",
+        "hep-lat": "High Energy Physics - Lattice",
+        "hep-ph": "High Energy Physics - Phenomenology",
+        "hep-th": "High Energy Physics - Theory",
+        "math.AC": "Commutative Algebra",
+        "math.AG": "Algebraic Geometry",
+        "math.AP": "Analysis of PDEs",
+        "math.AT": "Algebraic Topology",
+        "math.CA": "Classical Analysis and ODEs",
+        "math.CO": "Combinatorics",
+        "math.CT": "Category Theory",
+        "math.CV": "Complex Variables",
+        "math.DG": "Differential Geometry",
+        "math.DS": "Dynamical Systems",
+        "math.FA": "Functional Analysis",
+        "math.GM": "General Mathematics",
+        "math.GN": "General Topology",
+        "math.GR": "Group Theory",
+        "math.GT": "Geometric Topology",
+        "math.HO": "History and Overview",
+        "math.IT": "Information Theory",
+        "math.KT": "K-Theory and Homology",
+        "math.LO": "Logic",
+        "math.MG": "Metric Geometry",
+        "math.MP": "Mathematical Physics",
+        "math.NA": "Numerical Analysis",
+        "math.NT": "Number Theory",
+        "math.OA": "Operator Algebras",
+        "math.OC": "Optimization and Control",
+        "math.PR": "Probability",
+        "math.QA": "Quantum Algebra",
+        "math.RA": "Rings and Algebras",
+        "math.RT": "Representation Theory",
+        "math.SG": "Symplectic Geometry",
+        "math.SP": "Spectral Theory",
+        "math.ST": "Statistics Theory",
+        "math-ph": "Mathematical Physics",
+        "funct-an": "Functional Analysis",
+        "alg-geom": "Algebraic Geometry",
+        "nlin.AO": "Adaptation and Self-Organizing Systems",
+        "chao-dyn": "Chaotic Dynamics",
+        "nlin.CD": "Chaotic Dynamics",
+        "nlin.CG": "Cellular Automata and Lattice Gases",
+        "nlin.PS": "Pattern Formation and Solitons",
+        "nlin.SI": "Exactly Solvable and Integrable Systems",
+        "nucl-ex": "Nuclear Experiment",
+        "nucl-th": "Nuclear Theory",
+        "physics.acc-ph": "Accelerator Physics",
+        "physics.ao-ph": "Atmospheric and Oceanic Physics",
+        "physics.app-ph": "Applied Physics",
+        "physics.atm-clus": "Atomic and Molecular Clusters",
+        "physics.atom-ph": "Atomic Physics",
+        "physics.bio-ph": "Biological Physics",
+        "physics.chem-ph": "Chemical Physics",
+        "physics.class-ph": "Classical Physics",
+        "physics.comp-ph": "Computational Physics",
+        "physics.data-an": "Data Analysis, Statistics and Probability",
+        "physics.ed-ph": "Physics Education",
+        "physics.flu-dyn": "Fluid Dynamics",
+        "physics.gen-ph": "General Physics",
+        "physics.geo-ph": "Geophysics",
+        "physics.hist-ph": "History and Philosophy of Physics",
+        "physics.ins-det": "Instrumentation and Detectors",
+        "physics.med-ph": "Medical Physics",
+        "physics.optics": "Optics",
+        "physics.plasm-ph": "Plasma Physics",
+        "physics.pop-ph": "Popular Physics",
+        "physics.soc-ph": "Physics and Society",
+        "physics.space-ph": "Space Physics",
+        "q-bio.BM": "Biomolecules",
+        "q-bio.CB": "Cell Behavior",
+        "q-bio.GN": "Genomics",
+        "q-bio.MN": "Molecular Networks",
+        "q-bio.NC": "Neurons and Cognition",
+        "q-bio.OT": "Other Quantitative Biology",
+        "q-bio.PE": "Populations and Evolution",
+        "q-bio.QM": "Quantitative Methods",
+        "q-bio.SC": "Subcellular Processes",
+        "q-bio.TO": "Tissues and Organs",
+        "q-fin.CP": "Computational Finance",
+        "q-fin.EC": "Economics",
+        "q-fin.GN": "General Finance",
+        "q-fin.MF": "Mathematical Finance",
+        "q-fin.PM": "Portfolio Management",
+        "q-fin.PR": "Pricing of Securities",
+        "q-fin.RM": "Risk Management",
+        "q-fin.ST": "Statistical Finance",
+        "q-fin.TR": "Trading and Market Microstructure",
+        "quant-ph": "Quantum Physics",
+        "q-alg": "Quantum Algebra",
+        "stat.AP": "Applications",
+        "stat.CO": "Computation",
+        "stat.ME": "Methodology",
+        "stat.ML": "Machine Learning",
+        "stat.OT": "Other Statistics",
+        "stat.TH": "Statistics Theory",
+    }
+def split_categories_by_row(raw_metadata_row):
+    """Takes in row of a dataframe returned by an arxiv query search, returns a tuple with the list
+    of arXiv subject tags in the first slot, msc_tags in the second slot.
+    Args:
+        raw_metadata_row: row of a dataframe returned by an arXiv query request
+    Returns:
+        (x , y): x and y are lists; x is a list of arxiv subjects, y is a list of msc_tags.
+    """
+    categories = raw_metadata_row.categories
+    expanded_categories = pd.Series(categories)
+    arxiv_subject_labels = category_map()
+    if expanded_categories.isin(arxiv_subject_labels.keys()).all():
+        return (raw_metadata_row.categories, None)
+    else:
+        msc_tags = find_msc(raw_metadata_row.categories[-1])
+        return (raw_metadata_row.categories[:-2], msc_tags)
+def extract_tags(raw_metadata, arxiv_tag):
+    split_categories = raw_metadata.apply(split_categories_by_row, axis=0)
+    flag = 1
+    if arxiv_tag:
+        flag = 0
+    return split_categories.apply(lambda x: x[flag])
 ## 1. Latin-ize latex accents enclosed in brackets
 def remove_latex_accents(string):
+    accent = r"\\[\'\"\^\`H\~ckl=bdruvtoi]\{([a-z])\}"
+    replacement = r"\1"
+    string = regex.sub(accent, replacement, string)
     return string
 ## 2. Remove latex environments
 def remove_env(string):
+    env = r"\\[a-z]{2,}{[^{}]+?}"
+    string = regex.sub(env, "", string)
     return string
 ## 3. Latin-ize non-{} enclosed latex accents:
 def remove_accents(string):
+    accent = r"\\[\'\"\^\`H\~ckl=bdruvtoi]([a-z])"
+    replacement = r"\1"
+    string = regex.sub(accent, replacement, string)
+    return string
 ## 4. ONLY remove latex'd math that is separated as a 'word' i.e. has space characters on either side of it.
 def remove_latex(string):
+    latex = r"\s(\$\$?)[^\$]*?\1\S*"
+    string = regex.sub(latex, " LATEX ", string)
+    return string
 def cleanse(string):
+    string = string.replace("\n", " ")
     string = remove_latex_accents(string)
     string = remove_env(string)
     string = remove_accents(string)
     string = remove_latex(string)
     return string
+##
 def find_hyph(text):
+    pattern = r"(?<!-)\b(?:\w+)(?=-)(?:-(?=\w)\w+)+(?!-)\b"
+    keywords = regex.findall(pattern, text)
     if keywords == []:
         return None
         return list(set(keywords))
+def find_msc(msc_string):
+    pattern = r"\b\d{2}[0-9a-zA-Z]{3}\b"
+    tags = regex.findall(pattern, msc_string)
+    return tags
 def msc_tags():
+    with open("./data/msc.json", "r") as file:
         text = file.read()
         return json.loads(text)
         return None
     else:
         return out
 ##
 def msc_encoded_dict():
+    encoded_tags = pd.read_parquet("./data/msc_mini_embeddings.parquet").to_numpy()
+    return {k: v for (k, v) in zip(msc_tags().values(), encoded_tags)}
 def doc_encoded_dict():
+    library_embeddings = pd.read_parquet("./data/APSP_mini_vec.parquet")
     docs = library_embeddings.docs.to_list()
     encoded_docs = library_embeddings.vecs.to_numpy()
+    return {k: v for (k, v) in zip(docs, encoded_docs)}
+def score_tags(processed_arxiv_row):
     tag_list = processed_arxiv_row.msc_tags
     title_plus_abstract = processed_arxiv_row.docs
     if tag_list is None:
         return None
     embedded_msc_tags = [msc_encoded_dict()[tag] for tag in tag_list]
     return sentence_transformers.util.semantic_search(
         query_embeddings=doc_encoded_dict()[title_plus_abstract],
         corpus_embeddings=embedded_msc_tags,
+    )[0]

data_storage.py CHANGED Viewed

@@ -2,38 +2,31 @@ import arxiv
 import pandas as pd
 import data_cleaning as clean
 from sklearn.preprocessing import MultiLabelBinarizer
 class ArXivData:
     """A light class for storing the metadata of a collection of arXiv papers."""
     def __init__(self):
-        """
-        data: dataframe holding the metadata. Each row represents a paper and each column is
-        a separate piece of metadata.
-        query: A tuple of the form (query_string,max_results) where query_string is the formatted
-        string that produced the raw data and max_results is the value of that parameter passed to the
-        arXiv API.
-        raw: The original, raw dataset as returned by the arXiv API, if current data is clean.
-        cats: A DataFrame containing one-hot-encoded categories of the self.data DataFrame.
-        """
-        self.data = None
-        self.query = None
-        self.categories = None
-    def load_from_file():
-        pass
-    def load_from_query(self, query_string, max_results, offset):
-        self.data = query_to_df(
             query=query_string, max_results=max_results, offset=offset
         )
-        self.query = (query_string, max_results)
-        # self.categories = self.get_OHE_cats()
     def clean(self, dataset):
         """Constructs this dataset by cleaning another one.
@@ -46,11 +39,14 @@ class ArXivData:
         self.raw = dataset.raw
         self.categories = dataset.categories
-    def get_OHE_cats(self):
         mlb = MultiLabelBinarizer()
-        OHE_category_array = mlb.fit_transform(self.data.categories)
-        return pd.DataFrame(OHE_category_array, columns=mlb.classes_).rename(
-            mapper=clean.category_map()
         )
@@ -117,6 +113,12 @@ def query_to_df(query, max_results, offset):
         for result in results
     )
-    metadata_dataframe = pd.DataFrame(metadata_generator, columns=columns, index=index)
-    return metadata_dataframe

 import pandas as pd
 import data_cleaning as clean
 from sklearn.preprocessing import MultiLabelBinarizer
+import os
 class ArXivData:
     """A light class for storing the metadata of a collection of arXiv papers."""
     def __init__(self):
+        self.metadata = None
+        self.arxiv_subjects = None
+        self._returned_metadata = None
+    def load_from_file(self, dataset_file_name, path_to_data_dir):
+        path_to_dataset = os.path.join(path_to_data_dir, dataset_file_name)
+        self._returned_metadata = pd.read_feather(path_to_dataset)
+        self.arxiv_subjects = self.get_OHE_arxiv_subjects(self._returned_metadata)
+        self.metadata = self._returned_metadata.drop(columns=["arxiv_subjects"])
+    def load_from_query(self, query_string, max_results, offset=0):
+        self._returned_metadata = query_to_df(
             query=query_string, max_results=max_results, offset=offset
         )
+        self.metadata = self._returned_metadata.drop(columns="arxiv_subjects")
+        self.arxiv_subjects = self.get_OHE_arxiv_subjects(self._returned_metadata)
     def clean(self, dataset):
         """Constructs this dataset by cleaning another one.
         self.raw = dataset.raw
         self.categories = dataset.categories
+    def get_OHE_arxiv_subjects(returned_metadata):
         mlb = MultiLabelBinarizer()
+        OHE_arxiv_subjects_array = mlb.fit_transform(returned_metadata.arxiv_subjects)
+        arxiv_subject_labels = clean.category_map()
+        return pd.DataFrame(OHE_arxiv_subjects_array, columns=mlb.classes_).rename(
+            columns=arxiv_subject_labels
         )
         for result in results
     )
+    raw_metadata = pd.DataFrame(metadata_generator, columns=columns, index=index)
+    returned_metadata = raw_metadata.copy().drop(columns=["categories"])
+    returned_metadata["arxiv_subjects"] = clean.extract_tags(
+        raw_metadata, arxiv_tag=True
+    )
+    returned_metadata["msc_tags"] = clean.extract_tags(raw_metadata, arxiv_tag=False)
+    return returned_metadata