Michael-Geis committed on
Commit
6fbaa28
1 Parent(s): 4df760a

changing naming convention

Browse files
Files changed (4) hide show
  1. cleaning/__init__.py +0 -0
  2. cleaning/cleaning.py +119 -0
  3. collection.ipynb +65 -47
  4. util.py +1 -4
cleaning/__init__.py ADDED
File without changes
cleaning/cleaning.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import regex
2
+ import pandas as pd
3
+ import json
4
+ import sentence_transformers.util
5
+ import os
6
+
7
def main(raw_metadata_df, path_to_embeddings):
    """Build the cleaned metadata table from raw arXiv metadata.

    Args:
        raw_metadata_df: DataFrame providing the ``title``, ``summary``,
            ``authors`` and ``categories`` columns.
        path_to_embeddings: Accepted for interface compatibility; this
            function does not read it.

    Returns:
        A DataFrame indexed like ``raw_metadata_df`` with the columns
        ``sentences`` (cleaned title, a space, cleaned abstract),
        ``authors``, ``msc_tags`` and ``msc_cos_sim``. ``msc_cos_sim`` is
        only a NaN placeholder; nothing in this function fills it.
    """
    # Bracket indexing is used throughout: attribute-style column access
    # silently resolves to a DataFrame attribute or method whenever a column
    # name collides with one.
    clean_title = raw_metadata_df['title'].apply(cleanse)
    clean_abstract = raw_metadata_df['summary'].apply(cleanse)

    clean_metadata_df = pd.DataFrame(
        {
            'sentences': clean_title + ' ' + clean_abstract,
            'authors': raw_metadata_df['authors'],
            'msc_tags': raw_metadata_df['categories'].apply(cats_to_msc),
        }
    )
    clean_metadata_df['msc_cos_sim'] = float('nan')

    return clean_metadata_df
19
+
20
+ ##
21
+
22
+
23
+ ## 1. Latin-ize LaTeX accents whose letter is enclosed in braces, e.g. \'{e} -> e
24
def remove_latex_accents(string):
    r"""Strip LaTeX accent commands whose argument is one braced letter.

    For example ``\'{e}`` becomes ``e`` and ``\"{O}`` becomes ``O``.

    Args:
        string: Text that may contain accent commands.

    Returns:
        The text with every ``\<accent>{<letter>}`` replaced by ``<letter>``.
    """
    # Upper-case letters and the dot accent (``\.``) are accepted as well, so
    # capitalised words such as ``\'{E}tale`` are latinised too.
    accent = r'\\[\'\"\^\`H\~ckl=bdruvtoi.]\{([a-zA-Z])\}'
    return regex.sub(accent, r'\1', string)
30
+
31
+ ## 2. Remove LaTeX commands together with their braced argument, e.g. \begin{equation} or \cite{key}
32
def remove_env(string):
    r"""Delete LaTeX commands together with their braced argument.

    A match is a backslash, at least two lower-case letters, and one
    non-empty brace group without nested braces (``\begin{equation}``,
    ``\cite{key}``, ...). The whole match, argument included, is dropped.
    """
    return regex.sub(r'\\[a-z]{2,}{[^{}]+?}', '', string)
37
+
38
+ ## 3. Latin-ize non-{} enclosed latex accents:
39
def remove_accents(string):
    r"""Strip LaTeX accent commands that are written without braces.

    Two spellings are handled:

    * symbol accents placed directly before the letter, e.g. ``\'e``, ``\"O``;
    * letter-named accents separated from the letter by whitespace, e.g.
      ``\c c`` or ``\H o``.

    A letter-named accent glued to further letters is a different command
    (``\beta``, ``\to``, ``\chi`` ...) and is deliberately left untouched, as
    are ``\l``, ``\o`` and ``\i``, which are letters rather than accents.

    Args:
        string: Text that may contain accent commands.

    Returns:
        The text with each matched accent command replaced by its letter.
    """
    symbol_accent = r'\\[\'\"\^\`\~=.]([a-zA-Z])'
    # Whitespace is mandatory here; without it the "accent" is really the
    # first letter of a longer command name.
    letter_accent = r'\\[Hckbdruvt]\s+([a-zA-Z])'

    string = regex.sub(symbol_accent, r'\1', string)
    return regex.sub(letter_accent, r'\1', string)
45
+
46
+ ## 4. ONLY remove latex'd math that is separated as a 'word' i.e. has space characters on either side of it.
47
+
48
def remove_latex(string):
    r"""Replace stand-alone math with the placeholder `` LATEX ``.

    A math span (``$...$`` or ``$$...$$``) is replaced only when it begins a
    "word": at the start of the string or directly after whitespace. Any
    non-space characters glued to the closing delimiter (for instance
    trailing punctuation) are consumed with it. Math attached to the end of
    a word, such as ``L$^p$``, is left in place.

    Args:
        string: Text that may contain dollar-delimited math.

    Returns:
        The text with each matched span, including the whitespace character
        in front of it, replaced by ``' LATEX '``.
    """
    # ``(?:^|\s)`` also accepts the start of the string; demanding a literal
    # whitespace character would skip text that opens with math.
    latex = r'(?:^|\s)(\$\$?)[^\$]*?\1\S*'
    return regex.sub(latex, ' LATEX ', string)
52
+
53
+
54
def cleanse(string):
    """Run the whole text-cleaning pipeline on one string.

    Newlines are turned into spaces first; afterwards the cleaning steps run
    in a fixed order: braced accents, command-with-argument removal,
    unbraced accents, and finally math replacement.
    """
    cleaned = string.replace('\n', ' ')
    pipeline = (remove_latex_accents, remove_env, remove_accents, remove_latex)
    for step in pipeline:
        cleaned = step(cleaned)
    return cleaned
61
+
62
+ ##
63
+
64
def find_msc(cat_list):
    """Collect every five-character code found in the category strings.

    A code is two digits followed by three ASCII alphanumerics, delimited by
    word boundaries. Codes are returned in order of appearance; the list is
    empty when nothing matches.
    """
    code_pattern = r'\b\d{2}[0-9a-zA-Z]{3}\b'
    return [
        code
        for entry in cat_list
        for code in regex.findall(code_pattern, entry)
    ]
72
+
73
+ def msc_tags():
74
+ with open('./data/msc.json','r') as file:
75
+ text = file.read()
76
+ return json.loads(text)
77
+
78
+
79
def cats_to_msc(cat_list):
    """Translate the codes found in ``cat_list`` through the MSC table.

    Args:
        cat_list: Iterable of category strings, searched with ``find_msc``.

    Returns:
        The table entries for every code that is a key of ``msc_tags()``,
        in order of appearance, or ``None`` when no code is recognised.
    """
    # Load the table once per call; every msc_tags() call re-reads and
    # re-parses the JSON file.
    tag_table = msc_tags()
    out = [tag_table[tag] for tag in find_msc(cat_list) if tag in tag_table]
    return out or None
90
+
91
+
92
+ ##
93
+
94
def msc_encoded_dict():
    """Pair each value of ``msc_tags()`` with a row of the embeddings file.

    Rows of ``./data/msc_mini_embeddings.parquet`` are matched positionally
    with the table's values; pairing stops at the shorter of the two.
    """
    embedding_rows = pd.read_parquet('./data/msc_mini_embeddings.parquet').to_numpy()
    tag_names = msc_tags().values()
    return dict(zip(tag_names, embedding_rows))
97
+
98
def doc_encoded_dict():
    """Map each entry of the ``docs`` column to its ``vecs`` entry.

    Both columns are read from ``./data/APSP_mini_vec.parquet`` and paired
    row by row.
    """
    frame = pd.read_parquet('./data/APSP_mini_vec.parquet')
    return dict(zip(frame.docs.to_list(), frame.vecs.to_numpy()))
105
+
106
def score_tags(processed_arxiv_row):
    """Score a row's MSC tags against the row's document embedding.

    Args:
        processed_arxiv_row: Object exposing ``msc_tags`` (a list of tag
            names or ``None``) and ``docs`` (the key used to look up the
            document embedding).

    Returns:
        ``None`` when the row has no tags; otherwise the first element of
        the ``sentence_transformers.util.semantic_search`` result for the
        document embedding queried against the tag embeddings.
    """
    tag_list = processed_arxiv_row.msc_tags
    # NOTE(review): main() in this module names its text column
    # 'sentences', not 'docs' -- confirm which frame this row comes from.
    title_plus_abstract = processed_arxiv_row.docs

    if tag_list is None:
        return None

    # Build the lookup once; each msc_encoded_dict() call re-reads a parquet
    # file and the JSON table.
    tag_vectors = msc_encoded_dict()
    embedded_msc_tags = [tag_vectors[tag] for tag in tag_list]

    return sentence_transformers.util.semantic_search(
        query_embeddings=doc_encoded_dict()[title_plus_abstract],
        corpus_embeddings=embedded_msc_tags,
    )[0]
119
+
collection.ipynb CHANGED
@@ -35813,7 +35813,25 @@
35813
  },
35814
  {
35815
  "cell_type": "code",
35816
- "execution_count": 22,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35817
  "metadata": {},
35818
  "outputs": [
35819
  {
@@ -35837,14 +35855,11 @@
35837
  " <thead>\n",
35838
  " <tr style=\"text-align: right;\">\n",
35839
  " <th></th>\n",
35840
- " <th>title</th>\n",
35841
- " <th>summary</th>\n",
35842
  " <th>authors</th>\n",
35843
- " <th>primary_category</th>\n",
35844
- " <th>categories</th>\n",
35845
- " <th>hyph_in_summary</th>\n",
35846
- " <th>hyph_in_title</th>\n",
35847
  " <th>msc_tags</th>\n",
 
35848
  " </tr>\n",
35849
  " </thead>\n",
35850
  " <tbody>\n",
@@ -35853,87 +35868,90 @@
35853
  " <td>The Laplace spectrum on conformally compact manifolds</td>\n",
35854
  " <td>We consider the spectrum of the Laplace operator acting on LATEX over a conformally compact manifold for LATEX We prove that for LATEX this spectrum always contains an open region of the complex plane. We further show that the spectrum is contained within a certain parabolic region of the complex plane. These regions depend on the value of LATEX the dimension of the manifold, and the values of the sectional curvatures approaching the boundary.</td>\n",
35855
  " <td>[Nelia Charalambous, Julie Rowlett]</td>\n",
35856
- " <td>math.SP</td>\n",
35857
- " <td>[math.SP, 58c40]</td>\n",
35858
- " <td>None</td>\n",
35859
- " <td>None</td>\n",
35860
  " <td>None</td>\n",
 
35861
  " </tr>\n",
35862
  " <tr>\n",
35863
  " <th>1</th>\n",
35864
  " <td>On the inviscid limit connecting Brinkman's and Darcy's models of tissue growth with nonlinear pressure</td>\n",
35865
  " <td>Several recent papers have addressed modelling of the tissue growth by the multi-phase models where the velocity is related to the pressure by one of the physical laws (Stoke's, Brinkman's or Darcy's). While each of these models has been extensively studied, not so much is known about the connection between them. In the recent paper (arXiv:2303.10620), assuming the linear form of the pressure, the Authors connected two multi-phase models by an inviscid limit: the viscoelastic one (of Brinkman's type) and the inviscid one (of Darcy's type). Here, we prove that the same is true for a nonlinear, power-law pressure. The new ingredient is that we use relation between the pressure LATEX and the Brinkman potential LATEX to deduce compactness in space of LATEX from the compactness in space of LATEX</td>\n",
35866
  " <td>[Charles Elbar, Jakub Skrzeczkowski]</td>\n",
35867
- " <td>math.AP</td>\n",
35868
- " <td>[math.AP, 35K45, 35K65, 35J60, 35Q92, 92C10]</td>\n",
35869
- " <td>[power-law, multi-phase]</td>\n",
35870
- " <td>None</td>\n",
35871
- " <td>[Initial value problems for second-order parabolic systems]</td>\n",
35872
  " </tr>\n",
35873
  " <tr>\n",
35874
  " <th>2</th>\n",
35875
  " <td>A sparse approximation of the Lieb functional with moment constraints</td>\n",
35876
  " <td>The aim of this paper is to present new sparsity results about the so-called Lieb functional, which is a key quantity in Density Functional Theory for electronic structure calculations for molecules. The Lieb functional was actually shown by Lieb to be a convexification of the so-called Levy-Lieb functional. Given an electronic density for a system of LATEX electrons, which may be seen as a probability density on LATEX the value of the Lieb functional for this density is defined as the solution of a quantum multi-marginal optimal transport problem, which reads as a minimization problem defined on the set of trace-class operators acting on the space of electronic wave-functions that are anti-symmetric LATEX functions of LATEX with partial trace equal to the prescribed electronic density. We introduce a relaxation of this quantum optimal transport problem where the full partial trace constraint is replaced by a finite number of moment constraints on the partial trace of the set of operators. We show that, under mild assumptions on the electronic density, there exist sparse minimizers to the resulting moment constrained approximation of the Lieb (MCAL) functional that read as operators with rank at most equal to the number of moment constraints. We also prove under appropriate assumptions on the set of moment functions that the value of the MCAL functional converges to the value of the exact Lieb functional as the number of moments go to infinity. We also prove some rates of convergence on the associated approximation of the ground state energy. We finally study the mathematical properties of the associated dual problem.</td>\n",
35877
  " <td>[Virginie Ehrlacher, Luca Nenna]</td>\n",
35878
- " <td>math-ph</td>\n",
35879
- " <td>[math-ph, math.MP, math.OC, math.SP]</td>\n",
35880
- " <td>[so-called, anti-symmetric, Levy-Lieb, trace-class, multi-marginal, wave-functions]</td>\n",
35881
- " <td>None</td>\n",
35882
  " <td>None</td>\n",
 
35883
  " </tr>\n",
35884
  " <tr>\n",
35885
  " <th>3</th>\n",
35886
  " <td>Stationarity and Fredholm theory in subextremal Kerr-de Sitter spacetimes</td>\n",
35887
  " <td>In a recent paper, we proved that solutions to linear wave equations in a subextremal Kerr-de Sitter spacetime have asymptotic expansions in quasinormal modes up to a decay order given by the normally hyperbolic trapping, extending the existing results. One central ingredient in the argument was a new definition of quasinormal modes, where a non-standard choice of stationary Killing vector field had to be used in order for the Fredholm theory to be applicable. In this paper, we show that there is in fact a variety of allowed choices of stationary Killing vector fields. In particular, the horizon Killing vector fields work for the analysis, in which case one of the corresponding ergoregions is completely removed.</td>\n",
35888
  " <td>[Oliver Petersen, András Vasy]</td>\n",
35889
- " <td>math.AP</td>\n",
35890
- " <td>[math.AP, gr-qc, math.DG, 35L05, 35P25, 58J45, 83C30]</td>\n",
35891
- " <td>[non-standard, Kerr-de]</td>\n",
35892
- " <td>[Kerr-de]</td>\n",
35893
- " <td>[Wave equation]</td>\n",
35894
  " </tr>\n",
35895
  " <tr>\n",
35896
  " <th>4</th>\n",
35897
  " <td>Remarks on paper \"Two-term spectral asymptotics in linear elasticity''</td>\n",
35898
  " <td>In this note, by pointing out several serious mistakes in we show that the conclusions published by Matteo Capoferri, Leonid Friedlander, Michael Levitin and Dmitri Vassiliev (J Geom Anal (2023)33:242) are completely wrong. Then, we explain the correctness of proof of Theorem 1.1 in our paper by giving some remarks and putting the whole proof in Appendix (see also and ).</td>\n",
35899
  " <td>[Genqian Liu]</td>\n",
35900
- " <td>math.SP</td>\n",
35901
- " <td>[math.SP, math-ph, math.AP, math.DG, math.MP]</td>\n",
35902
- " <td>None</td>\n",
35903
- " <td>[Two-term]</td>\n",
35904
  " <td>None</td>\n",
 
35905
  " </tr>\n",
35906
  " </tbody>\n",
35907
  "</table>\n",
35908
  "</div>"
35909
  ],
35910
  "text/plain": [
35911
- " title ... msc_tags\n",
35912
- "0 The Laplace spectrum on conformally compact manifolds ... None \n",
35913
- "1 On the inviscid limit connecting Brinkman's and Darcy's models of tissue growth with nonlinear pressure ... [Initial value problems for second-order parabolic systems]\n",
35914
- "2 A sparse approximation of the Lieb functional with moment constraints ... None \n",
35915
- "3 Stationarity and Fredholm theory in subextremal Kerr-de Sitter spacetimes ... [Wave equation] \n",
35916
- "4 Remarks on paper \"Two-term spectral asymptotics in linear elasticity'' ... None \n",
35917
  "\n",
35918
- "[5 rows x 8 columns]"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35919
  ]
35920
  },
35921
- "execution_count": 22,
35922
  "metadata": {},
35923
  "output_type": "execute_result"
35924
  }
35925
  ],
35926
  "source": [
35927
- "lib.clean_lib.head()"
35928
- ]
35929
- },
35930
- {
35931
- "cell_type": "code",
35932
- "execution_count": 23,
35933
- "metadata": {},
35934
- "outputs": [],
35935
- "source": [
35936
- "lib.clean_lib.to_parquet('./data/clean_APSP.parquet')"
35937
  ]
35938
  }
35939
  ],
 
35813
  },
35814
  {
35815
  "cell_type": "code",
35816
+ "execution_count": 3,
35817
+ "metadata": {},
35818
+ "outputs": [],
35819
+ "source": [
35820
+ "from cleaning import cleaning\n",
35821
+ "import pandas as pd\n",
35822
+ "import importlib\n",
35823
+ "importlib.reload(cleaning)\n",
35824
+ "\n",
35825
+ "data = pd.read_parquet('./data/APSP.parquet')\n",
35826
+ "\n",
35827
+ "clean_data = cleaning.main(\n",
35828
+ " raw_arxiv_results=data,path_to_embeddings='./data/APSP_mini_vec.parquet'\n",
35829
+ ")\n"
35830
+ ]
35831
+ },
35832
+ {
35833
+ "cell_type": "code",
35834
+ "execution_count": 4,
35835
  "metadata": {},
35836
  "outputs": [
35837
  {
 
35855
  " <thead>\n",
35856
  " <tr style=\"text-align: right;\">\n",
35857
  " <th></th>\n",
35858
+ " <th>clean_title</th>\n",
35859
+ " <th>clean_abstract</th>\n",
35860
  " <th>authors</th>\n",
 
 
 
 
35861
  " <th>msc_tags</th>\n",
35862
+ " <th>msc_cos_sim</th>\n",
35863
  " </tr>\n",
35864
  " </thead>\n",
35865
  " <tbody>\n",
 
35868
  " <td>The Laplace spectrum on conformally compact manifolds</td>\n",
35869
  " <td>We consider the spectrum of the Laplace operator acting on LATEX over a conformally compact manifold for LATEX We prove that for LATEX this spectrum always contains an open region of the complex plane. We further show that the spectrum is contained within a certain parabolic region of the complex plane. These regions depend on the value of LATEX the dimension of the manifold, and the values of the sectional curvatures approaching the boundary.</td>\n",
35870
  " <td>[Nelia Charalambous, Julie Rowlett]</td>\n",
 
 
 
 
35871
  " <td>None</td>\n",
35872
+ " <td>NaN</td>\n",
35873
  " </tr>\n",
35874
  " <tr>\n",
35875
  " <th>1</th>\n",
35876
  " <td>On the inviscid limit connecting Brinkman's and Darcy's models of tissue growth with nonlinear pressure</td>\n",
35877
  " <td>Several recent papers have addressed modelling of the tissue growth by the multi-phase models where the velocity is related to the pressure by one of the physical laws (Stoke's, Brinkman's or Darcy's). While each of these models has been extensively studied, not so much is known about the connection between them. In the recent paper (arXiv:2303.10620), assuming the linear form of the pressure, the Authors connected two multi-phase models by an inviscid limit: the viscoelastic one (of Brinkman's type) and the inviscid one (of Darcy's type). Here, we prove that the same is true for a nonlinear, power-law pressure. The new ingredient is that we use relation between the pressure LATEX and the Brinkman potential LATEX to deduce compactness in space of LATEX from the compactness in space of LATEX</td>\n",
35878
  " <td>[Charles Elbar, Jakub Skrzeczkowski]</td>\n",
35879
+ " <td>[Initial value problems for second-order parabolic systems, Degenerate parabolic equations, Nonlinear elliptic equations, Biomechanics]</td>\n",
35880
+ " <td>NaN</td>\n",
 
 
 
35881
  " </tr>\n",
35882
  " <tr>\n",
35883
  " <th>2</th>\n",
35884
  " <td>A sparse approximation of the Lieb functional with moment constraints</td>\n",
35885
  " <td>The aim of this paper is to present new sparsity results about the so-called Lieb functional, which is a key quantity in Density Functional Theory for electronic structure calculations for molecules. The Lieb functional was actually shown by Lieb to be a convexification of the so-called Levy-Lieb functional. Given an electronic density for a system of LATEX electrons, which may be seen as a probability density on LATEX the value of the Lieb functional for this density is defined as the solution of a quantum multi-marginal optimal transport problem, which reads as a minimization problem defined on the set of trace-class operators acting on the space of electronic wave-functions that are anti-symmetric LATEX functions of LATEX with partial trace equal to the prescribed electronic density. We introduce a relaxation of this quantum optimal transport problem where the full partial trace constraint is replaced by a finite number of moment constraints on the partial trace of the set of operators. We show that, under mild assumptions on the electronic density, there exist sparse minimizers to the resulting moment constrained approximation of the Lieb (MCAL) functional that read as operators with rank at most equal to the number of moment constraints. We also prove under appropriate assumptions on the set of moment functions that the value of the MCAL functional converges to the value of the exact Lieb functional as the number of moments go to infinity. We also prove some rates of convergence on the associated approximation of the ground state energy. We finally study the mathematical properties of the associated dual problem.</td>\n",
35886
  " <td>[Virginie Ehrlacher, Luca Nenna]</td>\n",
 
 
 
 
35887
  " <td>None</td>\n",
35888
+ " <td>NaN</td>\n",
35889
  " </tr>\n",
35890
  " <tr>\n",
35891
  " <th>3</th>\n",
35892
  " <td>Stationarity and Fredholm theory in subextremal Kerr-de Sitter spacetimes</td>\n",
35893
  " <td>In a recent paper, we proved that solutions to linear wave equations in a subextremal Kerr-de Sitter spacetime have asymptotic expansions in quasinormal modes up to a decay order given by the normally hyperbolic trapping, extending the existing results. One central ingredient in the argument was a new definition of quasinormal modes, where a non-standard choice of stationary Killing vector field had to be used in order for the Fredholm theory to be applicable. In this paper, we show that there is in fact a variety of allowed choices of stationary Killing vector fields. In particular, the horizon Killing vector fields work for the analysis, in which case one of the corresponding ergoregions is completely removed.</td>\n",
35894
  " <td>[Oliver Petersen, András Vasy]</td>\n",
35895
+ " <td>[Wave equation, Scattering theory for PDEs, Hyperbolic equations on manifolds]</td>\n",
35896
+ " <td>NaN</td>\n",
 
 
 
35897
  " </tr>\n",
35898
  " <tr>\n",
35899
  " <th>4</th>\n",
35900
  " <td>Remarks on paper \"Two-term spectral asymptotics in linear elasticity''</td>\n",
35901
  " <td>In this note, by pointing out several serious mistakes in we show that the conclusions published by Matteo Capoferri, Leonid Friedlander, Michael Levitin and Dmitri Vassiliev (J Geom Anal (2023)33:242) are completely wrong. Then, we explain the correctness of proof of Theorem 1.1 in our paper by giving some remarks and putting the whole proof in Appendix (see also and ).</td>\n",
35902
  " <td>[Genqian Liu]</td>\n",
 
 
 
 
35903
  " <td>None</td>\n",
35904
+ " <td>NaN</td>\n",
35905
  " </tr>\n",
35906
  " </tbody>\n",
35907
  "</table>\n",
35908
  "</div>"
35909
  ],
35910
  "text/plain": [
35911
+ " clean_title \\\n",
35912
+ "0 The Laplace spectrum on conformally compact manifolds \n",
35913
+ "1 On the inviscid limit connecting Brinkman's and Darcy's models of tissue growth with nonlinear pressure \n",
35914
+ "2 A sparse approximation of the Lieb functional with moment constraints \n",
35915
+ "3 Stationarity and Fredholm theory in subextremal Kerr-de Sitter spacetimes \n",
35916
+ "4 Remarks on paper \"Two-term spectral asymptotics in linear elasticity'' \n",
35917
  "\n",
35918
+ " clean_abstract \\\n",
35919
+ "0 We consider the spectrum of the Laplace operator acting on LATEX over a conformally compact manifold for LATEX We prove that for LATEX this spectrum always contains an open region of the complex plane. We further show that the spectrum is contained within a certain parabolic region of the complex plane. These regions depend on the value of LATEX the dimension of the manifold, and the values of the sectional curvatures approaching the boundary. \n",
35920
+ "1 Several recent papers have addressed modelling of the tissue growth by the multi-phase models where the velocity is related to the pressure by one of the physical laws (Stoke's, Brinkman's or Darcy's). While each of these models has been extensively studied, not so much is known about the connection between them. In the recent paper (arXiv:2303.10620), assuming the linear form of the pressure, the Authors connected two multi-phase models by an inviscid limit: the viscoelastic one (of Brinkman's type) and the inviscid one (of Darcy's type). Here, we prove that the same is true for a nonlinear, power-law pressure. The new ingredient is that we use relation between the pressure LATEX and the Brinkman potential LATEX to deduce compactness in space of LATEX from the compactness in space of LATEX \n",
35921
+ "2 The aim of this paper is to present new sparsity results about the so-called Lieb functional, which is a key quantity in Density Functional Theory for electronic structure calculations for molecules. The Lieb functional was actually shown by Lieb to be a convexification of the so-called Levy-Lieb functional. Given an electronic density for a system of LATEX electrons, which may be seen as a probability density on LATEX the value of the Lieb functional for this density is defined as the solution of a quantum multi-marginal optimal transport problem, which reads as a minimization problem defined on the set of trace-class operators acting on the space of electronic wave-functions that are anti-symmetric LATEX functions of LATEX with partial trace equal to the prescribed electronic density. We introduce a relaxation of this quantum optimal transport problem where the full partial trace constraint is replaced by a finite number of moment constraints on the partial trace of the set of operators. We show that, under mild assumptions on the electronic density, there exist sparse minimizers to the resulting moment constrained approximation of the Lieb (MCAL) functional that read as operators with rank at most equal to the number of moment constraints. We also prove under appropriate assumptions on the set of moment functions that the value of the MCAL functional converges to the value of the exact Lieb functional as the number of moments go to infinity. We also prove some rates of convergence on the associated approximation of the ground state energy. We finally study the mathematical properties of the associated dual problem. \n",
35922
+ "3 In a recent paper, we proved that solutions to linear wave equations in a subextremal Kerr-de Sitter spacetime have asymptotic expansions in quasinormal modes up to a decay order given by the normally hyperbolic trapping, extending the existing results. One central ingredient in the argument was a new definition of quasinormal modes, where a non-standard choice of stationary Killing vector field had to be used in order for the Fredholm theory to be applicable. In this paper, we show that there is in fact a variety of allowed choices of stationary Killing vector fields. In particular, the horizon Killing vector fields work for the analysis, in which case one of the corresponding ergoregions is completely removed. \n",
35923
+ "4 In this note, by pointing out several serious mistakes in we show that the conclusions published by Matteo Capoferri, Leonid Friedlander, Michael Levitin and Dmitri Vassiliev (J Geom Anal (2023)33:242) are completely wrong. Then, we explain the correctness of proof of Theorem 1.1 in our paper by giving some remarks and putting the whole proof in Appendix (see also and ). \n",
35924
+ "\n",
35925
+ " authors \\\n",
35926
+ "0 [Nelia Charalambous, Julie Rowlett] \n",
35927
+ "1 [Charles Elbar, Jakub Skrzeczkowski] \n",
35928
+ "2 [Virginie Ehrlacher, Luca Nenna] \n",
35929
+ "3 [Oliver Petersen, András Vasy] \n",
35930
+ "4 [Genqian Liu] \n",
35931
+ "\n",
35932
+ " msc_tags \\\n",
35933
+ "0 None \n",
35934
+ "1 [Initial value problems for second-order parabolic systems, Degenerate parabolic equations, Nonlinear elliptic equations, Biomechanics] \n",
35935
+ "2 None \n",
35936
+ "3 [Wave equation, Scattering theory for PDEs, Hyperbolic equations on manifolds] \n",
35937
+ "4 None \n",
35938
+ "\n",
35939
+ " msc_cos_sim \n",
35940
+ "0 NaN \n",
35941
+ "1 NaN \n",
35942
+ "2 NaN \n",
35943
+ "3 NaN \n",
35944
+ "4 NaN "
35945
  ]
35946
  },
35947
+ "execution_count": 4,
35948
  "metadata": {},
35949
  "output_type": "execute_result"
35950
  }
35951
  ],
35952
  "source": [
35953
+ "pd.set_option('display.max_colwidth', 0)\n",
35954
+ "clean_data.head()"
 
 
 
 
 
 
 
 
35955
  ]
35956
  }
35957
  ],
util.py CHANGED
@@ -253,10 +253,7 @@ def find_msc(cat_list):
253
  tags = regex.findall(pattern,cat)
254
  for tag in tags:
255
  out.append(tag)
256
- if out == []:
257
- return None
258
- else:
259
- return out
260
 
261
 
262
  def format_query(author='',title='',cat='',abstract=''):
 
253
  tags = regex.findall(pattern,cat)
254
  for tag in tags:
255
  out.append(tag)
256
+ return out
 
 
 
257
 
258
 
259
  def format_query(author='',title='',cat='',abstract=''):