Spaces:
Runtime error
Runtime error
Michael-Geis
committed on
Commit
•
9b818c8
1
Parent(s):
d129f38
wrote class for holding arxiv data
Browse files- arxiv_query_retrieval.py +54 -0
- cleaning-abstracts.ipynb +221 -0
arxiv_query_retrieval.py
CHANGED
@@ -1,5 +1,59 @@
|
|
1 |
import arxiv
|
2 |
import pandas as pd
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
|
4 |
def format_query(author='',title='',cat='',abstract=''):
|
5 |
"""Returns a formatted arxiv query string to handle simple queries of at most one instance each of these fields. To leave a field unspecified,
|
|
|
1 |
import arxiv
|
2 |
import pandas as pd
|
3 |
+
import data_cleaning as clean
|
4 |
+
from sklearn.preprocessing import MultiLabelBinarizer
|
5 |
+
|
6 |
+
class ArXivData:
    """A light class for storing the metadata of a collection of arXiv papers.

    Attributes:
        data: DataFrame holding the metadata. Each row represents a paper and
            each column is a separate piece of metadata.
        query: A tuple of the form (query_string, max_results) where
            query_string is the formatted string that produced the raw data
            and max_results is the value of that parameter passed to the
            arXiv API.
        raw: The original, raw dataset as returned by the arXiv API, if the
            current data is clean.
        categories: A DataFrame containing one-hot-encoded categories of the
            self.data DataFrame.
    """

    def __init__(self):
        """Create an empty container; every attribute starts out as None."""
        self.data = None
        self.query = None
        self.raw = None
        self.categories = None

    def get_from_query(self, query_string, max_results):
        """Populate this object from the results of an arXiv query.

        Args:
            query_string: The formatted arXiv query string.
            max_results: Forwarded to ``query_to_df`` as ``max_results``.
        """
        # query_to_df is defined elsewhere in this module; presumably it
        # returns a pandas DataFrame -- confirm against its definition.
        self.data = query_to_df(query=query_string, max_results=max_results)
        self.query = (query_string, max_results)
        # Keep an independent copy so that later in-place edits of self.data
        # cannot silently change the record of the raw results.
        self.raw = self.data.copy()
        self.categories = self.get_OHE_cats()

    def clean(self, dataset):
        """Constructs this dataset by cleaning another one.

        Args:
            dataset: An ArXivData object containing data to be cleaned.
        """
        # Inside this method body the bare name ``clean`` resolves to the
        # module-level alias of ``data_cleaning``, not to this method.
        self.data = clean.clean(dataset)
        self.query = dataset.query
        self.raw = dataset.raw
        self.categories = dataset.categories

    def get_OHE_cats(self):
        """Return a one-hot-encoded DataFrame built from self.data.categories.

        Returns:
            A DataFrame with one column per distinct label found by the
            MultiLabelBinarizer, with the column labels passed through
            ``clean.category_map()``.
        """
        # NOTE(review): MultiLabelBinarizer expects each entry of
        # self.data.categories to be an iterable of labels -- confirm that
        # query_to_df produces that shape.
        mlb = MultiLabelBinarizer()
        OHE_category_array = mlb.fit_transform(self.data.categories)
        # rename(mapper=...) without an axis targets the index, which left the
        # column labels untouched; the columns are what hold the categories.
        return pd.DataFrame(OHE_category_array, columns=mlb.classes_).rename(
            columns=clean.category_map()
        )
|
53 |
+
|
54 |
+
|
55 |
+
|
56 |
+
|
57 |
|
58 |
def format_query(author='',title='',cat='',abstract=''):
|
59 |
"""Returns a formatted arxiv query string to handle simple queries of at most one instance each of these fields. To leave a field unspecified,
|
cleaning-abstracts.ipynb
CHANGED
@@ -6486,6 +6486,227 @@
|
|
6486 |
"source": [
|
6487 |
" Include the cleaning utilities applied to the data in the util file."
|
6488 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6489 |
}
|
6490 |
],
|
6491 |
"metadata": {
|
|
|
6486 |
"source": [
|
6487 |
" Include the cleaning utilities applied to the data in the util file."
|
6488 |
]
|
6489 |
+
},
|
6490 |
+
{
|
6491 |
+
"cell_type": "code",
|
6492 |
+
"execution_count": null,
|
6493 |
+
"metadata": {},
|
6494 |
+
"outputs": [],
|
6495 |
+
"source": []
|
6496 |
+
},
|
6497 |
+
{
|
6498 |
+
"cell_type": "code",
|
6499 |
+
"execution_count": 16,
|
6500 |
+
"metadata": {},
|
6501 |
+
"outputs": [],
|
6502 |
+
"source": [
|
6503 |
+
"import arxiv_query_retrieval\n",
|
6504 |
+
"from arxiv_query_retrieval import ArXivData\n",
|
6505 |
+
"import importlib\n",
|
6506 |
+
"importlib.reload(arxiv_query_retrieval)\n",
|
6507 |
+
"\n",
|
6508 |
+
"data = ArXivData()\n",
|
6509 |
+
"\n",
|
6510 |
+
"data.get_from_query(query_string='cat:math.AP',max_results=10)"
|
6511 |
+
]
|
6512 |
+
},
|
6513 |
+
{
|
6514 |
+
"cell_type": "code",
|
6515 |
+
"execution_count": 18,
|
6516 |
+
"metadata": {},
|
6517 |
+
"outputs": [
|
6518 |
+
{
|
6519 |
+
"data": {
|
6520 |
+
"text/html": [
|
6521 |
+
"<div>\n",
|
6522 |
+
"<style scoped>\n",
|
6523 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
6524 |
+
" vertical-align: middle;\n",
|
6525 |
+
" }\n",
|
6526 |
+
"\n",
|
6527 |
+
" .dataframe tbody tr th {\n",
|
6528 |
+
" vertical-align: top;\n",
|
6529 |
+
" }\n",
|
6530 |
+
"\n",
|
6531 |
+
" .dataframe thead th {\n",
|
6532 |
+
" text-align: right;\n",
|
6533 |
+
" }\n",
|
6534 |
+
"</style>\n",
|
6535 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
6536 |
+
" <thead>\n",
|
6537 |
+
" <tr style=\"text-align: right;\">\n",
|
6538 |
+
" <th></th>\n",
|
6539 |
+
" <th>35K55, 35K65, 35Q74, 74A30, 35A01, 76S05, 35K51, 74B20</th>\n",
|
6540 |
+
" <th>35K58, 49K20, 90C46, 92D25</th>\n",
|
6541 |
+
" <th>35L05, 35L71, 35B44</th>\n",
|
6542 |
+
" <th>35P30, 35A15, 53C17</th>\n",
|
6543 |
+
" <th>35Q53, 93D15, 93D30, 93C20</th>\n",
|
6544 |
+
" <th>35R25 (Primary) 42B35 35B30 (Secondary)</th>\n",
|
6545 |
+
" <th>39A12, 39A14, 39A22</th>\n",
|
6546 |
+
" <th>math.AP</th>\n",
|
6547 |
+
" <th>math.DS</th>\n",
|
6548 |
+
" <th>math.OC</th>\n",
|
6549 |
+
" </tr>\n",
|
6550 |
+
" </thead>\n",
|
6551 |
+
" <tbody>\n",
|
6552 |
+
" <tr>\n",
|
6553 |
+
" <th>0</th>\n",
|
6554 |
+
" <td>0</td>\n",
|
6555 |
+
" <td>0</td>\n",
|
6556 |
+
" <td>0</td>\n",
|
6557 |
+
" <td>1</td>\n",
|
6558 |
+
" <td>0</td>\n",
|
6559 |
+
" <td>0</td>\n",
|
6560 |
+
" <td>0</td>\n",
|
6561 |
+
" <td>1</td>\n",
|
6562 |
+
" <td>0</td>\n",
|
6563 |
+
" <td>0</td>\n",
|
6564 |
+
" </tr>\n",
|
6565 |
+
" <tr>\n",
|
6566 |
+
" <th>1</th>\n",
|
6567 |
+
" <td>0</td>\n",
|
6568 |
+
" <td>0</td>\n",
|
6569 |
+
" <td>1</td>\n",
|
6570 |
+
" <td>0</td>\n",
|
6571 |
+
" <td>0</td>\n",
|
6572 |
+
" <td>0</td>\n",
|
6573 |
+
" <td>0</td>\n",
|
6574 |
+
" <td>1</td>\n",
|
6575 |
+
" <td>0</td>\n",
|
6576 |
+
" <td>0</td>\n",
|
6577 |
+
" </tr>\n",
|
6578 |
+
" <tr>\n",
|
6579 |
+
" <th>2</th>\n",
|
6580 |
+
" <td>0</td>\n",
|
6581 |
+
" <td>0</td>\n",
|
6582 |
+
" <td>0</td>\n",
|
6583 |
+
" <td>0</td>\n",
|
6584 |
+
" <td>1</td>\n",
|
6585 |
+
" <td>0</td>\n",
|
6586 |
+
" <td>0</td>\n",
|
6587 |
+
" <td>1</td>\n",
|
6588 |
+
" <td>0</td>\n",
|
6589 |
+
" <td>1</td>\n",
|
6590 |
+
" </tr>\n",
|
6591 |
+
" <tr>\n",
|
6592 |
+
" <th>3</th>\n",
|
6593 |
+
" <td>0</td>\n",
|
6594 |
+
" <td>0</td>\n",
|
6595 |
+
" <td>0</td>\n",
|
6596 |
+
" <td>0</td>\n",
|
6597 |
+
" <td>0</td>\n",
|
6598 |
+
" <td>0</td>\n",
|
6599 |
+
" <td>0</td>\n",
|
6600 |
+
" <td>1</td>\n",
|
6601 |
+
" <td>0</td>\n",
|
6602 |
+
" <td>0</td>\n",
|
6603 |
+
" </tr>\n",
|
6604 |
+
" <tr>\n",
|
6605 |
+
" <th>4</th>\n",
|
6606 |
+
" <td>0</td>\n",
|
6607 |
+
" <td>0</td>\n",
|
6608 |
+
" <td>0</td>\n",
|
6609 |
+
" <td>0</td>\n",
|
6610 |
+
" <td>0</td>\n",
|
6611 |
+
" <td>0</td>\n",
|
6612 |
+
" <td>1</td>\n",
|
6613 |
+
" <td>1</td>\n",
|
6614 |
+
" <td>1</td>\n",
|
6615 |
+
" <td>0</td>\n",
|
6616 |
+
" </tr>\n",
|
6617 |
+
" <tr>\n",
|
6618 |
+
" <th>5</th>\n",
|
6619 |
+
" <td>0</td>\n",
|
6620 |
+
" <td>0</td>\n",
|
6621 |
+
" <td>0</td>\n",
|
6622 |
+
" <td>0</td>\n",
|
6623 |
+
" <td>0</td>\n",
|
6624 |
+
" <td>0</td>\n",
|
6625 |
+
" <td>0</td>\n",
|
6626 |
+
" <td>1</td>\n",
|
6627 |
+
" <td>0</td>\n",
|
6628 |
+
" <td>0</td>\n",
|
6629 |
+
" </tr>\n",
|
6630 |
+
" <tr>\n",
|
6631 |
+
" <th>6</th>\n",
|
6632 |
+
" <td>1</td>\n",
|
6633 |
+
" <td>0</td>\n",
|
6634 |
+
" <td>0</td>\n",
|
6635 |
+
" <td>0</td>\n",
|
6636 |
+
" <td>0</td>\n",
|
6637 |
+
" <td>0</td>\n",
|
6638 |
+
" <td>0</td>\n",
|
6639 |
+
" <td>1</td>\n",
|
6640 |
+
" <td>0</td>\n",
|
6641 |
+
" <td>0</td>\n",
|
6642 |
+
" </tr>\n",
|
6643 |
+
" <tr>\n",
|
6644 |
+
" <th>7</th>\n",
|
6645 |
+
" <td>0</td>\n",
|
6646 |
+
" <td>0</td>\n",
|
6647 |
+
" <td>0</td>\n",
|
6648 |
+
" <td>0</td>\n",
|
6649 |
+
" <td>0</td>\n",
|
6650 |
+
" <td>0</td>\n",
|
6651 |
+
" <td>0</td>\n",
|
6652 |
+
" <td>1</td>\n",
|
6653 |
+
" <td>0</td>\n",
|
6654 |
+
" <td>1</td>\n",
|
6655 |
+
" </tr>\n",
|
6656 |
+
" <tr>\n",
|
6657 |
+
" <th>8</th>\n",
|
6658 |
+
" <td>0</td>\n",
|
6659 |
+
" <td>0</td>\n",
|
6660 |
+
" <td>0</td>\n",
|
6661 |
+
" <td>0</td>\n",
|
6662 |
+
" <td>0</td>\n",
|
6663 |
+
" <td>1</td>\n",
|
6664 |
+
" <td>0</td>\n",
|
6665 |
+
" <td>1</td>\n",
|
6666 |
+
" <td>0</td>\n",
|
6667 |
+
" <td>0</td>\n",
|
6668 |
+
" </tr>\n",
|
6669 |
+
" <tr>\n",
|
6670 |
+
" <th>9</th>\n",
|
6671 |
+
" <td>0</td>\n",
|
6672 |
+
" <td>1</td>\n",
|
6673 |
+
" <td>0</td>\n",
|
6674 |
+
" <td>0</td>\n",
|
6675 |
+
" <td>0</td>\n",
|
6676 |
+
" <td>0</td>\n",
|
6677 |
+
" <td>0</td>\n",
|
6678 |
+
" <td>1</td>\n",
|
6679 |
+
" <td>0</td>\n",
|
6680 |
+
" <td>1</td>\n",
|
6681 |
+
" </tr>\n",
|
6682 |
+
" </tbody>\n",
|
6683 |
+
"</table>\n",
|
6684 |
+
"</div>"
|
6685 |
+
],
|
6686 |
+
"text/plain": [
|
6687 |
+
" 35K55, 35K65, 35Q74, 74A30, 35A01, 76S05, 35K51, 74B20 ... math.OC\n",
|
6688 |
+
"0 0 ... 0\n",
|
6689 |
+
"1 0 ... 0\n",
|
6690 |
+
"2 0 ... 1\n",
|
6691 |
+
"3 0 ... 0\n",
|
6692 |
+
"4 0 ... 0\n",
|
6693 |
+
"5 0 ... 0\n",
|
6694 |
+
"6 1 ... 0\n",
|
6695 |
+
"7 0 ... 1\n",
|
6696 |
+
"8 0 ... 0\n",
|
6697 |
+
"9 0 ... 1\n",
|
6698 |
+
"\n",
|
6699 |
+
"[10 rows x 10 columns]"
|
6700 |
+
]
|
6701 |
+
},
|
6702 |
+
"execution_count": 18,
|
6703 |
+
"metadata": {},
|
6704 |
+
"output_type": "execute_result"
|
6705 |
+
}
|
6706 |
+
],
|
6707 |
+
"source": [
|
6708 |
+
"data.categories"
|
6709 |
+
]
|
6710 |
}
|
6711 |
],
|
6712 |
"metadata": {
|