Michael-Geis committed on
Commit
9b818c8
1 Parent(s): d129f38

wrote class for holding arxiv data

Browse files
Files changed (2) hide show
  1. arxiv_query_retrieval.py +54 -0
  2. cleaning-abstracts.ipynb +221 -0
arxiv_query_retrieval.py CHANGED
@@ -1,5 +1,59 @@
1
  import arxiv
2
  import pandas as pd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
  def format_query(author='',title='',cat='',abstract=''):
5
  """Returns a formatted arxiv query string to handle simple queries of at most one instance each of these fields. To leave a field unspecified,
 
1
  import arxiv
2
  import pandas as pd
3
+ import data_cleaning as clean
4
+ from sklearn.preprocessing import MultiLabelBinarizer
5
+
6
class ArXivData:
    """A light class for storing the metadata of a collection of arXiv papers.

    Attributes:
        data: DataFrame holding the metadata. Each row represents a paper and
            each column is a separate piece of metadata.
        query: A tuple of the form (query_string, max_results) where
            query_string is the formatted string that produced the raw data
            and max_results is the value of that parameter passed to the
            arXiv API.
        raw: The original, raw dataset as returned by the arXiv API, if the
            current data is clean.
        categories: A DataFrame containing one-hot-encoded categories of the
            self.data DataFrame.
    """

    def __init__(self):
        """Create an empty container; every attribute starts out as None."""
        self.data = None
        self.query = None
        self.raw = None
        self.categories = None

    def get_from_query(self, query_string, max_results):
        """Populate this object with the results of an arXiv query.

        Args:
            query_string: Formatted query string, forwarded to query_to_df.
            max_results: Value of the max_results parameter, forwarded to
                query_to_df.
        """
        self.data = query_to_df(query=query_string, max_results=max_results)
        self.query = (query_string, max_results)
        # NOTE: no copy is made; raw and data refer to the same object here.
        self.raw = self.data
        self.categories = self.get_OHE_cats()

    def clean(self, dataset):
        """Constructs this dataset by cleaning another one.

        Args:
            dataset: An ArXivData object containing data to be cleaned.
        """
        # Inside a method body the bare name `clean` resolves to the
        # module-level alias (`import data_cleaning as clean`), not to this
        # method, so this calls the cleaning module's clean() function.
        self.data = clean.clean(dataset)
        self.query = dataset.query
        self.raw = dataset.raw
        self.categories = dataset.categories

    def get_OHE_cats(self):
        """Return a one-hot-encoded DataFrame of the categories in self.data.

        Fits a MultiLabelBinarizer on self.data.categories and builds a
        DataFrame with one column per distinct label, then relabels the
        columns using clean.category_map().

        Returns:
            A DataFrame of 0/1 indicators with one row per entry of
            self.data.categories.
        """
        mlb = MultiLabelBinarizer()
        ohe_category_array = mlb.fit_transform(self.data.categories)
        ohe_frame = pd.DataFrame(ohe_category_array, columns=mlb.classes_)
        # rename(mapper=...) without an axis targets the row index, which
        # would leave the category columns untouched; rename the columns
        # explicitly instead.
        return ohe_frame.rename(columns=clean.category_map())
53
+
54
+
55
+
56
+
57
 
58
  def format_query(author='',title='',cat='',abstract=''):
59
  """Returns a formatted arxiv query string to handle simple queries of at most one instance each of these fields. To leave a field unspecified,
cleaning-abstracts.ipynb CHANGED
@@ -6486,6 +6486,227 @@
6486
  "source": [
6487
  " Include the cleaning utilities applied to the data in the util file."
6488
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6489
  }
6490
  ],
6491
  "metadata": {
 
6486
  "source": [
6487
  " Include the cleaning utilities applied to the data in the util file."
6488
  ]
6489
+ },
6490
+ {
6491
+ "cell_type": "code",
6492
+ "execution_count": null,
6493
+ "metadata": {},
6494
+ "outputs": [],
6495
+ "source": []
6496
+ },
6497
+ {
6498
+ "cell_type": "code",
6499
+ "execution_count": 16,
6500
+ "metadata": {},
6501
+ "outputs": [],
6502
+ "source": [
6503
+ "import arxiv_query_retrieval\n",
6504
+ "from arxiv_query_retrieval import ArXivData\n",
6505
+ "import importlib\n",
6506
+ "importlib.reload(arxiv_query_retrieval)\n",
6507
+ "\n",
6508
+ "data = ArXivData()\n",
6509
+ "\n",
6510
+ "data.get_from_query(query_string='cat:math.AP',max_results=10)"
6511
+ ]
6512
+ },
6513
+ {
6514
+ "cell_type": "code",
6515
+ "execution_count": 18,
6516
+ "metadata": {},
6517
+ "outputs": [
6518
+ {
6519
+ "data": {
6520
+ "text/html": [
6521
+ "<div>\n",
6522
+ "<style scoped>\n",
6523
+ " .dataframe tbody tr th:only-of-type {\n",
6524
+ " vertical-align: middle;\n",
6525
+ " }\n",
6526
+ "\n",
6527
+ " .dataframe tbody tr th {\n",
6528
+ " vertical-align: top;\n",
6529
+ " }\n",
6530
+ "\n",
6531
+ " .dataframe thead th {\n",
6532
+ " text-align: right;\n",
6533
+ " }\n",
6534
+ "</style>\n",
6535
+ "<table border=\"1\" class=\"dataframe\">\n",
6536
+ " <thead>\n",
6537
+ " <tr style=\"text-align: right;\">\n",
6538
+ " <th></th>\n",
6539
+ " <th>35K55, 35K65, 35Q74, 74A30, 35A01, 76S05, 35K51, 74B20</th>\n",
6540
+ " <th>35K58, 49K20, 90C46, 92D25</th>\n",
6541
+ " <th>35L05, 35L71, 35B44</th>\n",
6542
+ " <th>35P30, 35A15, 53C17</th>\n",
6543
+ " <th>35Q53, 93D15, 93D30, 93C20</th>\n",
6544
+ " <th>35R25 (Primary) 42B35 35B30 (Secondary)</th>\n",
6545
+ " <th>39A12, 39A14, 39A22</th>\n",
6546
+ " <th>math.AP</th>\n",
6547
+ " <th>math.DS</th>\n",
6548
+ " <th>math.OC</th>\n",
6549
+ " </tr>\n",
6550
+ " </thead>\n",
6551
+ " <tbody>\n",
6552
+ " <tr>\n",
6553
+ " <th>0</th>\n",
6554
+ " <td>0</td>\n",
6555
+ " <td>0</td>\n",
6556
+ " <td>0</td>\n",
6557
+ " <td>1</td>\n",
6558
+ " <td>0</td>\n",
6559
+ " <td>0</td>\n",
6560
+ " <td>0</td>\n",
6561
+ " <td>1</td>\n",
6562
+ " <td>0</td>\n",
6563
+ " <td>0</td>\n",
6564
+ " </tr>\n",
6565
+ " <tr>\n",
6566
+ " <th>1</th>\n",
6567
+ " <td>0</td>\n",
6568
+ " <td>0</td>\n",
6569
+ " <td>1</td>\n",
6570
+ " <td>0</td>\n",
6571
+ " <td>0</td>\n",
6572
+ " <td>0</td>\n",
6573
+ " <td>0</td>\n",
6574
+ " <td>1</td>\n",
6575
+ " <td>0</td>\n",
6576
+ " <td>0</td>\n",
6577
+ " </tr>\n",
6578
+ " <tr>\n",
6579
+ " <th>2</th>\n",
6580
+ " <td>0</td>\n",
6581
+ " <td>0</td>\n",
6582
+ " <td>0</td>\n",
6583
+ " <td>0</td>\n",
6584
+ " <td>1</td>\n",
6585
+ " <td>0</td>\n",
6586
+ " <td>0</td>\n",
6587
+ " <td>1</td>\n",
6588
+ " <td>0</td>\n",
6589
+ " <td>1</td>\n",
6590
+ " </tr>\n",
6591
+ " <tr>\n",
6592
+ " <th>3</th>\n",
6593
+ " <td>0</td>\n",
6594
+ " <td>0</td>\n",
6595
+ " <td>0</td>\n",
6596
+ " <td>0</td>\n",
6597
+ " <td>0</td>\n",
6598
+ " <td>0</td>\n",
6599
+ " <td>0</td>\n",
6600
+ " <td>1</td>\n",
6601
+ " <td>0</td>\n",
6602
+ " <td>0</td>\n",
6603
+ " </tr>\n",
6604
+ " <tr>\n",
6605
+ " <th>4</th>\n",
6606
+ " <td>0</td>\n",
6607
+ " <td>0</td>\n",
6608
+ " <td>0</td>\n",
6609
+ " <td>0</td>\n",
6610
+ " <td>0</td>\n",
6611
+ " <td>0</td>\n",
6612
+ " <td>1</td>\n",
6613
+ " <td>1</td>\n",
6614
+ " <td>1</td>\n",
6615
+ " <td>0</td>\n",
6616
+ " </tr>\n",
6617
+ " <tr>\n",
6618
+ " <th>5</th>\n",
6619
+ " <td>0</td>\n",
6620
+ " <td>0</td>\n",
6621
+ " <td>0</td>\n",
6622
+ " <td>0</td>\n",
6623
+ " <td>0</td>\n",
6624
+ " <td>0</td>\n",
6625
+ " <td>0</td>\n",
6626
+ " <td>1</td>\n",
6627
+ " <td>0</td>\n",
6628
+ " <td>0</td>\n",
6629
+ " </tr>\n",
6630
+ " <tr>\n",
6631
+ " <th>6</th>\n",
6632
+ " <td>1</td>\n",
6633
+ " <td>0</td>\n",
6634
+ " <td>0</td>\n",
6635
+ " <td>0</td>\n",
6636
+ " <td>0</td>\n",
6637
+ " <td>0</td>\n",
6638
+ " <td>0</td>\n",
6639
+ " <td>1</td>\n",
6640
+ " <td>0</td>\n",
6641
+ " <td>0</td>\n",
6642
+ " </tr>\n",
6643
+ " <tr>\n",
6644
+ " <th>7</th>\n",
6645
+ " <td>0</td>\n",
6646
+ " <td>0</td>\n",
6647
+ " <td>0</td>\n",
6648
+ " <td>0</td>\n",
6649
+ " <td>0</td>\n",
6650
+ " <td>0</td>\n",
6651
+ " <td>0</td>\n",
6652
+ " <td>1</td>\n",
6653
+ " <td>0</td>\n",
6654
+ " <td>1</td>\n",
6655
+ " </tr>\n",
6656
+ " <tr>\n",
6657
+ " <th>8</th>\n",
6658
+ " <td>0</td>\n",
6659
+ " <td>0</td>\n",
6660
+ " <td>0</td>\n",
6661
+ " <td>0</td>\n",
6662
+ " <td>0</td>\n",
6663
+ " <td>1</td>\n",
6664
+ " <td>0</td>\n",
6665
+ " <td>1</td>\n",
6666
+ " <td>0</td>\n",
6667
+ " <td>0</td>\n",
6668
+ " </tr>\n",
6669
+ " <tr>\n",
6670
+ " <th>9</th>\n",
6671
+ " <td>0</td>\n",
6672
+ " <td>1</td>\n",
6673
+ " <td>0</td>\n",
6674
+ " <td>0</td>\n",
6675
+ " <td>0</td>\n",
6676
+ " <td>0</td>\n",
6677
+ " <td>0</td>\n",
6678
+ " <td>1</td>\n",
6679
+ " <td>0</td>\n",
6680
+ " <td>1</td>\n",
6681
+ " </tr>\n",
6682
+ " </tbody>\n",
6683
+ "</table>\n",
6684
+ "</div>"
6685
+ ],
6686
+ "text/plain": [
6687
+ " 35K55, 35K65, 35Q74, 74A30, 35A01, 76S05, 35K51, 74B20 ... math.OC\n",
6688
+ "0 0 ... 0\n",
6689
+ "1 0 ... 0\n",
6690
+ "2 0 ... 1\n",
6691
+ "3 0 ... 0\n",
6692
+ "4 0 ... 0\n",
6693
+ "5 0 ... 0\n",
6694
+ "6 1 ... 0\n",
6695
+ "7 0 ... 1\n",
6696
+ "8 0 ... 0\n",
6697
+ "9 0 ... 1\n",
6698
+ "\n",
6699
+ "[10 rows x 10 columns]"
6700
+ ]
6701
+ },
6702
+ "execution_count": 18,
6703
+ "metadata": {},
6704
+ "output_type": "execute_result"
6705
+ }
6706
+ ],
6707
+ "source": [
6708
+ "data.categories"
6709
+ ]
6710
  }
6711
  ],
6712
  "metadata": {