nianlonggu commited on
Commit
02ae0bf
1 Parent(s): a57888c
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitignore +3 -0
  2. Dockerfile +55 -0
  3. docker-compose.yaml +7 -0
  4. requirements.txt +21 -0
  5. s2orc-doc2json/LICENSE +201 -0
  6. s2orc-doc2json/README.md +138 -0
  7. s2orc-doc2json/doc2json.egg-info/PKG-INFO +4 -0
  8. s2orc-doc2json/doc2json.egg-info/SOURCES.txt +42 -0
  9. s2orc-doc2json/doc2json.egg-info/dependency_links.txt +1 -0
  10. s2orc-doc2json/doc2json.egg-info/not-zip-safe +1 -0
  11. s2orc-doc2json/doc2json.egg-info/top_level.txt +1 -0
  12. s2orc-doc2json/doc2json/__init__.py +0 -0
  13. s2orc-doc2json/doc2json/config.py +2 -0
  14. s2orc-doc2json/doc2json/flask/app.py +57 -0
  15. s2orc-doc2json/doc2json/flask/static/style.css +40 -0
  16. s2orc-doc2json/doc2json/flask/templates/home.html +18 -0
  17. s2orc-doc2json/doc2json/grobid2json/__init__.py +0 -0
  18. s2orc-doc2json/doc2json/grobid2json/grobid/Readme.md +92 -0
  19. s2orc-doc2json/doc2json/grobid2json/grobid/__init__.py +0 -0
  20. s2orc-doc2json/doc2json/grobid2json/grobid/client.py +225 -0
  21. s2orc-doc2json/doc2json/grobid2json/grobid/config.yaml +36 -0
  22. s2orc-doc2json/doc2json/grobid2json/grobid/grobid.properties +59 -0
  23. s2orc-doc2json/doc2json/grobid2json/grobid/grobid_client.py +249 -0
  24. s2orc-doc2json/doc2json/grobid2json/pdf_to_tei.py +7 -0
  25. s2orc-doc2json/doc2json/grobid2json/process_pdf.py +104 -0
  26. s2orc-doc2json/doc2json/grobid2json/tei_to_json.py +750 -0
  27. s2orc-doc2json/doc2json/jats2json/__init__.py +0 -0
  28. s2orc-doc2json/doc2json/jats2json/jats_to_json.py +341 -0
  29. s2orc-doc2json/doc2json/jats2json/pmc_utils/__init__.py +0 -0
  30. s2orc-doc2json/doc2json/jats2json/pmc_utils/all_tag_utils.py +300 -0
  31. s2orc-doc2json/doc2json/jats2json/pmc_utils/back_tag_utils.py +56 -0
  32. s2orc-doc2json/doc2json/jats2json/pmc_utils/extract_utils.py +106 -0
  33. s2orc-doc2json/doc2json/jats2json/pmc_utils/front_tag_utils.py +381 -0
  34. s2orc-doc2json/doc2json/jats2json/pmc_utils/tests.py +347 -0
  35. s2orc-doc2json/doc2json/jats2json/process_jats.py +104 -0
  36. s2orc-doc2json/doc2json/s2orc.py +527 -0
  37. s2orc-doc2json/doc2json/spp2json/__init__.py +0 -0
  38. s2orc-doc2json/doc2json/spp2json/process_pdf.py +72 -0
  39. s2orc-doc2json/doc2json/spp2json/spp/__init__.py +0 -0
  40. s2orc-doc2json/doc2json/spp2json/spp/spp_client.py +32 -0
  41. s2orc-doc2json/doc2json/spp2json/spp/spp_json_to_s2orc_json.py +7 -0
  42. s2orc-doc2json/doc2json/tex2json/__init__.py +0 -0
  43. s2orc-doc2json/doc2json/tex2json/process_tex.py +127 -0
  44. s2orc-doc2json/doc2json/tex2json/tex_to_xml.py +201 -0
  45. s2orc-doc2json/doc2json/tex2json/xml_to_json.py +1396 -0
  46. s2orc-doc2json/doc2json/utils/__init__.py +0 -0
  47. s2orc-doc2json/doc2json/utils/citation_util.py +75 -0
  48. s2orc-doc2json/doc2json/utils/grobid_util.py +388 -0
  49. s2orc-doc2json/doc2json/utils/latex_util.py +204 -0
  50. s2orc-doc2json/doc2json/utils/refspan_util.py +115 -0
.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ .ipynb_checkpoints/
2
+ *.gz
3
+ *.pdf
Dockerfile ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM ubuntu:22.04
2
+
3
+ # Set Environment Variable
4
+ ENV HOME="/root"
5
+ ENV JAVA_TOOL_OPTIONS="-Dhttps.protocols=TLSv1.2"
6
+ ENV PDF2JSON_HOME="/app/src/s2orc-doc2json"
7
+
8
+ # Install system-wide dependencies: build tools, common utilities and Java 8 (the JDK used for the Grobid build below)
9
+ RUN apt-get -yqq update && \
10
+ apt-get -yqq install software-properties-common curl wget zip screen git gcc build-essential openjdk-8-jdk
11
+
12
+ # Install Miniconda
13
+ RUN curl -LO https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
14
+ bash Miniconda3-latest-Linux-x86_64.sh -p /miniconda -b && \
15
+ rm Miniconda3-latest-Linux-x86_64.sh
16
+ ENV PATH=/miniconda/bin:${PATH}
17
+
18
+ # Create a Python 3.10 environment
19
+ RUN conda create -n my_env python=3.10
20
+
21
+ SHELL ["conda", "run", "-n", "my_env", "/bin/bash", "-c"]
22
+
23
+ WORKDIR /app/src
24
+ COPY ./requirements.txt .
25
+ RUN pip install -r requirements.txt
26
+
27
+ WORKDIR $PDF2JSON_HOME
28
+ COPY ./s2orc-doc2json/ .
29
+ RUN python setup.py develop
30
+
31
+ WORKDIR $HOME
32
+ RUN wget https://github.com/kermitt2/grobid/archive/0.6.1.zip && \
33
+ unzip 0.6.1.zip && \
34
+ rm 0.6.1.zip
35
+
36
+ WORKDIR $HOME/grobid-0.6.1
37
+ RUN ./gradlew clean install && \
38
+ cp $PDF2JSON_HOME/doc2json/grobid2json/grobid/config.yaml $HOME/grobid-0.6.1/grobid-service/config/config.yaml && \
39
+ cp $PDF2JSON_HOME/doc2json/grobid2json/grobid/grobid.properties $HOME/grobid-0.6.1/grobid-home/config/grobid.properties
40
+
41
+ WORKDIR /app/models/
42
+ # Download necessary model checkpoint
43
+ RUN python -c "from huggingface_hub import snapshot_download; model_folder = '/app/models/'; snapshot_download('nianlong/memsum-word-embedding', local_dir = model_folder + 'word_embedding'); snapshot_download('nianlong/memsum-arxiv-summarization', local_dir = model_folder + 'memsum_arxiv' )"
44
+
45
+ WORKDIR /app/src
46
+ COPY ./Dockerfile .
47
+
48
+ WORKDIR /app/src/services
49
+ RUN git clone https://github.com/nianlonggu/MemSum
50
+
51
+ COPY ./services/ .
52
+
53
+ # start app
54
+ # CMD in exec form runs plain bash directly; it does not go through the conda-wrapped SHELL set above
55
+ CMD [ "bash", "./start_service.sh" ]
docker-compose.yaml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ version: '3'
2
+
3
+ services:
4
+ summarization_service:
5
+ build: .
6
+ ports:
7
+ - 7860:7860
requirements.txt ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ tqdm
2
+ beautifulsoup4==4.7.1
3
+ boto3==1.9.147
4
+ requests==2.21.0
5
+ flask==2.3.2
6
+ flask_cors==4.0.0
7
+ python-magic==0.4.18
8
+ latex2mathml==2.16.2
9
+ gunicorn==20.1.0
10
+ lxml==4.9.0
11
+ unidecode
12
+ nltk==3.7
13
+ jsonschema==4.17.3
14
+ six==1.16.0
15
+ numpy==1.21.6
16
+ ujson==5.2.0
17
+ more-itertools==9.1.0
18
+ dateparser==1.1.8
19
+ streamlit
20
+ transformers==4.30.0
21
+ torch==2.2.2
s2orc-doc2json/LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
s2orc-doc2json/README.md ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Convert scientific papers to S2ORC JSON
2
+
3
+ This project is a part of [S2ORC](https://github.com/allenai/s2orc). For S2ORC, we convert PDFs to JSON using Grobid and a custom TEI.XML to JSON parser. That TEI.XML to JSON parser (`grobid2json`) is made available here. We additionally process LaTeX dumps from arXiv. That parser (`tex2json`) is also made available here.
4
+
5
+ The S2ORC github page includes a JSON schema, but it may be easier to understand that schema based on the python classes in `doc2json/s2orc.py`.
6
+
7
+ This custom JSON schema is also used for the [CORD-19](https://github.com/allenai/cord19) project, so those who have interacted with CORD-19 may find this format familiar.
8
+
9
+ Possible future components (no promises):
10
+ - Linking bibliography entries (bibliography consolidation) to papers in S2ORC
11
+
12
+ ## Setup your environment
13
+
14
+ NOTE: Conda is shown but any other python env manager should be fine
15
+
16
+ Go [here](https://docs.conda.io/en/latest/miniconda.html) to install the latest version of miniconda.
17
+
18
+ Then, create an environment:
19
+
20
+ ```console
21
+ conda create -n doc2json python=3.8 pytest
22
+ conda activate doc2json
23
+ pip install -r requirements.txt
24
+ python setup.py develop
25
+ ```
26
+
27
+ ## PDF Processing
28
+
29
+ The current `grobid2json` tool uses Grobid to first process each PDF into XML, then extracts paper components from the XML.
30
+
31
+ ### Install Grobid
32
+
33
+ You will need to have Java installed on your machine. Then, you can install your own version of Grobid and get it running, or you can run the following script:
34
+
35
+ ```console
36
+ bash scripts/setup_grobid.sh
37
+ ```
38
+
39
+ This will setup Grobid, currently hard-coded as version 0.6.1. Then run:
40
+
41
+ ```console
42
+ bash scripts/run_grobid.sh
43
+ ```
44
+
45
+ to start the Grobid server. Don't worry if it gets stuck at 87%; this is normal and means Grobid is ready to process PDFs.
46
+
47
+ The expected port for the Grobid service is 8070, but you can change this as well. Make sure to edit the port in both the Grobid config file as well as `grobid/grobid_client.py`.
48
+
49
+ ### Process a PDF
50
+
51
+ There are a couple of test PDFs in `tests/pdf/` if you'd like to try them.
52
+
53
+ For example, you can try:
54
+
55
+ ```console
56
+ python doc2json/grobid2json/process_pdf.py -i tests/pdf/N18-3011.pdf -t temp_dir/ -o output_dir/
57
+ ```
58
+
59
+ This will generate a JSON file in the specified `output_dir`. If unspecified, the file will be in the `output/` directory from your path.
60
+
61
+ ## LaTeX Processing
62
+
63
+ If you want to process LaTeX, you also need to install the following libraries:
64
+
65
+ - [latexpand](https://ctan.org/pkg/latexpand?lang=en) (`apt install texlive-extra-utils`)
66
+ - [tralics](http://www-sop.inria.fr/marelle/tralics/) (`apt install tralics`)
67
+
68
+ To process LaTeX, all files must be in a zip file, similar to the `*.gz` files you can download from arXiv.
69
+
70
+ A few examples are available under `tests/latex/`. For example, you can try:
71
+
72
+ ```console
73
+ python doc2json/tex2json/process_tex.py -i tests/latex/1911.02782.gz -t temp_dir/ -o output_dir/
74
+ ```
75
+
76
+ Again, this will produce a JSON file in the specified `output_dir`.
77
+
78
+ ## PMC JATS XML Processing
79
+
80
+ To process JATS XML, try:
81
+
82
+ ```console
83
+ python doc2json/jats2json/process_jats.py -i tests/jats/PMC5828200.nxml -o output_dir/
84
+ ```
85
+
86
+ This will create a JSON file with the same paper id in the specified output directory.
87
+
88
+ ## Loading a S2ORC JSON file
89
+
90
+ The format of S2ORC releases has drifted over time. Use the `load_s2orc` function in `doc2json/s2orc.py` to try to load both historic and current S2ORC JSON.
91
+
92
+ ## Run a Flask app and process documents through a web service
93
+
94
+ To process PDFs, you will first need to start Grobid (defaults to port 8070). If you are processing LaTeX, no need for this step.
95
+
96
+ ```console
97
+ bash scripts/run_grobid.sh
98
+ ```
99
+
100
+ Then, start the Flask app (defaults to port 8080).
101
+
102
+ ```console
103
+ python doc2json/flask/app.py
104
+ ```
105
+
106
+ Go to [localhost:8080](http://localhost:8080) to upload and process papers.
107
+
108
+ Or alternatively, you can do things like:
109
+
110
+ ```console
111
+ curl localhost:8080/ -F file=@tests/pdf/N18-3011.pdf
112
+ ```
113
+
114
+ ## Citation
115
+
116
+ If you use this utility in your research, please cite:
117
+
118
+ ```
119
+ @inproceedings{lo-wang-2020-s2orc,
120
+ title = "{S}2{ORC}: The Semantic Scholar Open Research Corpus",
121
+ author = "Lo, Kyle and Wang, Lucy Lu and Neumann, Mark and Kinney, Rodney and Weld, Daniel",
122
+ booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics",
123
+ month = jul,
124
+ year = "2020",
125
+ address = "Online",
126
+ publisher = "Association for Computational Linguistics",
127
+ url = "https://www.aclweb.org/anthology/2020.acl-main.447",
128
+ doi = "10.18653/v1/2020.acl-main.447",
129
+ pages = "4969--4983"
130
+ }
131
+ ```
132
+
133
+ ## Contact
134
+
135
+ Contributions are welcome. Note the embarrassingly poor test coverage. Also, please note this pipeline is not perfect. It will miss text or make errors on most PDFs. The current PDF to JSON step uses Grobid; we may replace this with a different model in the future.
136
+
137
+ Issues: contact `lucyw@allenai.org` or `kylel@allenai.org`
138
+
s2orc-doc2json/doc2json.egg-info/PKG-INFO ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ Metadata-Version: 2.1
2
+ Name: doc2json
3
+ Version: 0.1
4
+ License-File: LICENSE
s2orc-doc2json/doc2json.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ LICENSE
2
+ README.md
3
+ setup.py
4
+ doc2json/__init__.py
5
+ doc2json/config.py
6
+ doc2json/s2orc.py
7
+ doc2json.egg-info/PKG-INFO
8
+ doc2json.egg-info/SOURCES.txt
9
+ doc2json.egg-info/dependency_links.txt
10
+ doc2json.egg-info/not-zip-safe
11
+ doc2json.egg-info/top_level.txt
12
+ doc2json/grobid2json/__init__.py
13
+ doc2json/grobid2json/pdf_to_tei.py
14
+ doc2json/grobid2json/process_pdf.py
15
+ doc2json/grobid2json/tei_to_json.py
16
+ doc2json/grobid2json/grobid/__init__.py
17
+ doc2json/grobid2json/grobid/client.py
18
+ doc2json/grobid2json/grobid/grobid_client.py
19
+ doc2json/jats2json/__init__.py
20
+ doc2json/jats2json/jats_to_json.py
21
+ doc2json/jats2json/process_jats.py
22
+ doc2json/jats2json/pmc_utils/__init__.py
23
+ doc2json/jats2json/pmc_utils/all_tag_utils.py
24
+ doc2json/jats2json/pmc_utils/back_tag_utils.py
25
+ doc2json/jats2json/pmc_utils/extract_utils.py
26
+ doc2json/jats2json/pmc_utils/front_tag_utils.py
27
+ doc2json/jats2json/pmc_utils/tests.py
28
+ doc2json/spp2json/__init__.py
29
+ doc2json/spp2json/process_pdf.py
30
+ doc2json/spp2json/spp/__init__.py
31
+ doc2json/spp2json/spp/spp_client.py
32
+ doc2json/spp2json/spp/spp_json_to_s2orc_json.py
33
+ doc2json/tex2json/__init__.py
34
+ doc2json/tex2json/process_tex.py
35
+ doc2json/tex2json/tex_to_xml.py
36
+ doc2json/tex2json/xml_to_json.py
37
+ doc2json/utils/__init__.py
38
+ doc2json/utils/citation_util.py
39
+ doc2json/utils/grobid_util.py
40
+ doc2json/utils/latex_util.py
41
+ doc2json/utils/refspan_util.py
42
+ doc2json/utils/soup_utils.py
s2orc-doc2json/doc2json.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
 
 
1
+
s2orc-doc2json/doc2json.egg-info/not-zip-safe ADDED
@@ -0,0 +1 @@
 
 
1
+
s2orc-doc2json/doc2json.egg-info/top_level.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ doc2json
s2orc-doc2json/doc2json/__init__.py ADDED
File without changes
s2orc-doc2json/doc2json/config.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ S2ORC_NAME_STRING = 'S2ORC'
2
+ S2ORC_VERSION_STRING = '1.0.0'
s2orc-doc2json/doc2json/flask/app.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Flask app for S2ORC pdf2json utility
3
+ """
4
+ import hashlib
5
+ from flask import Flask, request, jsonify, flash, url_for, redirect, render_template, send_file
6
+ from doc2json.grobid2json.process_pdf import process_pdf_stream
7
+ from doc2json.tex2json.process_tex import process_tex_stream
8
+ from doc2json.jats2json.process_jats import process_jats_stream
9
+
10
+ app = Flask(__name__)
11
+
12
+ ALLOWED_EXTENSIONS = {'pdf', 'gz', 'nxml'}
13
+
14
+
15
@app.route('/')
def home():
    """Serve the upload form by rendering the ``home.html`` template."""
    page = render_template("home.html")
    return page
18
+
19
@app.route('/', methods=['POST'])
def upload_file():
    """Convert an uploaded document to JSON and return the result.

    The upload is read from the ``file`` form field and dispatched on the
    suffix of its filename:

    * ``pdf``  -> ``process_pdf_stream`` (also receives the SHA-1 of the bytes)
    * ``gz``   -> ``process_tex_stream``
    * ``nxml`` -> ``process_jats_stream``

    Returns:
        The converter's result as a JSON response; an ``{"Error": ...}``
        payload for any other suffix; or a redirect to the upload form when
        no file was selected.
    """
    uploaded_file = request.files['file']
    filename = uploaded_file.filename

    # No file selected: send the user back to the form. url_for() takes an
    # endpoint name, which for this app is the view function name ``home``;
    # there is no endpoint called 'index'.
    if not filename:
        return redirect(url_for('home'))

    # Read the whole upload once; every converter takes the raw bytes.
    content = uploaded_file.stream.read()

    # PDF: processed with a content hash alongside the bytes.
    if filename.endswith('pdf'):
        pdf_sha = hashlib.sha1(content).hexdigest()
        results = process_pdf_stream(filename, pdf_sha, content)
        return jsonify(results)

    # LaTeX archive.
    if filename.endswith('gz'):
        results = process_tex_stream(filename, content)
        return jsonify(results)

    # JATS XML.
    if filename.endswith('nxml'):
        results = process_jats_stream(filename, content)
        return jsonify(results)

    # Anything else is rejected with an error payload.
    return {
        "Error": "Unknown file type!"
    }
54
+
55
+
56
+ if __name__ == '__main__':
57
+ app.run(port=8080, host='0.0.0.0')
s2orc-doc2json/doc2json/flask/static/style.css ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ html {
2
+ box-sizing: border-box;
3
+ }
4
+
5
+ * {
6
+ box-sizing: inherit;
7
+ font-family: Calibri, Arial, sans-serif !important;
8
+ }
9
+
10
+ h1 {
11
+ font-size: 32px;
12
+ }
13
+
14
+ h2, h3 {
15
+ font-size: 24px;
16
+ }
17
+
18
+ body {
19
+ margin: 20px;
20
+ font-size: 125%;
21
+ line-height: 1.4;
22
+ max-width: 800px;
23
+ margin: 0 auto;
24
+ }
25
+
26
+ footer {
27
+ margin-top: 50px;
28
+ border-top: 1px solid silver;
29
+ font-size: 0.8em;
30
+ }
31
+
32
+ footer ol {
33
+ padding-left: 20px;
34
+ }
35
+
36
+ .p {
37
+ text-align: center;
38
+ font-size: .75em;
39
+ padding-top: 150px;
40
+ }
s2orc-doc2json/doc2json/flask/templates/home.html ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <title>S2ORC doc2json</title>
6
+ <link rel="stylesheet" href="{{ url_for('static', filename='style.css') }}">
7
+ </head>
8
+ <body>
9
+ <h1>S2ORC doc2json utility</h1>
10
+ <p>Upload a scientific PDF, LaTeX zip file, or JATS XML file and get back a JSON: </p>
11
+ <p>(Accepted file extensions: *.pdf, *.gz, *.nxml)</p>
12
+ <form method=post enctype=multipart/form-data>
13
+ <p><input type="file" name="file" accept=".pdf,.gz,.nxml"></p>
14
+ <p><input type="submit" value="Upload"></p>
15
+ </form>
16
+ <p>Please wait, processing takes time...</p>
17
+ </body>
18
+ </html>
s2orc-doc2json/doc2json/grobid2json/__init__.py ADDED
File without changes
s2orc-doc2json/doc2json/grobid2json/grobid/Readme.md ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Simple python client for GROBID REST services
2
+
3
+ **NOTE: This README is adapted from GROBID**
4
+
5
+ This Python client can be used to efficiently and concurrently process a set of PDFs in a given directory with the [GROBID](https://github.com/kermitt2/grobid) service. Results are written to a given output directory and include the resulting XML TEI representation of each PDF.
6
+
7
+ ## Build and run
8
+
9
+ You need first to install and start the *grobid* service, latest stable version, see the [documentation](http://grobid.readthedocs.io/). It is assumed that the server will run on the address `http://localhost:8070`. You can change the server address by editing the file `config.json`.
10
+
11
+ ## Requirements
12
+
13
+ This client has been developed and tested with Python 3.5.
14
+
15
+ ## Install
16
+
17
+ Get the github repo:
18
+
19
+ > git clone https://github.com/kermitt2/grobid-client-python
20
+
21
+ > cd grobid-client-python
22
+
23
+ It is advised to setup first a virtual environment to avoid falling into one of these gloomy python dependency marshlands:
24
+
25
+ > virtualenv --system-site-packages -p python3 env
26
+
27
+ > source env/bin/activate
28
+
29
+ ## Usage and options
30
+
31
+ ```
32
+ usage: grobid-client.py [-h] [--input INPUT] [--config CONFIG]
33
+ [--output OUTPUT] [--n N]
34
+ service
35
+
36
+ Client for GROBID services
37
+
38
+ positional arguments:
39
+ service one of [processFulltextDocument,
40
+ processHeaderDocument, processReferences]
41
+
42
+ optional arguments:
43
+ -h, --help show this help message and exit
44
+ --input INPUT path to the directory containing PDF to process
45
+ --output OUTPUT path to the directory where to put the results
46
+ --config CONFIG path to the config file, default is ./config.json
47
+ --n N concurrency for service usage
48
+ --generateIDs generate random xml:id to textual XML elements of the
49
+ result files
50
+ --consolidate_header call GROBID with consolidation of the metadata
51
+ extracted from the header
52
+ --consolidate_citations
53
+ call GROBID with consolidation of the extracted
54
+ bibliographical references
55
+ ```
56
+
57
+ Examples:
58
+
59
+ > python3 grobid-client.py --input ~/tmp/in2 --output ~/tmp/out processFulltextDocument
60
+
61
+ This command will process all the PDF files present in the input directory (files with extension `.pdf` only) with the `processFulltextDocument` service of GROBID, and write the resulting XML TEI files under the output directory, reusing the file name with a different file extension (`.tei.xml`), using the default `10` concurrent workers.
62
+
63
+ > python3 grobid-client.py --input ~/tmp/in2 --output ~/tmp/out --n 20 processHeaderDocument
64
+
65
+ This command will process all the PDF files present in the input directory (files with extension `.pdf` only) with the `processHeaderDocument` service of GROBID, and write the resulting XML TEI files under the output directory, reusing the file name with a different file extension (`.tei.xml`), using `20` concurrent workers.
66
+
67
+ ## Benchmarking
68
+
69
+ Full text processing of __136 PDF__ (total 3443 pages, in average 25 pages per PDF) on Intel Core i7-4790K CPU 4.00GHz, 4 cores (8 threads), 16GB memory, `n` being the concurrency parameter:
70
+
71
+ | n | runtime (s)| s/PDF | PDF/s |
72
+ |----|------------|-------|-------|
73
+ | 1 | 209.0 | 1.54 | 0.65 |
74
+ | 2 | 112.0 | 0.82 | 1.21 |
75
+ | 3 | 80.4 | 0.59 | 1.69 |
76
+ | 5 | 62.9 | 0.46 | 2.16 |
77
+ | 8 | 55.7 | 0.41 | 2.44 |
78
+ | 10 | 55.3 | 0.40 | 2.45 |
79
+
80
+ ![Runtime Plot](resources/20180928112135.png)
81
+
82
+ As complementary info, GROBID processing of header of the 136 PDF and with `n=10` takes 3.74 s (15 times faster than the complete full text processing because only the two first pages of the PDF are considered), 36 PDF/s. In similar conditions, extraction and structuring of bibliographical references takes 26.9 s (5.1 PDF/s).
83
+
84
+ ## Todo
85
+
86
+ Benchmarking with more files (e.g. million ISTEX PDF). Also implement existing GROBID services for text input (date, name, affiliation/address, raw bibliographical references, etc.). Better support for parameters (including elements where to put coordinates).
87
+
88
+ ## License and contact
89
+
90
+ Distributed under [Apache 2.0 license](http://www.apache.org/licenses/LICENSE-2.0).
91
+
92
+ Main author and contact: Patrice Lopez (<patrice.lopez@science-miner.com>)
s2orc-doc2json/doc2json/grobid2json/grobid/__init__.py ADDED
File without changes
s2orc-doc2json/doc2json/grobid2json/grobid/client.py ADDED
@@ -0,0 +1,225 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ Generic API Client """
2
+ from copy import deepcopy
3
+ import json
4
+ import requests
5
+
6
+ try:
7
+ from urlparse import urljoin
8
+ except ImportError:
9
+ from urllib.parse import urljoin
10
+
11
+
12
class ApiClient(object):
    """ Client to interact with a generic Rest API.

    Subclasses should implement functionality accordingly with the provided
    service methods, i.e. ``get``, ``post``, ``put`` and ``delete``.
    """

    accept_type = 'application/xml'
    api_base = None

    def __init__(
        self,
        base_url,
        username=None,
        api_key=None,
        status_endpoint=None,
        timeout=60
    ):
        """ Initialise client.

        Args:
            base_url (str): The base URL to the service being used.
            username (str): The username to authenticate with.
            api_key (str): The API key to authenticate with.
            status_endpoint (str or None): Endpoint, resolved against
                ``base_url``, that ``service_status`` queries.
            timeout (int): Default maximum time before timing out; used by
                ``call_api`` when no per-call timeout is given.
        """
        self.base_url = base_url
        self.username = username
        self.api_key = api_key
        self.status_endpoint = urljoin(self.base_url, status_endpoint)
        self.timeout = timeout

    @staticmethod
    def encode(request, data):
        """ Add request content data to request body, set Content-type header.

        Should be overridden by subclasses if not using JSON encoding.

        Args:
            request (HTTPRequest): The request object.
            data (dict, None): Data to be encoded.

        Returns:
            HTTPRequest: The request object.
        """
        if data is None:
            return request

        request.add_header('Content-Type', 'application/json')
        request.data = json.dumps(data)

        return request

    @staticmethod
    def decode(response):
        """ Decode the returned data in the response.

        Should be overridden by subclasses if something else than JSON is
        expected.

        Args:
            response (HTTPResponse): The response object.

        Returns:
            The value returned by ``response.json()``, or the error message
            (str) when the body cannot be parsed as JSON.
        """
        try:
            return response.json()
        except ValueError as e:
            # Exceptions have no ``.message`` attribute on Python 3; str(e)
            # yields the same text without raising AttributeError.
            return str(e)

    def get_credentials(self):
        """ Returns parameters to be added to authenticate the request.

        This lives on its own to make it easier to re-implement it if needed.

        Returns:
            dict: A dictionary containing the credentials.
        """
        return {"username": self.username, "api_key": self.api_key}

    def call_api(
        self,
        method,
        url,
        headers=None,
        params=None,
        data=None,
        files=None,
        timeout=None,
    ):
        """ Call API.

        Args:
            method (str): The HTTP method to use.
            url (str): URL of the resource; it is handed to ``requests``
                unchanged, so it is not joined with ``base_url`` here.
            headers (dict or None): Extra request headers to set.
            params (dict or None): Query-string parameters.
            data (dict or None): Request body contents for POST or PUT requests.
            files (dict or None): Files to be passed to the request.
            timeout (int or None): Maximum time before timing out. When None,
                the instance-level ``timeout`` is used if it has been set.

        Returns:
            tuple: ``(response, status_code)`` where ``response`` is the
            object returned by ``requests.request``.
        """
        headers = deepcopy(headers) or {}
        # NOTE: this replaces any 'Accept' value supplied by the caller.
        headers['Accept'] = self.accept_type
        params = deepcopy(params) or {}
        data = data or {}
        files = files or {}
        if timeout is None:
            # Honour the constructor default. Subclasses may skip __init__,
            # in which case the attribute is absent and no timeout applies.
            timeout = getattr(self, 'timeout', None)
        # NOTE: credentials from get_credentials() are not attached to the
        # request; that code path is disabled in this client.
        r = requests.request(
            method,
            url,
            headers=headers,
            params=params,
            files=files,
            data=data,
            timeout=timeout,
        )

        return r, r.status_code

    def get(self, url, params=None, **kwargs):
        """ Call the API with a GET request.

        Args:
            url (str): URL of the resource.
            params (dict or None): Query-string parameters.

        Returns:
            tuple: ``(response, status_code)`` as returned by ``call_api``.
        """
        return self.call_api(
            "GET",
            url,
            params=params,
            **kwargs
        )

    def delete(self, url, params=None, **kwargs):
        """ Call the API with a DELETE request.

        Args:
            url (str): URL of the resource.
            params (dict or None): Query-string parameters.

        Returns:
            tuple: ``(response, status_code)`` as returned by ``call_api``.
        """
        return self.call_api(
            "DELETE",
            url,
            params=params,
            **kwargs
        )

    def put(self, url, params=None, data=None, files=None, **kwargs):
        """ Call the API with a PUT request.

        Args:
            url (str): URL of the resource.
            params (dict or None): Query-string parameters.
            data (dict or None): Request body contents.
            files (dict or None): Files to be passed to the request.

        Returns:
            tuple: ``(response, status_code)`` as returned by ``call_api``.
        """
        return self.call_api(
            "PUT",
            url,
            params=params,
            data=data,
            files=files,
            **kwargs
        )

    def post(self, url, params=None, data=None, files=None, **kwargs):
        """ Call the API with a POST request.

        Args:
            url (str): URL of the resource.
            params (dict or None): Query-string parameters.
            data (dict or None): Request body contents.
            files (dict or None): Files to be passed to the request.

        Returns:
            tuple: ``(response, status_code)`` as returned by ``call_api``.
        """
        return self.call_api(
            method="POST",
            url=url,
            params=params,
            data=data,
            files=files,
            **kwargs
        )

    def service_status(self, **kwargs):
        """ Call the API to get the status of the service.

        Returns:
            tuple: ``(response, status_code)`` as returned by ``call_api``.
        """
        return self.call_api(
            'GET',
            self.status_endpoint,
            params={'format': 'json'},
            **kwargs
        )
s2orc-doc2json/doc2json/grobid2json/grobid/config.yaml ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ grobid:
2
+ # NOTE: change these values to absolute paths when running on production
3
+ grobidHome: "grobid-home"
4
+
5
+ # how to load the models,
6
+ # false -> models are loaded only when needed (default), which avoids keeping unused models in memory
7
+ # true -> all the models are loaded into memory at server startup, which slows the start of the services, and models not
8
+ # used will take some memory
9
+ modelPreload: true
10
+
11
+ server:
12
+ type: custom
13
+ applicationConnectors:
14
+ - type: http
15
+ port: 8070
16
+ adminConnectors:
17
+ - type: http
18
+ port: 8071
19
+ registerDefaultExceptionMappers: false
20
+
21
+
22
+ logging:
23
+ level: WARN
24
+ loggers:
25
+ org.apache.pdfbox.pdmodel.font.PDSimpleFont: "OFF"
26
+ appenders:
27
+ - type: console
28
+ threshold: ALL
29
+ timeZone: UTC
30
+ # - type: file
31
+ # currentLogFilename: logs/grobid-service.log
32
+ # threshold: ALL
33
+ # archive: true
34
+ # archivedLogFilenamePattern: logs/grobid-service-%d.log
35
+ # archivedFileCount: 5
36
+ # timeZone: UTC
s2orc-doc2json/doc2json/grobid2json/grobid/grobid.properties ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #-------------------- resource directories ---------------------
2
+ # properties of where to find directories necessary for GROBID
3
+ # EACH KEY REFERENCING A PATH HAS TO END WITH ".path"
4
+ grobid.resource.path=./resources
5
+ grobid.temp.path=./tmp
6
+ grobid.bin.path=./bin
7
+
8
+ #-------------------- external/native libs ---------------------
9
+ #path to folder containing native libraries of 3rd parties
10
+ grobid.nativelibrary.path=./lib
11
+ grobid.3rdparty.pdf2xml.path=./pdf2xml
12
+ grobid.3rdparty.pdf2xml.memory.limit.mb=6096
13
+ grobid.3rdparty.pdf2xml.timeout.sec=60
14
+ #-------------------------------------------------------------
15
+
16
+ #-------------------- consolidation --------------------
17
+ # Define the bibliographical data consolidation service to be used, either "crossref" for CrossRef REST API or "glutton" for https://github.com/kermitt2/biblio-glutton
18
+ grobid.consolidation.service=crossref
19
+ #grobid.consolidation.service=glutton
20
+ #org.grobid.glutton.host=cloud.science-miner.com/glutton
21
+ #org.grobid.glutton.port=0
22
+ org.grobid.glutton.host=localhost
23
+ org.grobid.glutton.port=8070
24
+ #org.grobid.crossref.mailto=toto@titi.tutu
25
+ #org.grobid.crossref.token=yourmysteriouscrossrefmetadataplusauthorizationtokentobeputhere
26
+
27
+ #-------------------- proxy --------------------
28
+ #proxy to be used for external call to the crossref REST API service or Glutton service if not deployed under proxy ("null" when no proxy)
29
+ grobid.proxy_host=null
30
+ grobid.proxy_port=null
31
+ #------------------------------------------------------
32
+
33
+ #-------------------- runtime ------------------
34
+ grobid.crf.engine=wapiti
35
+ #grobid.crf.engine=delft
36
+ #grobid.crf.engine=crfpp
37
+ grobid.delft.install=../delft
38
+ grobid.delft.useELMo=false
39
+ grobid.pdf.blocks.max=100000
40
+ grobid.pdf.tokens.max=1000000
41
+
42
+ #-------------------- training ------------------
43
+ #number of threads for training the wapiti models (0 to use all available processors)
44
+ grobid.nb_threads=0
45
+
46
+ #-------------------- language identification ------------------
47
+ #property for using or not the language identifier (true|false)
48
+ grobid.use_language_id=true
49
+ grobid.language_detector_factory=org.grobid.core.lang.impl.CybozuLanguageDetectorFactory
50
+ #determines if resources like the firstnames, lastnames, country codes and dictionaries are supposed to be read from the $GROBID_HOME path or not (possible values: true|false, default is false)
51
+ grobid.resources.inHome=true
52
+ #------------------------------------------------------
53
+
54
+ #-------------------- pooling -------------------
55
+ # Maximum parallel connections allowed
56
+ org.grobid.max.connections=72
57
+ # Maximum time wait to get a connection when the pool is full (in seconds)
58
+ org.grobid.pool.max.wait=1
59
+ #------------------------------------------------------
s2orc-doc2json/doc2json/grobid2json/grobid/grobid_client.py ADDED
@@ -0,0 +1,249 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import io
3
+ import json
4
+ import argparse
5
+ import time
6
+ import glob
7
+ from doc2json.grobid2json.grobid.client import ApiClient
8
+ import ntpath
9
+ from typing import List
10
+
11
+ '''
12
+ This version uses the standard ProcessPoolExecutor for parallelizing the concurrent calls to the GROBID services.
13
+ Given the limits of ThreadPoolExecutor (input stored in memory, blocking Executor.map until the whole input
14
+ is acquired), it works with batches of PDF of a size indicated in the config.json file (default is 1000 entries).
15
+ We are moving from first batch to the second one only when the first is entirely processed - which means it is
16
+ slightly sub-optimal, but should scale better. However acquiring a list of million of files in directories would
17
+ require something scalable too, which is not implemented for the moment.
18
+ '''
19
+
20
# Settings used by GrobidClient when the caller does not supply a config.
DEFAULT_GROBID_CONFIG = {
    "grobid_server": "localhost",  # host part of the service URL
    "grobid_port": "8070",  # port part of the service URL (concatenated into it)
    "batch_size": 1000,  # number of PDFs collected before process_batch is called
    "sleep_time": 5,  # seconds to wait before retrying after an HTTP 503
    # The following flags are forwarded to GROBID as '1'/'0' form fields.
    "generateIDs": False,
    "consolidate_header": False,
    "consolidate_citations": False,
    "include_raw_citations": True,
    "include_raw_affiliations": False,
    "max_workers": 2,  # stored on the client; not read elsewhere in the visible code
}
32
+
33
class GrobidClient(ApiClient):
    """Client for the GROBID REST services used by doc2json.

    Requests are sent to ``http://<grobid_server>:<grobid_port>/api/<service>``.
    PDFs in a batch are processed one after the other. An HTTP 503 answer is
    treated as "server busy": the client sleeps ``sleep_time`` seconds and
    tries again.
    """

    def __init__(self, config=None):
        """Create a client.

        Args:
            config (dict or None): Settings overriding ``DEFAULT_GROBID_CONFIG``.
                Keys that are missing fall back to the defaults.
        """
        # Merge over the defaults so a partial config does not raise KeyError.
        self.config = {**DEFAULT_GROBID_CONFIG, **(config or {})}
        self.generate_ids = self.config["generateIDs"]
        self.consolidate_header = self.config["consolidate_header"]
        self.consolidate_citations = self.config["consolidate_citations"]
        self.include_raw_citations = self.config["include_raw_citations"]
        self.include_raw_affiliations = self.config["include_raw_affiliations"]
        self.max_workers = self.config["max_workers"]
        self.grobid_server = self.config["grobid_server"]
        self.grobid_port = self.config["grobid_port"]
        self.sleep_time = self.config["sleep_time"]

    def _service_url(self, service: str) -> str:
        """Return the absolute URL of the GROBID API endpoint ``service``."""
        # str() lets a JSON config give the port as a number.
        return 'http://' + self.grobid_server + ":" + str(self.grobid_port) + "/api/" + service

    def _post_until_available(self, url: str, **kwargs):
        """POST to ``url``, sleeping and retrying for as long as the answer is 503.

        Returns:
            tuple: ``(response, status_code)`` of the first non-503 answer.
        """
        while True:
            res, status = self.post(url=url, headers={'Accept': 'text/plain'}, **kwargs)
            if status != 503:
                return res, status
            time.sleep(self.sleep_time)

    def _process_text(self, service: str, the_data: dict, log_file: str, log_header: str, raw_string: str):
        """Send form data to a text service; log the raw input when it fails.

        Returns:
            The response text on HTTP 200, otherwise None.
        """
        res, status = self._post_until_available(self._service_url(service), data=the_data)
        if status != 200:
            with open(log_file, "a+") as failed:
                failed.write(log_header)
                failed.write(raw_string + "\n\n")
            return None
        return res.text

    def process(self, input: str, output: str, service: str):
        """Process every ``*.pdf`` directly inside ``input``, in batches.

        Args:
            input: Directory containing the PDF files.
            output: Directory receiving the ``.tei.xml`` files.
            service: Name of the GROBID service to call.
        """
        batch_size_pdf = self.config['batch_size']
        pdf_files = []

        for pdf_file in glob.glob(input + "/*.pdf"):
            pdf_files.append(pdf_file)

            if len(pdf_files) == batch_size_pdf:
                self.process_batch(pdf_files, output, service)
                pdf_files = []

        # last batch
        if pdf_files:
            self.process_batch(pdf_files, output, service)

    def process_batch(self, pdf_files: List[str], output: str, service: str) -> None:
        """Process the given PDF files sequentially."""
        print(len(pdf_files), "PDF files to process")
        for pdf_file in pdf_files:
            self.process_pdf(pdf_file, output, service)

    def process_pdf_stream(self, pdf_file: str, pdf_strm: bytes, output: str, service: str) -> str:
        """Send PDF bytes to a GROBID service and return the TEI text.

        Args:
            pdf_file: Name reported to GROBID for the upload; also the name
                (without a trailing ``.pdf``) written to ``failed.log``.
            pdf_strm: Raw content of the PDF.
            output: Directory in which ``failed.log`` is appended on failure.
            service: Name of the GROBID service to call.

        Returns:
            The response text on HTTP 200, otherwise an empty string.
        """
        # process the stream
        files = {
            'input': (
                pdf_file,
                pdf_strm,
                'application/pdf',
                {'Expires': '0'}
            )
        }

        # set the GROBID parameters
        the_data = {
            'generateIDs': '1' if self.generate_ids else '0',
            'consolidateHeader': '1' if self.consolidate_header else '0',
            'consolidateCitations': '1' if self.consolidate_citations else '0',
            'includeRawAffiliations': '1' if self.include_raw_affiliations else '0',
            'includeRawCitations': '1' if self.include_raw_citations else '0',
        }

        # Retrying in a loop (instead of recursing) keeps all arguments intact
        # and cannot exhaust the recursion limit while the server stays busy.
        res, status = self._post_until_available(
            self._service_url(service),
            files=files,
            data=the_data,
        )

        if status != 200:
            # str.strip(".pdf") would remove any of the characters '.', 'p',
            # 'd', 'f' from both ends; only the suffix must go.
            failed_name = pdf_file[:-len(".pdf")] if pdf_file.endswith(".pdf") else pdf_file
            with open(os.path.join(output, "failed.log"), "a+") as failed:
                failed.write(failed_name + "\n")
            print('Processing failed with error ' + str(status))
            return ""
        return res.text

    def process_pdf(self, pdf_file: str, output: str, service: str) -> None:
        """Convert one PDF file to ``<output>/<name>.tei.xml`` unless it already exists."""
        # check if TEI file is already produced
        # we use ntpath here to be sure it will work on Windows too
        pdf_file_name = ntpath.basename(pdf_file)
        filename = os.path.join(output, os.path.splitext(pdf_file_name)[0] + '.tei.xml')
        if os.path.isfile(filename):
            return

        print(pdf_file)
        with open(pdf_file, 'rb') as pdf_handle:
            pdf_strm = pdf_handle.read()
        tei_text = self.process_pdf_stream(pdf_file, pdf_strm, output, service)

        # writing TEI file
        if tei_text:
            with io.open(filename, 'w+', encoding='utf8') as tei_file:
                tei_file.write(tei_text)

    def process_citation(self, bib_string: str, log_file: str) -> str:
        """Parse a raw citation string with the ``processCitation`` service.

        At most five attempts are made. A 503 answer waits ``sleep_time``
        seconds before the next attempt; any other non-200 answer is logged to
        ``log_file`` and ends the attempts.

        Returns:
            The response text on HTTP 200, otherwise None.
        """
        # process citation raw string and return corresponding dict
        the_data = {
            'citations': bib_string,
            'consolidateCitations': '0'
        }

        the_url = self._service_url("processCitation")

        for _ in range(5):
            try:
                res, status = self.post(
                    url=the_url,
                    data=the_data,
                    headers={'Accept': 'text/plain'}
                )
                if status == 503:
                    time.sleep(self.sleep_time)
                    continue
                elif status != 200:
                    with open(log_file, "a+") as failed:
                        failed.write("-- BIBSTR --\n")
                        failed.write(bib_string + "\n\n")
                    break
                else:
                    return res.text
            except Exception:
                # Deliberate best effort: a failed attempt just uses up one
                # of the five tries.
                continue
        return None

    def process_header_names(self, header_string: str, log_file: str) -> str:
        """Parse author names with the ``processHeaderNames`` service.

        Returns:
            The response text on HTTP 200; otherwise the input is appended to
            ``log_file`` and None is returned.
        """
        # process author names from header string
        return self._process_text(
            "processHeaderNames",
            {'names': header_string},
            log_file,
            "-- AUTHOR --\n",
            header_string,
        )

    def process_affiliations(self, aff_string: str, log_file: str) -> str:
        """Parse an affiliation string with the ``processAffiliations`` service.

        Returns:
            The response text on HTTP 200; otherwise the input is appended to
            ``log_file`` and None is returned.
        """
        # process affiliation from input string
        return self._process_text(
            "processAffiliations",
            {'affiliations': aff_string},
            log_file,
            "-- AFFILIATION --\n",
            aff_string,
        )
226
+
227
+
228
if __name__ == "__main__":
    # Command-line entry point: run one GROBID service over a directory of PDFs.
    parser = argparse.ArgumentParser(description="Client for GROBID services")
    parser.add_argument("service", help="one of [processFulltextDocument, processHeaderDocument, processReferences]")
    # Both directories are needed by GrobidClient.process; requiring them gives
    # a usage error instead of a TypeError on None further down.
    parser.add_argument("--input", required=True, help="path to the directory containing PDF to process")
    parser.add_argument("--output", required=True, help="path to the directory where to put the results")
    parser.add_argument("--config", default=None,
                        help="path to a JSON config file; the built-in defaults are used when omitted")

    args = parser.parse_args()

    input_path = args.input
    if args.config:
        # Close the config file as soon as it has been parsed.
        with open(args.config) as config_file:
            config = json.load(config_file)
    else:
        config = DEFAULT_GROBID_CONFIG
    output_path = args.output
    service = args.service

    # TEI files and failed.log are written here, so it has to exist.
    os.makedirs(output_path, exist_ok=True)

    client = GrobidClient(config=config)

    start_time = time.time()

    client.process(input_path, output_path, service)

    runtime = round(time.time() - start_time, 3)
    print("runtime: %s seconds " % (runtime))
s2orc-doc2json/doc2json/grobid2json/pdf_to_tei.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ from typing import Dict, List
4
+ from PyPDF2 import PdfFileReader
5
+
6
+
7
+
s2orc-doc2json/doc2json/grobid2json/process_pdf.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import argparse
4
+ import time
5
+ from bs4 import BeautifulSoup
6
+ from typing import Optional, Dict
7
+
8
+ from doc2json.grobid2json.grobid.grobid_client import GrobidClient
9
+ from doc2json.grobid2json.tei_to_json import convert_tei_xml_file_to_s2orc_json, convert_tei_xml_soup_to_s2orc_json
10
+
11
# Default directory for the intermediate TEI XML files.
BASE_TEMP_DIR = 'temp'
# Default directory for the resulting JSON files.
BASE_OUTPUT_DIR = 'output'
# NOTE(review): not referenced anywhere in the visible part of this file.
BASE_LOG_DIR = 'log'
14
+
15
+
16
def process_pdf_stream(input_file: str, sha: str, input_stream: bytes, grobid_config: Optional[Dict] = None) -> Dict:
    """
    Convert an in-memory PDF into its JSON representation via GROBID
    :param input_file: name of the PDF, passed to GROBID and to the converter
    :param sha: hash of the PDF, passed through to the converter
    :param input_stream: raw bytes of the PDF
    :param grobid_config: optional GROBID client settings
    :return: result of ``release_json('pdf')`` on the converted paper
    """
    # PDF bytes -> TEI XML text; 'temp' is where the client logs failures
    grobid = GrobidClient(grobid_config)
    tei_xml = grobid.process_pdf_stream(input_file, input_stream, 'temp', "processFulltextDocument")

    # TEI XML text -> parsed tree -> paper object -> JSON
    tei_tree = BeautifulSoup(tei_xml, "xml")
    converted = convert_tei_xml_soup_to_s2orc_json(tei_tree, input_file, sha)
    return converted.release_json('pdf')
35
+
36
+
37
def process_pdf_file(
        input_file: str,
        temp_dir: str = BASE_TEMP_DIR,
        output_dir: str = BASE_OUTPUT_DIR,
        grobid_config: Optional[Dict] = None
) -> str:
    """
    Process a PDF file and get JSON representation
    :param input_file: path of the PDF to convert
    :param temp_dir: directory receiving the intermediate ``.tei.xml`` file
    :param output_dir: directory receiving the ``.json`` file
    :param grobid_config: optional GROBID client settings
    :return: path of the JSON file that was written
    :raises FileNotFoundError: if ``input_file`` does not exist
    :raises AssertionError: if no TEI file exists after calling GROBID
    """
    # Local import: derive the file name the same way GrobidClient.process_pdf
    # does, so the TEI path computed here matches the one it writes.
    import ntpath

    os.makedirs(temp_dir, exist_ok=True)
    os.makedirs(output_dir, exist_ok=True)

    # get paper id as the name of the file (directories and last extension removed)
    paper_id = os.path.splitext(ntpath.basename(input_file))[0]
    tei_file = os.path.join(temp_dir, f'{paper_id}.tei.xml')
    output_file = os.path.join(output_dir, f'{paper_id}.json')

    # check if input file exists and output file doesn't
    if not os.path.exists(input_file):
        raise FileNotFoundError(f"{input_file} doesn't exist")
    if os.path.exists(output_file):
        # Only a notice: the file is regenerated and overwritten below.
        print(f'{output_file} already exists!')

    # process PDF through Grobid -> TEI.XML
    client = GrobidClient(grobid_config)
    # TODO: compute PDF hash
    # TODO: add grobid version number to output
    client.process_pdf(input_file, temp_dir, "processFulltextDocument")

    # process TEI.XML -> JSON
    # Raised explicitly (same exception type as before) so the check is not
    # removed when Python runs with -O.
    if not os.path.exists(tei_file):
        raise AssertionError(f"GROBID produced no TEI file at {tei_file}")
    paper = convert_tei_xml_file_to_s2orc_json(tei_file)

    # write to file
    with open(output_file, 'w') as outf:
        json.dump(paper.release_json(), outf, indent=4, sort_keys=False)

    return output_file
79
+
80
+
81
if __name__ == '__main__':
    # Command-line entry point: convert a single PDF file to JSON.
    parser = argparse.ArgumentParser(description="Run S2ORC PDF2JSON")
    # The input path is mandatory; without it process_pdf_file fails on None.
    parser.add_argument("-i", "--input", required=True, help="path to the input PDF file")
    parser.add_argument("-t", "--temp", default=BASE_TEMP_DIR, help="path to the temp dir for putting tei xml files")
    parser.add_argument("-o", "--output", default=BASE_OUTPUT_DIR, help="path to the output dir for putting json files")
    # NOTE: accepted for command-line compatibility; nothing reads it yet.
    parser.add_argument("-k", "--keep", action='store_true')

    args = parser.parse_args()

    input_path = args.input
    temp_path = args.temp
    output_path = args.output

    start_time = time.time()

    os.makedirs(temp_path, exist_ok=True)
    os.makedirs(output_path, exist_ok=True)

    process_pdf_file(input_path, temp_path, output_path)

    runtime = round(time.time() - start_time, 3)
    print("runtime: %s seconds " % (runtime))
    print('done.')
s2orc-doc2json/doc2json/grobid2json/tei_to_json.py ADDED
@@ -0,0 +1,750 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+
3
+ import os
4
+ import sys
5
+ import bs4
6
+ import re
7
+ from bs4 import BeautifulSoup, NavigableString
8
+ from typing import List, Dict, Tuple
9
+
10
+ from doc2json.s2orc import Paper
11
+
12
+ from doc2json.utils.grobid_util import parse_bib_entry, extract_paper_metadata_from_grobid_xml
13
+ from doc2json.utils.citation_util import SINGLE_BRACKET_REGEX, BRACKET_REGEX, BRACKET_STYLE_THRESHOLD
14
+ from doc2json.utils.citation_util import is_expansion_string, _clean_empty_and_duplicate_authors_from_grobid_parse
15
+ from doc2json.utils.refspan_util import sub_spans_and_update_indices
16
+
17
+
18
# Literal substring substitutions that table_to_html applies, in this order,
# to the serialized table to turn row/cell markup into HTML table markup.
REPLACE_TABLE_TOKS = {
    "<row>": "<tr>",
    "<row/>": "<tr/>",
    "</row>": "</tr>",
    "<cell>": "<td>",
    "<cell/>": "<td/>",
    "</cell>": "</td>",
    "<cell ": "<td ",  # opening cell tag that carries attributes
    "cols=": "colspan="
}
28
+
29
+
30
class UniqTokenGenerator:
    """
    Endless iterator producing ``<prefix>0``, ``<prefix>1``, ...
    """
    def __init__(self, tok_string):
        # prefix shared by all tokens, and the number of the next token
        self.tok_string = tok_string
        self.ind = 0

    def __iter__(self):
        return self

    def __next__(self):
        return self.next()

    def next(self):
        current = self.ind
        self.ind = current + 1
        return '{}{}'.format(self.tok_string, current)
48
+
49
+
50
def normalize_grobid_id(grobid_id: str):
    """
    Normalize grobid object identifiers

    The id is upper-cased, ``_`` and ``#`` are removed, and a leading
    ``B`` / ``TAB`` / ``FIG`` / ``FORMULA`` is rewritten to
    ``BIBREF`` / ``TABREF`` / ``FIGREF`` / ``EQREF``. Only the leading prefix
    is rewritten; the rest of the id is left as it is.
    :param grobid_id: identifier such as ``#b12`` or ``tab_0``
    :return: normalized identifier, e.g. ``BIBREF12`` or ``TABREF0``
    """
    str_norm = grobid_id.upper().replace('_', '').replace('#', '')
    prefix_map = (
        ('B', 'BIBREF'),
        ('TAB', 'TABREF'),
        ('FIG', 'FIGREF'),
        ('FORMULA', 'EQREF'),
    )
    for prefix, normalized in prefix_map:
        if str_norm.startswith(prefix):
            # Slice instead of str.replace so later occurrences of the same
            # letters inside the id are not rewritten as well.
            return normalized + str_norm[len(prefix):]
    return str_norm
66
+
67
+
68
def parse_bibliography(soup: BeautifulSoup) -> List[Dict]:
    """
    Collect the parsed bibliography entries of a grobid xml that have a
    title, then remove the bibliography element from the tree.
    """
    bib_root = soup.listBibl
    if bib_root is None:
        return []

    # parse every biblStruct, keeping only the entries with a non-empty title
    parsed = (parse_bib_entry(struct) for struct in bib_root.find_all("biblStruct"))
    titled_entries = [item for item in parsed if item['title']]

    # all entries are parsed by now, so the element can be dropped
    bib_root.decompose()

    return titled_entries
88
+
89
+
90
def extract_formulas_from_tei_xml(sp: BeautifulSoup) -> None:
    """
    Swap every formula element for a plain string holding its stripped text
    :param sp: parsed document, modified in place
    :return:
    """
    formula_tags = sp.find_all('formula')
    for formula in formula_tags:
        plain_text = formula.text.strip()
        formula.replace_with(sp.new_string(plain_text))
98
+
99
+
100
def table_to_html(table: bs4.element.Tag) -> str:
    """
    Sub table tags with html table tags

    Direct children of the table that are not ``row`` elements are reported
    and removed; the remaining markup is serialized and rewritten with
    ``REPLACE_TABLE_TOKS``.
    :param table: table element; modified in place
    :return: the table serialized as an HTML-style string
    """
    # Iterate over a snapshot: decompose() removes the child from the live
    # child list, and iterating that list directly skips the element that
    # follows every removed one.
    for tag in list(table):
        if tag.name != 'row':
            print(f'Unknown table subtag: {tag.name}')
            tag.decompose()
    table_str = str(table)
    for token, subtoken in REPLACE_TABLE_TOKS.items():
        table_str = table_str.replace(token, subtoken)
    return table_str
114
+
115
+
116
def extract_figures_and_tables_from_tei_xml(sp: BeautifulSoup) -> Dict[str, Dict]:
    """
    Generate figure and table dicts

    Every ``figure`` element with an ``xml:id`` yields one entry keyed by the
    normalized id. The processed ``figure`` elements are removed from ``sp``.
    :param sp: parsed document, modified in place
    :return: dict mapping normalized id -> {"text", "latex", "type", "content"}
    """
    ref_map = dict()

    for fig in sp.find_all('figure'):
        try:
            if fig.name and fig.get('xml:id'):
                if fig.get('type') == 'table':
                    ref_map[normalize_grobid_id(fig.get('xml:id'))] = {
                        # description if present, else the head text, else empty
                        "text": fig.figDesc.text.strip() if fig.figDesc else fig.head.text.strip() if fig.head else "",
                        "latex": None,
                        "type": "table",
                        "content": table_to_html(fig.table)
                    }
                else:
                    ref_map[normalize_grobid_id(fig.get('xml:id'))] = {
                        "text": fig.figDesc.text.strip() if fig.figDesc else "",
                        "latex": None,
                        "type": "figure",
                        "content": ""
                    }
        except AttributeError:
            # the figure is skipped and, because of the continue, stays in the tree
            continue
        fig.decompose()

    return ref_map
146
+
147
+
148
def check_if_citations_are_bracket_style(sp: BeautifulSoup) -> bool:
    """
    Check if the document has bracket style citations
    :param sp: parsed document
    :return: True if more than BRACKET_STYLE_THRESHOLD citation strings match
        BRACKET_REGEX, otherwise False
    """
    cite_strings = []
    if sp.body:
        for div in sp.body.find_all('div'):
            # divs that have a head element are not inspected
            if div.head:
                continue
            for rtag in div.find_all('ref'):
                ref_type = rtag.get('type')
                # only bibliographic references count as citations
                if ref_type == 'bibr':
                    cite_strings.append(rtag.text.strip())

        # check how many match bracket style
        bracket_style = [bool(BRACKET_REGEX.match(cite_str)) for cite_str in cite_strings]

        # return true if the number of matches exceeds the threshold
        if sum(bracket_style) > BRACKET_STYLE_THRESHOLD:
            return True

    return False
172
+
173
+
174
def sub_all_note_tags(sp: BeautifulSoup) -> BeautifulSoup:
    """
    Sub all note tags with p tags

    Each ``note`` element is replaced by a new ``p`` element whose content is
    the note's stripped text.
    :param sp: parsed document, modified in place
    :return: the same ``sp`` object
    """
    for ntag in sp.find_all('note'):
        p_tag = sp.new_tag('p')
        p_tag.string = ntag.text.strip()
        ntag.replace_with(p_tag)
    return sp
186
+
187
+
188
def process_formulas_in_paragraph(para_el: BeautifulSoup, sp: BeautifulSoup) -> None:
    """
    Process all formulas in paragraph and replace with text and label
    :param para_el: paragraph element whose formulas are replaced in place
    :param sp: document used to create the replacement strings
    :return:
    """
    for ftag in para_el.find_all('formula'):
        # get label if exists and insert a space between formula and label
        if ftag.label:
            label = ' ' + ftag.label.text
            # remove the label first so it is not part of ftag.text below
            ftag.label.decompose()
        else:
            label = ''
        ftag.replace_with(sp.new_string(f'{ftag.text.strip()}{label}'))
203
+
204
+
205
def process_references_in_paragraph(para_el: BeautifulSoup, sp: BeautifulSoup, refs: Dict) -> Dict:
    """
    Process all references in paragraph and generate a dict that maps each
    placeholder token to (ref_id, surface_form, type)

    Table and figure references are replaced in the paragraph by a
    ``REFTOKEN<n>`` placeholder surrounded by spaces. Citations (``bibr``) are
    left untouched; every other reference is replaced by its stripped text.
    :param para_el: paragraph element, modified in place
    :param sp: document used to create the replacement strings
    :param refs: known figure/table entries keyed by normalized id
    :return: dict mapping placeholder token -> (normalized id or None, surface text, ref type)
    """
    tokgen = UniqTokenGenerator('REFTOKEN')
    ref_dict = dict()
    for rtag in para_el.find_all('ref'):
        try:
            ref_type = rtag.get('type')
            # skip if citation
            if ref_type == 'bibr':
                continue
            if ref_type == 'table' or ref_type == 'figure':
                ref_id = rtag.get('target')
                if ref_id and normalize_grobid_id(ref_id) in refs:
                    # normalize reference string
                    rtag_string = normalize_grobid_id(ref_id)
                else:
                    # target missing or not among the known figures/tables
                    rtag_string = None
                # add to ref set
                ref_key = tokgen.next()
                ref_dict[ref_key] = (rtag_string, rtag.text.strip(), ref_type)
                rtag.replace_with(sp.new_string(f" {ref_key} "))
            else:
                # replace with surface form
                rtag.replace_with(sp.new_string(rtag.text.strip()))
        except AttributeError:
            # leave this reference as it is and move on to the next one
            continue
    return ref_dict
238
+
239
+
240
def process_citations_in_paragraph(para_el: BeautifulSoup, sp: BeautifulSoup, bibs: Dict, bracket: bool) -> Dict:
    """
    Process all citations in paragraph and generate a dict for surface forms.

    Every <ref> tag still present in the paragraph is either replaced in place
    by a token obtained from UniqTokenGenerator('CITETOKEN'), replaced by its
    plain surface text, or (for the first half of a bracket range) left alone
    until the second half is reached.
    :param para_el: paragraph element; modified in place
    :param sp: soup used to create the replacement strings
    :param bibs: bibliography entries keyed by normalized reference id
    :param bracket: if True, only bracket-like surface forms are kept as
        citations and ranges such as [1]-[3] are expanded
    :return: dict mapping each generated token to (ref_id or None, surface form)
    """
    # CHECK if range between two surface forms is appropriate for bracket style expansion
    # Returns (start, end) only when both surfaces match SINGLE_BRACKET_REGEX and
    # the numeric gap is strictly between 1 and 20; otherwise None.
    def _get_surface_range(start_surface, end_surface):
        span1_match = SINGLE_BRACKET_REGEX.match(start_surface)
        span2_match = SINGLE_BRACKET_REGEX.match(end_surface)
        if span1_match and span2_match:
            # get numbers corresponding to citations
            span1_num = int(span1_match.group(1))
            span2_num = int(span2_match.group(1))
            # expand if range is between 1 and 20
            if 1 < span2_num - span1_num < 20:
                return span1_num, span2_num
        return None

    # CREATE BIBREF range between two reference ids, e.g. BIBREF1-BIBREF4 -> BIBREF1 BIBREF2 BIBREF3 BIBREF4
    # The first six characters of each id are dropped before parsing the number.
    # NOTE(review): int() raises ValueError for ids without a numeric suffix,
    # which the AttributeError handler in the main loop does not catch.
    def _create_ref_id_range(start_ref_id, end_ref_id):
        start_ref_num = int(start_ref_id[6:])
        end_ref_num = int(end_ref_id[6:])
        return [f'BIBREF{curr_ref_num}' for curr_ref_num in range(start_ref_num, end_ref_num + 1)]

    # CREATE surface form range between two bracket strings, e.g. [1]-[4] -> [1] [2] [3] [4]
    def _create_surface_range(start_number, end_number):
        return [f'[{n}]' for n in range(start_number, end_number + 1)]

    # create citation dict with keywords
    cite_map = dict()
    tokgen = UniqTokenGenerator('CITETOKEN')

    for rtag in para_el.find_all('ref'):
        try:
            # get surface span, e.g. [3]
            surface_span = rtag.text.strip()

            # check if target is available (#b2 -> BID2)
            if rtag.get('target'):
                # normalize reference string
                rtag_ref_id = normalize_grobid_id(rtag.get('target'))

                # target is not in the bibliography: keep the surface form as an
                # unlinked citation (ref id None) and move on
                if rtag_ref_id not in bibs:
                    cite_key = tokgen.next()
                    rtag.replace_with(sp.new_string(f" {cite_key} "))
                    cite_map[cite_key] = (None, surface_span)
                    continue

                # if bracket style, only keep if surface form is bracket
                if bracket:
                    # valid bracket span
                    if surface_span and (surface_span[0] == '[' or surface_span[-1] == ']' or surface_span[-1] == ','):
                        pass
                    # invalid, replace tag with surface form and continue to next ref tag
                    else:
                        rtag.replace_with(sp.new_string(f" {surface_span} "))
                        continue
                # not bracket, add cite span and move on
                else:
                    cite_key = tokgen.next()
                    rtag.replace_with(sp.new_string(f" {cite_key} "))
                    cite_map[cite_key] = (rtag_ref_id, surface_span)
                    continue

                ### EXTRA PROCESSING FOR BRACKET STYLE CITATIONS; EXPAND RANGES ###
                # look backward for range marker, e.g. [1]-*[3]*
                # collects the raw text between this tag and the previous <ref>;
                # the scan stops at the first sibling that is a tag of any kind
                backward_between_span = ""
                for sib in rtag.previous_siblings:
                    if sib.name == 'ref':
                        break
                    elif type(sib) == NavigableString:
                        backward_between_span += sib
                    else:
                        break

                # check if there's a backwards expansion, e.g. need to expand [1]-[3] -> [1] [2] [3]
                # (is_expansion_string is defined elsewhere; presumably it detects
                # range markers such as "-" -- confirm against its definition)
                if is_expansion_string(backward_between_span):
                    # get surface number range
                    # if there is no previous <ref> sibling, find_previous_sibling
                    # returns None and the resulting AttributeError is swallowed by
                    # the handler below, leaving this tag untouched
                    surface_num_range = _get_surface_range(
                        rtag.find_previous_sibling('ref').text.strip(),
                        surface_span
                    )
                    # if the surface number range is reasonable (range < 20, in order), EXPAND
                    if surface_num_range:
                        # delete previous ref tag and anything in between (i.e. delete "-" and extra spaces)
                        for sib in rtag.previous_siblings:
                            if sib.name == 'ref':
                                break
                            elif type(sib) == NavigableString:
                                sib.replace_with(sp.new_string(""))
                            else:
                                break

                        # get ref id of previous ref, e.g. [1] (#b0 -> BID0)
                        previous_rtag = rtag.find_previous_sibling('ref')
                        previous_rtag_ref_id = normalize_grobid_id(previous_rtag.get('target'))
                        previous_rtag.decompose()

                        # replace this ref tag with the full range expansion, e.g. [3] (#b2 -> BID1 BID2)
                        id_range = _create_ref_id_range(previous_rtag_ref_id, rtag_ref_id)
                        surface_range = _create_surface_range(surface_num_range[0], surface_num_range[1])
                        replace_string = ''
                        # zip stops at the shorter list, so ids and surface forms
                        # are paired positionally
                        for range_ref_id, range_surface_form in zip(id_range, surface_range):
                            # only replace if ref id is in bibliography, else add none
                            if range_ref_id in bibs:
                                cite_key = tokgen.next()
                                cite_map[cite_key] = (range_ref_id, range_surface_form)
                            else:
                                cite_key = tokgen.next()
                                cite_map[cite_key] = (None, range_surface_form)
                            replace_string += cite_key + ' '
                        rtag.replace_with(sp.new_string(f" {replace_string} "))
                    # ELSE do not expand backwards and replace previous and current rtag with appropriate ref id
                    else:
                        # the previous tag was deliberately left in place by the
                        # forward-looking branch below, so it is tokenized here
                        # add mapping between ref id and surface form for previous ref tag
                        previous_rtag = rtag.find_previous_sibling('ref')
                        previous_rtag_ref_id = normalize_grobid_id(previous_rtag.get('target'))
                        previous_rtag_surface = previous_rtag.text.strip()
                        cite_key = tokgen.next()
                        previous_rtag.replace_with(sp.new_string(f" {cite_key} "))
                        cite_map[cite_key] = (previous_rtag_ref_id, previous_rtag_surface)

                        # add mapping between ref id and surface form for current reftag
                        cite_key = tokgen.next()
                        rtag.replace_with(sp.new_string(f" {cite_key} "))
                        cite_map[cite_key] = (rtag_ref_id, surface_span)
                else:
                    # look forward and see if expansion string, e.g. *[1]*-[3]
                    forward_between_span = ""
                    for sib in rtag.next_siblings:
                        if sib.name == 'ref':
                            break
                        elif type(sib) == NavigableString:
                            forward_between_span += sib
                        else:
                            break
                    # look forward for range marker (if is a range, continue -- range will be expanded
                    # when we get to the second value)
                    if is_expansion_string(forward_between_span):
                        continue
                    # else treat like normal reference
                    else:
                        cite_key = tokgen.next()
                        rtag.replace_with(sp.new_string(f" {cite_key} "))
                        cite_map[cite_key] = (rtag_ref_id, surface_span)

            else:
                # no target attribute: record an unlinked citation
                cite_key = tokgen.next()
                rtag.replace_with(sp.new_string(f" {cite_key} "))
                cite_map[cite_key] = (None, surface_span)
        except AttributeError:
            # best-effort: a tag that cannot be processed is left as it is
            continue

    return cite_map
399
+
400
+
401
def process_paragraph(
        sp: BeautifulSoup,
        para_el: bs4.element.Tag,
        section_names: List[Tuple],
        bib_dict: Dict,
        ref_dict: Dict,
        bracket: bool
) -> Dict:
    """
    Turn one paragraph element into a paragraph blob.

    :param sp: soup used to create replacement strings
    :param para_el: paragraph element; modified in place
    :param section_names: section entries stored under 'section' in the result
    :param bib_dict: bibliography entries keyed by normalized id
    :param ref_dict: figure/table entries keyed by normalized id
    :param bracket: if bracket style, expand and clean up citations
    :return: dict with keys 'text', 'cite_spans', 'ref_spans', 'eq_spans', 'section'
    """
    # return empty paragraph if no text
    if not para_el.text:
        return {
            'text': "",
            'cite_spans': [],
            'ref_spans': [],
            'eq_spans': [],
            'section': section_names
        }

    # formulas first, then figure/table references, then citations
    process_formulas_in_paragraph(para_el, sp)
    ref_map = process_references_in_paragraph(para_el, sp, ref_dict)
    cite_map = process_citations_in_paragraph(para_el, sp, bib_dict, bracket)

    # substitute space characters
    para_text = re.sub(r'\s', ' ', re.sub(r'\s+', ' ', para_el.text))

    # locate every placeholder token: citation tokens first, then reference tokens
    all_spans_to_replace = []
    token_sources = (
        (r'(CITETOKEN\d+)', cite_map),
        (r'(REFTOKEN\d+)', ref_map),
    )
    for pattern, token_map in token_sources:
        for match in re.finditer(pattern, para_text):
            token = match.group()
            begin = match.start()
            # index 1 of every map entry is the surface text
            all_spans_to_replace.append((begin, begin + len(token), token, token_map[token][1]))

    # replace cite and ref spans and create json blobs
    para_text, all_spans_to_replace = sub_spans_and_update_indices(all_spans_to_replace, para_text)

    cite_span_blobs = []
    ref_span_blobs = []
    for start, end, token, surface in all_spans_to_replace:
        if token.startswith('CITETOKEN'):
            cite_span_blobs.append({
                "start": start,
                "end": end,
                "text": surface,
                "ref_id": cite_map[token][0]
            })
        if token.startswith('REFTOKEN'):
            ref_span_blobs.append({
                "start": start,
                "end": end,
                "text": surface,
                "ref_id": ref_map[token][0]
            })

    # sanity check: every span must point at its own surface text
    for blob in cite_span_blobs + ref_span_blobs:
        assert para_text[blob["start"]:blob["end"]] == blob["text"]

    return {
        'text': para_text,
        'cite_spans': cite_span_blobs,
        'ref_spans': ref_span_blobs,
        'eq_spans': [],
        'section': section_names
    }
493
+
494
+
495
def extract_abstract_from_tei_xml(
        sp: BeautifulSoup,
        bib_dict: Dict,
        ref_dict: Dict,
        cleanup_bracket: bool
) -> List[Dict]:
    """
    Parse the abstract from the soup and remove it from the tree.

    :param sp: soup of the TEI document; the abstract tag is decomposed afterwards
    :param bib_dict: bibliography entries keyed by normalized id
    :param ref_dict: figure/table entries keyed by normalized id
    :param cleanup_bracket: passed through to process_paragraph
    :return: list of paragraph blobs, all in section (None, "Abstract")
    """
    def _as_abstract_paragraph(element):
        # a fresh section list per paragraph, so blobs never share one object
        return process_paragraph(sp, element, [(None, "Abstract")], bib_dict, ref_dict, cleanup_bracket)

    abstract_text = []
    abstract = sp.abstract
    if not abstract:
        return abstract_text

    if abstract.div:
        # process all divs
        for div in abstract.find_all('div'):
            if not div.text:
                continue
            if div.p:
                abstract_text.extend(
                    _as_abstract_paragraph(para) for para in div.find_all('p') if para.text
                )
            else:
                abstract_text.append(_as_abstract_paragraph(div))
    elif abstract.p:
        # process all paragraphs
        abstract_text.extend(
            _as_abstract_paragraph(para) for para in abstract.find_all('p') if para.text
        )
    elif abstract.text:
        # else just try to get the text
        abstract_text.append(_as_abstract_paragraph(abstract))

    abstract.decompose()
    return abstract_text
541
+
542
+
543
def extract_body_text_from_div(
        sp: BeautifulSoup,
        div: bs4.element.Tag,
        sections: List[Tuple],
        bib_dict: Dict,
        ref_dict: Dict,
        cleanup_bracket: bool
) -> List[Dict]:
    """
    Collect paragraph and equation blobs from a div, recursing into nested divs.

    :param sp: soup used to create replacement strings
    :param div: element whose content is parsed
    :param sections: section entries inherited from the enclosing divs
    :param bib_dict: bibliography entries keyed by normalized id
    :param ref_dict: figure/table entries keyed by normalized id
    :param cleanup_bracket: passed through to process_paragraph
    :return: list of paragraph/equation blobs
    """
    collected: List[Dict] = []

    # check if nested divs; recursively process
    if div.div:
        for nested in div.find_all('div'):
            if nested.head:
                # has header, add to section list and process
                nested_sections = sections + [(nested.head.get('n', None), nested.head.text.strip())]
                collected += extract_body_text_from_div(
                    sp, nested, nested_sections, bib_dict, ref_dict, cleanup_bracket
                )
                # looked up again on purpose: the recursion may have altered the tree
                nested.head.decompose()
            else:
                # no header, process with same section list
                collected += extract_body_text_from_div(
                    sp, nested, sections, bib_dict, ref_dict, cleanup_bracket
                )

    # process the direct children one by one
    for child in div:
        try:
            kind = child.name
            if kind == 'p':
                if child.text:
                    collected.append(process_paragraph(
                        sp, child, sections, bib_dict, ref_dict, cleanup_bracket
                    ))
            elif kind == 'formula':
                # e.g. <formula xml:id="formula_0">Y = W T X.<label>(1)</label></formula>
                equation_number = child.label.text
                child.label.decompose()
                raw_equation = child.text
                equation_span = {
                    "start": 0,
                    "end": 8,
                    "text": "EQUATION",
                    "ref_id": "EQREF",
                    "raw_str": raw_equation,
                    "eq_num": equation_number
                }
                collected.append({
                    'text': 'EQUATION',
                    'cite_spans': [],
                    'ref_spans': [],
                    'eq_spans': [equation_span],
                    'section': sections
                })
        except AttributeError:
            # fallback: treat the child as a plain paragraph
            if child.text:
                collected.append(process_paragraph(
                    sp, child, sections, bib_dict, ref_dict, cleanup_bracket
                ))

    return collected
622
+
623
+
624
def extract_body_text_from_tei_xml(
        sp: BeautifulSoup,
        bib_dict: Dict,
        ref_dict: Dict,
        cleanup_bracket: bool
) -> List[Dict]:
    """
    Parse the body text from the soup and remove the body from the tree.

    :param sp: soup of the TEI document
    :param bib_dict: bibliography entries keyed by normalized id
    :param ref_dict: figure/table entries keyed by normalized id
    :param cleanup_bracket: passed through to extract_body_text_from_div
    :return: list of paragraph blobs; empty when there is no body
    """
    if not sp.body:
        return []
    # the body starts with an empty section path
    paragraphs = extract_body_text_from_div(sp, sp.body, [], bib_dict, ref_dict, cleanup_bracket)
    sp.body.decompose()
    return paragraphs
643
+
644
+
645
def extract_back_matter_from_tei_xml(
        sp: BeautifulSoup,
        bib_dict: Dict,
        ref_dict: Dict,
        cleanup_bracket: bool
) -> List[Dict]:
    """
    Parse the back matter from the soup and remove it from the tree.

    :param sp: soup of the TEI document
    :param bib_dict: bibliography entries keyed by normalized id
    :param ref_dict: figure/table entries keyed by normalized id
    :param cleanup_bracket: passed through to process_paragraph
    :return: list of paragraph blobs; empty when there is no back matter
    """
    back_text = []
    if not sp.back:
        return back_text

    for div in sp.back.find_all('div'):
        # the div's type is the fallback title for children without a header
        section_type = div.get('type') or ''

        for child_div in div.find_all('div'):
            header = child_div.head
            if header:
                section_title = header.text.strip()
                section_num = header.get('n', None)
                header.decompose()
            else:
                section_title, section_num = section_type, None

            if child_div.text:
                back_text.append(
                    process_paragraph(
                        sp, child_div, [(section_num, section_title)], bib_dict, ref_dict, cleanup_bracket
                    )
                )

    sp.back.decompose()
    return back_text
683
+
684
+
685
def convert_tei_xml_soup_to_s2orc_json(soup: BeautifulSoup, paper_id: str, pdf_hash: str) -> Paper:
    """
    Convert Grobid TEI XML to S2ORC json format
    :param soup: BeautifulSoup of XML file content
    :param paper_id: name of file
    :param pdf_hash: hash of PDF
    :return: Paper assembled from metadata, abstract, body, back matter, bibliography and refs
    """
    # metadata, with authors cleaned (remove dupes etc)
    metadata = extract_paper_metadata_from_grobid_xml(soup.fileDesc)
    metadata['authors'] = _clean_empty_and_duplicate_authors_from_grobid_parse(metadata['authors'])

    # bibliography entries (empty bib entries are removed), keyed by normalized id
    bibkey_map = {}
    for bib in parse_bibliography(soup):
        bibkey_map[normalize_grobid_id(bib['ref_id'])] = bib

    # # process formulas and replace with text
    # extract_formulas_from_tei_xml(soup)

    # figure and table captions
    refkey_map = extract_figures_and_tables_from_tei_xml(soup)

    # bracket style is determined before note tags are rewritten
    is_bracket_style = check_if_citations_are_bracket_style(soup)

    # substitute all note tags with p tags
    soup = sub_all_note_tags(soup)

    # abstract, body and back matter are extracted in this order
    shared_args = (soup, bibkey_map, refkey_map, is_bracket_style)
    abstract_entries = extract_abstract_from_tei_xml(*shared_args)
    body_entries = extract_body_text_from_tei_xml(*shared_args)
    # back matter (acks, author statements, competing interests, abbrevs etc)
    back_matter = extract_back_matter_from_tei_xml(*shared_args)

    # form final paper entry
    return Paper(
        paper_id=paper_id,
        pdf_hash=pdf_hash,
        metadata=metadata,
        abstract=abstract_entries,
        body_text=body_entries,
        back_matter=back_matter,
        bib_entries=bibkey_map,
        ref_entries=refkey_map
    )
736
+
737
+
738
def convert_tei_xml_file_to_s2orc_json(tei_file: str, pdf_hash: str = "") -> Paper:
    """
    Convert a TEI XML file to S2ORC JSON
    :param tei_file: path to the TEI XML file; the paper id is the file name
        up to its first '.'
    :param pdf_hash: hash of the source PDF, stored on the resulting Paper
    :return: Paper built by convert_tei_xml_soup_to_s2orc_json
    :raises FileNotFoundError: if tei_file does not exist
    """
    if not os.path.exists(tei_file):
        raise FileNotFoundError("Input TEI XML file doesn't exist")
    # basename handles the platform's path separators, unlike splitting on '/'
    paper_id = os.path.basename(tei_file).split('.')[0]
    # read inside a context manager so the handle is always closed
    with open(tei_file, "rb") as tei_handle:
        soup = BeautifulSoup(tei_handle.read(), "xml")
    return convert_tei_xml_soup_to_s2orc_json(soup, paper_id, pdf_hash)
s2orc-doc2json/doc2json/jats2json/__init__.py ADDED
File without changes
s2orc-doc2json/doc2json/jats2json/jats_to_json.py ADDED
@@ -0,0 +1,341 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Mostly copied from cite2vec paper_parsing.parse_nxml
3
+ """
4
+
5
+ from typing import List, Set, Dict, Callable
6
+
7
+ import os
8
+ import json
9
+ import re
10
+ import multiprocessing
11
+ from bs4 import BeautifulSoup
12
+ from tqdm import tqdm
13
+ from glob import glob
14
+ from pprint import pprint
15
+
16
+ from doc2json.utils.soup_utils import destroy_unimportant_tags_inplace
17
+ from doc2json.jats2json.pmc_utils.front_tag_utils import parse_journal_id_tag, parse_journal_name_tag, \
18
+ parse_title_tag, parse_category_tag, parse_date_tag, parse_doi_tag, parse_pmc_id_tag, parse_pubmed_id_tag, \
19
+ parse_authors, parse_affiliations, parse_abstract_tag, parse_funding_groups, NoAuthorNamesError
20
+ from doc2json.jats2json.pmc_utils.extract_utils import extract_fig_blobs, extract_table_blobs, extract_suppl_blobs
21
+ from doc2json.jats2json.pmc_utils.all_tag_utils import replace_xref_with_string_placeholders, \
22
+ replace_sup_sub_tags_with_string_placeholders, recurse_parse_section
23
+ from doc2json.jats2json.pmc_utils.all_tag_utils import parse_all_paragraphs_in_section
24
+ from doc2json.jats2json.pmc_utils.back_tag_utils import parse_bib_entries
25
+
26
+ from doc2json.s2orc import Paper
27
+
28
+
29
def process_front_tag(front_tag, soup) -> Dict:
    """
    Run the front-matter parsers over <front> and collect their results.

    :param front_tag: the <front> element
    :param soup: whole document soup, needed by the abstract parser
    :return: dict with title, abstract, authors, affiliations, journal and
        article identifiers, the parsed dates (under 'year') and funding groups
    """
    # <journal-meta> tags
    journal_id = parse_journal_id_tag(front_tag=front_tag)
    journal_name = parse_journal_name_tag(front_tag=front_tag)

    # <article-meta> tags
    title = parse_title_tag(front_tag=front_tag)

    # a front tag without author names yields an empty author list
    try:
        authors = parse_authors(front_tag=front_tag)
    except NoAuthorNamesError:
        authors = []
    affiliations = parse_affiliations(front_tag=front_tag)

    dates = parse_date_tag(front_tag=front_tag)

    pubmed_id = parse_pubmed_id_tag(front_tag=front_tag)
    pmc_id = parse_pmc_id_tag(front_tag=front_tag)
    doi = parse_doi_tag(front_tag=front_tag)

    abstract = parse_abstract_tag(front_tag=front_tag, soup=soup)

    # categories: str = parse_category_tag(front_tag=front_tag)

    funding_groups = parse_funding_groups(front_tag=front_tag)

    front_blob = {}
    front_blob['title'] = title
    front_blob['abstract'] = abstract
    front_blob['authors'] = authors
    front_blob['affiliations'] = affiliations
    front_blob['journal_id'] = journal_id
    front_blob['journal_name'] = journal_name
    front_blob['pubmed_id'] = pubmed_id
    front_blob['pmc_id'] = pmc_id
    front_blob['doi'] = doi
    front_blob['year'] = dates
    front_blob['funding_groups'] = funding_groups
    return front_blob
68
+
69
+
70
def process_body_tag(body_tag, soup) -> Dict:
    """
    Parse the <body> element into paragraph blobs.

    :param body_tag: the <body> element; xref/sup/sub tags are replaced in place
    :param soup: whole document soup, used to create placeholder strings
    :return: dict with a single key 'body_text' holding the paragraph blobs
    """
    # replace all xref tags with string placeholders
    replace_xref_with_string_placeholders(soup_tag=body_tag, soup=soup)

    # replace all sup/sub tags with string placeholders
    replace_sup_sub_tags_with_string_placeholders(soup_tag=body_tag, soup=soup)

    # some articles (like PMC2844102) have no sections
    sec_tags = body_tag.find_all('sec', recursive=False)

    # try looking in article tag; `body_tag.article` is None when there is no
    # such tag, so check explicitly instead of swallowing every exception
    if not sec_tags:
        article_tag = body_tag.article
        if article_tag is not None:
            sec_tags = article_tag.find_all('sec', recursive=False)

    if sec_tags:
        all_par_blobs = []
        for sec_tag in sec_tags:
            # note; most sections dont have this 'sec-type' attribute
            if sec_tag.get('sec-type') == 'supplementary-material':
                # hopefully all the important supplementary content already extracted above in previous step
                continue
            all_par_blobs.extend(recurse_parse_section(sec_tag=sec_tag))
    else:
        # no sections anywhere: parse the paragraphs directly under the body
        all_par_blobs = parse_all_paragraphs_in_section(body_tag)

    return {
        'body_text': all_par_blobs,
    }
103
+
104
+
105
def process_back_tag(back_tag) -> Dict:
    """
    Extract acknowledgement paragraphs and bibliography entries from <back>.

    :param back_tag: the <back> element
    :return: dict with keys 'acknowledgements' (one blob per <p> inside each
        <ack>) and 'bib_entries' (result of parse_bib_entries)
    """
    # glossary = {}
    # if back_tag.find('glossary'):
    #     for def_item_tag in back_tag.find('glossary').find_all('def-item'):
    #         glossary[def_item_tag.find('term').text] = def_item_tag.find('def').text

    # TODO: author contrib and COIs
    # notes = []
    # for notes_tag in back_tag.find_all('notes'):
    #     pass

    # TODO: PMC2778891 has back tag that looks like: <back><sec><title>Acknowledgements</title><p>Supported by the Austrian Science Fund (P-20670 and W11).</p></sec></back>
    #  that is, it doesn't have 'ack' section.
    acknowledgements: List[Dict] = []
    for ack_tag in back_tag.find_all('ack'):
        heading = ack_tag.find('title')
        section_name = None if heading is None else heading.text
        acknowledgements.extend(
            {
                'section': section_name,
                'text': paragraph.text,
                'funding_sources': [source.text for source in paragraph.find_all('funding-source')],
                'urls': [link.text for link in paragraph.find_all('ext-link')]
            }
            for paragraph in ack_tag.find_all('p')
        )

    bib_entries = parse_bib_entries(back_tag)

    return {
        'acknowledgements': acknowledgements,
        'bib_entries': bib_entries,
    }
135
+
136
+
137
def postprocess_front_tags_for_s2orc(init_front_dict: Dict):
    """
    Reshape authors and year of the front dict for the S2ORC format.

    The dict is modified in place and also returned: each author gets an
    'affiliation' entry built from its first affiliation id, the helper keys
    are removed, and 'year' is reduced to the first available date.
    """
    # Make authors in front tags look like S2ORC
    for author in init_front_dict['authors']:
        author['affiliation'] = {}
        affiliation_ids = author['affiliation_ids']
        if affiliation_ids:
            # only the first listed affiliation is used
            primary_id = affiliation_ids[0]
            institutions = [
                entry['text']
                for entry in init_front_dict['affiliations']
                if entry['id'] == primary_id
            ]
            if institutions:
                author['affiliation'] = {
                    'laboratory': "",
                    'institution': institutions[0],
                    'location': {}
                }
        for obsolete_key in ('affiliation_ids', 'corresponding', 'orcid'):
            del author[obsolete_key]
    del init_front_dict['affiliations']

    # Pick best year: first non-empty date in this order, else None
    dates = init_front_dict['year']
    preference = ('epub', 'accepted', 'collection', 'received')
    init_front_dict['year'] = next(
        (dates.get(kind) for kind in preference if dates.get(kind)),
        None
    )

    return init_front_dict
173
+
174
+
175
def convert_acks_to_s2orc(paragraphs: List) -> List[Dict]:
    """
    Turn acknowledgement blobs into S2ORC paragraphs, in place.

    Each blob gains empty 'cite_spans' and 'ref_spans' lists and loses its
    'funding_sources' and 'urls' keys. The same list object is returned.
    """
    for blob in paragraphs:
        blob.update(cite_spans=[], ref_spans=[])
        for dropped_key in ('funding_sources', 'urls'):
            del blob[dropped_key]
    return paragraphs
185
+
186
+
187
def convert_paragraphs_to_s2orc(paragraphs: List, old_to_new: Dict) -> List[Dict]:
    """
    Convert paragraph blobs into S2ORC format, in place.

    Sub/superscript spans are dropped, figure and table spans are merged into
    'ref_spans', and every span's 'ref_id' is remapped through old_to_new
    (ids missing from the mapping become None). The same list is returned.
    """
    # TODO: temp code to process body text into S2ORC format. this includes getting rid of sub/superscript spans.
    #   also combining fig & table spans into ref spans.
    #   also remapping the reference / bib labels to the new ones defined earlier in this function.
    #   temporarily, we cant support PMC xml parse bibs, so remove all links to the bibliography (cuz they'll be wrong)
    for blob in paragraphs:
        for script_key in ('sup_spans', 'sub_spans'):
            del blob[script_key]

        merged_spans = []
        blob['ref_spans'] = merged_spans
        for span in blob['fig_spans'] + blob['table_spans']:
            # replace old ref_id with new ref_id. default to None if null
            # optional, just wanted to check if this ever happens
            assert span['ref_id']
            span['ref_id'] = old_to_new.get(span['ref_id'])
            merged_spans.append(span)
        for merged_key in ('fig_spans', 'table_spans'):
            del blob[merged_key]

        for citation in blob['cite_spans']:
            # replace old cite ids with new cite ids. again default to None if null
            # optional, just wanted to check if this ever happens
            assert citation['ref_id']
            citation['ref_id'] = old_to_new.get(citation['ref_id'])
    return paragraphs
213
+
214
+
215
def convert_jats_xml_to_s2orc_json(jats_file: str, log_dir: str):
    """
    Convert JATS XML to S2ORC JSON
    :param jats_file: path to the JATS XML file; the paper id is the file name
        up to its first '.'
    :param log_dir: not used by this function
    :return: Paper assembled from the front, body and back parts of the document
    """
    # get file id (PMC id usually)
    file_id = jats_file.split('/')[-1].split('.')[0]

    # read JATS XML
    with open(jats_file, 'r') as f_in:
        soup = BeautifulSoup(f_in, 'lxml')
        destroy_unimportant_tags_inplace(soup, tags_to_remove=['bold', 'italic', 'graphic'])

    # all the XML files have their own wonky reference IDs. we want to standardize them, but need to remember the old->new mapping
    old_key_to_new_key = {}

    # REFERENCES
    table_blobs = extract_table_blobs(soup)
    figure_blobs = extract_fig_blobs(soup)
    # TODO: not current represented in S2ORC, keep for later
    suppl_blobs = extract_suppl_blobs(soup)
    # TODO: for S2ORC, need to process them into a single ref dict. need to construct new IDs to match ID conventions. and update all cite spans.
    #   also, S2ORC table captions are free text without detected reference/citation mentions
    # TODO: may want to keep table representations around
    ref_entries = {}
    for i, (old_table_key, table_blob) in enumerate(sorted(table_blobs.items())):
        # TODO: PMC2557072 table `tbl5` has no label. skip.
        # TODO: PMC3137981 table `tab1` has no caption text. skip.
        if not table_blob['label'] or not table_blob['caption']:
            continue
        table_text = table_blob['label'] + ': ' + ' '.join(
            [c['text'] for c in table_blob['caption']]
        ) + '\n' + ' '.join([f['text'] for f in table_blob['footnote']])
        new_table_key = f'TABREF{i}'
        old_key_to_new_key[old_table_key] = new_table_key
        # TODO: skipping over any citations or references in the table for now
        # Bind the content for every table: without the explicit None, a table
        # lacking xml would reuse the previous table's content or hit an
        # unbound name on the first iteration.
        table_content = table_blob['xml'][0]['text'] if table_blob['xml'] else None
        ref_entries[new_table_key] = {'text': table_text, 'content': table_content, 'type': 'table'}
    for i, (old_figure_key, figure_blob) in enumerate(sorted(figure_blobs.items())):
        # TODO: double-check, but it seems like figure blobs dont have footnotes parsed out? might be bug
        # TODO: PMC1326260 first figure has no ['label']. just skip these for now (because no inline references)
        # TODO: PMC2403743 has null-valued caption in `fig1`. also skip here. fix later.
        if not figure_blob['label'] or not figure_blob['caption']:
            continue
        figure_text = figure_blob['label'] + ': ' + ' '.join([c['text'] for c in figure_blob['caption']])
        new_figure_key = f'FIGREF{i}'
        old_key_to_new_key[old_figure_key] = new_figure_key
        ref_entries[new_figure_key] = {'text': figure_text, 'type': 'figure'}

    # FRONT TAGS
    # NOTE: the abstract is remapped before bibliography keys are added to
    # old_key_to_new_key below, so this ordering is kept as in the original.
    front_tag = soup.find('front').extract()
    front_dict = process_front_tag(front_tag=front_tag, soup=soup)
    front_dict = postprocess_front_tags_for_s2orc(front_dict)
    front_dict['abstract'] = convert_paragraphs_to_s2orc(front_dict['abstract'], old_key_to_new_key)

    # BACK TAGS
    back_tag = soup.find('back')
    back_dict = {}
    bib_entries = {}
    # PMC1139917 doesnt have 'back' tag
    if back_tag is not None:
        back_dict = process_back_tag(back_tag=back_tag)
        # TODO: format bib entries to S2ORC format. we're already very close, but need a couple changes:
        #   - author blobs include a 'suffix' which defaults to empty string
        #   - issn defaults to empty string
        #   - rename all the bib IDs
        for i, (old_bib_key, bib_entry) in enumerate(sorted(back_dict['bib_entries'].items())):
            del bib_entry['ref_id']
            new_bib_key = f'BIBREF{i}'
            old_key_to_new_key[old_bib_key] = new_bib_key
            bib_entries[new_bib_key] = bib_entry

    if back_dict and back_dict.get('acknowledgements'):
        back_dict['acknowledgements'] = convert_acks_to_s2orc(back_dict['acknowledgements'])

    # BODY TAGS
    body_tag = soup.find('body')
    # PMC1240684 doesnt have 'body' tag
    if body_tag is not None:
        body_dict = process_body_tag(body_tag=body_tag, soup=soup)
        body_text = body_dict['body_text']
    else:
        # Has no body: /disk2/gorpus/20200101/pmc/Br_Foreign_Med_Chir_Rev/PMC5163425.nxml
        body_text = []

    body_text = convert_paragraphs_to_s2orc(body_text, old_key_to_new_key)

    metadata = {
        "title": front_dict['title'],
        "authors": front_dict['authors'],
        "year": front_dict['year'],
        "venue": front_dict['journal_name'],
        "identifiers": {
            "doi": front_dict['doi'],
            "pubmed_id": front_dict['pubmed_id'],
            "pmc_id": front_dict['pmc_id']
        }
    }

    return Paper(
        paper_id=file_id,
        pdf_hash="",
        metadata=metadata,
        abstract=front_dict['abstract'],
        body_text=body_text,
        back_matter=back_dict.get('acknowledgements', []),
        bib_entries=bib_entries,
        ref_entries=ref_entries
    )
329
+
330
+
331
if __name__ == '__main__':
    # smoke run over the bundled JATS samples
    for jats_file in (
        'tests/jats/PMC5828200.nxml',
        'tests/jats/PMC6398430.nxml',
        'tests/jats/PMC7417471.nxml',
    ):
        paper = convert_jats_xml_to_s2orc_json(jats_file, 'logs')

    print('done.')
s2orc-doc2json/doc2json/jats2json/pmc_utils/__init__.py ADDED
File without changes
s2orc-doc2json/doc2json/jats2json/pmc_utils/all_tag_utils.py ADDED
@@ -0,0 +1,300 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, List, Callable
2
+
3
+ import re
4
+ import itertools
5
+
6
+ from bs4 import BeautifulSoup
7
+
8
+ START_TOKENS = {"#!start#", "@!start@", "&!start&"}
9
+ SEP_TOKENS = {"#!sep#"}
10
+ END_TOKENS = {"#!end#", "@!end@", "&!end&"}
11
+ ALL_TOKENS = START_TOKENS | SEP_TOKENS | END_TOKENS
12
+
13
+
14
def replace_xref_with_string_placeholders(soup_tag, soup):
    """Swap every ``<xref>`` under ``soup_tag`` for an inline text placeholder.

    The placeholder has the form
    ``#!start#<text>#!sep#<rid>#!sep#<ref-type>#!end#``. A missing ``rid`` or
    ``ref-type`` attribute is rendered as the text ``None``.

    :param soup_tag: tag whose ``<xref>`` descendants are replaced in place
    :param soup: object providing ``new_string`` to build the replacement node
    """
    for xref in soup_tag.find_all("xref"):
        attributes = xref.attrs
        rid = attributes['rid'] if 'rid' in attributes else None
        ref_type = attributes['ref-type'] if 'ref-type' in attributes else None
        placeholder = f"#!start#{xref.text}#!sep#{rid}#!sep#{ref_type}#!end#"
        xref.replace_with(soup.new_string(placeholder))
24
+
25
+
26
def replace_sup_sub_tags_with_string_placeholders(soup_tag, soup):
    """Swap every ``<sup>``/``<sub>`` under ``soup_tag`` for a text placeholder.

    Superscripts become ``@!start@<text>@!end@`` and subscripts become
    ``&!start&<text>&!end&``. All superscripts are handled before any
    subscript is looked up.

    :param soup_tag: tag whose ``<sup>``/``<sub>`` descendants are replaced
    :param soup: object providing ``new_string`` to build the replacement node
    """
    wrappers = (
        ("sup", lambda text: f"@!start@{text}@!end@"),
        ("sub", lambda text: f"&!start&{text}&!end&"),
    )
    for tag_name, wrap in wrappers:
        for script_tag in soup_tag.find_all(tag_name):
            script_tag.replace_with(soup.new_string(wrap(script_tag.text)))
32
+
33
+
34
def recurse_parse_section(
        sec_tag,
) -> List[Dict]:
    """Recursively collect paragraph blobs from a (possibly nested) section.

    A section without direct ``<sec>`` children is handed to
    ``parse_all_paragraphs_in_section``. Otherwise each direct subsection is
    processed recursively and every returned blob gets this section's title
    appended to its ``'section'`` value, so blobs end up looking like::

        {
            'text': ...,
            ...,
            'section': SUBSUBSECTION_NAME :: SUBSECTION_NAME :: SECTION_NAME
        }

    :param sec_tag: the section tag to process
    :return: paragraph blobs for all leaf sections below ``sec_tag``
    """
    subsections = sec_tag.find_all("sec", recursive=False)
    if not subsections:
        return parse_all_paragraphs_in_section(sec_tag=sec_tag)

    # Look the title up once. find() returns None when no title exists below
    # this section; fall back to an empty name instead of raising.
    title_tag = sec_tag.find("title")
    parent_title = title_tag.text if title_tag is not None else ""

    # NOTE(review): when a section has subsections, only the subsections are
    # visited; paragraphs that are direct children of `sec_tag` are not
    # collected here.
    outputs = []
    for child in subsections:
        child_blobs = recurse_parse_section(sec_tag=child)
        for blob in child_blobs:
            # PMC373254 - remove any span markers left in the section name
            section = blob["section"]
            for marker in ALL_TOKENS:
                section = section.replace(marker, "")
            blob["section"] = section + " :: " + parent_title
        outputs.extend(child_blobs)
    return outputs
63
+
64
+
65
+ def _reduce_args(stack: List, end_token: str) -> List[List]:
66
+ """Helper function for `_parse_all_paragraphs_in_section`.
67
+
68
+ Pop arguments for the xref off the top of the stack and return a list of argument lists,
69
+ where the outer lists represent groups divided by separators."""
70
+ start_token = end_token.replace('end', 'start')
71
+ sep_token = end_token.replace('end', 'sep')
72
+ args = [[]]
73
+ while True:
74
+ token = stack.pop()
75
+ if token == start_token:
76
+ return args
77
+ elif token == sep_token:
78
+ args.insert(0, [])
79
+ else:
80
+ args[0].insert(0, token)
81
+
82
+
83
+ def _add_spans(
84
+ end_token: str,
85
+ start_pos: int,
86
+ text: str,
87
+ ref_id,
88
+ ref_type,
89
+ cite_spans: List,
90
+ fig_spans: List,
91
+ table_spans: List,
92
+ sup_spans: List,
93
+ sub_spans: List,
94
+ ):
95
+ """Helper function used by `_parse_all_paragraphs_in_section`."""
96
+ if end_token.startswith("#"): # process xref
97
+ blob = {
98
+ "start": start_pos,
99
+ "end": start_pos + len(text),
100
+ "mention": text,
101
+ "ref_id": ref_id,
102
+ }
103
+ if ref_type == "bibr":
104
+ cite_spans.append(blob)
105
+ elif ref_type == "fig":
106
+ fig_spans.append(blob)
107
+ elif ref_type == "table":
108
+ table_spans.append(blob)
109
+
110
+ else:
111
+ blob = {
112
+ "start": start_pos,
113
+ "end": start_pos + len(text),
114
+ "mention": text,
115
+ }
116
+ if end_token.startswith("@"):
117
+ sup_spans.append(blob)
118
+ else:
119
+ assert end_token.startswith("&")
120
+ sub_spans.append(blob)
121
+
122
+
123
def get_latex_from_formula(
        formula_tag
):
    """Return the LaTeX body of a formula's ``<tex-math>`` child, if any.

    The text between ``\\begin{document}`` and ``\\end{document}`` is returned
    with surrounding whitespace and ``$`` delimiters removed.

    :param formula_tag: formula tag to inspect
    :return: the LaTeX string, or ``None`` when there is no ``<tex-math>``
        child or it holds no ``document`` environment
    """
    tex_tag = formula_tag.find('tex-math')
    if tex_tag is None:
        return None
    # DOTALL: the document body may span several lines.
    match = re.search(
        r'\\begin\{document\}(.+)\\end\{document\}', tex_tag.text, re.DOTALL
    )
    if match is None:
        return None
    # Trim whitespace first, otherwise it shields the `$` delimiters.
    return match.group(1).strip().strip('$').strip()
132
+
133
+
134
def get_mathml_from_formula(
        formula_tag
):
    """Return the serialized ``<mml:math>`` child of ``formula_tag``.

    :return: ``str()`` of the first ``mml:math`` tag found, or ``None`` when
        the lookup yields nothing truthy
    """
    mathml_tag = formula_tag.find('mml:math')
    return str(mathml_tag) if mathml_tag else None
140
+
141
+
142
def parse_formulas(
        para_el,
        sp,
        replace
):
    """Record every ``<inline-formula>`` in a paragraph and replace it with text.

    Formulas are keyed ``INLINEFORM0``, ``INLINEFORM1``, ... in document
    order; the index advances even when a formula is skipped.

    :param para_el: paragraph tag whose inline formulas are processed in place
    :param sp: object providing ``new_string`` to build replacement nodes
    :param replace: when truthy, each formula is replaced by its key;
        otherwise it is replaced by its text, unless that text is the
        ``"FORMULA"`` fallback, in which case the tag is left untouched
    :return: dict mapping formula key to a
        ``(text, latex, mathml, id attribute)`` tuple
    """
    # sub and get corresponding spans of inline formulas
    formula_dict = {}
    for eq_ind, ftag in enumerate(para_el.find_all('inline-formula')):
        formula_key = f'INLINEFORM{eq_ind}'
        try:
            math_tag = ftag.find('mml:math')
            if math_tag is not None:
                formula_text = math_tag.text
            elif 'begin{document}' not in ftag.text:
                formula_text = ftag.text
            else:
                # Only a LaTeX document is available; use a generic stand-in.
                formula_text = "FORMULA"

            formula_latex = get_latex_from_formula(ftag)
            formula_mathml = get_mathml_from_formula(ftag)
            if not formula_mathml and formula_latex:
                # Imported here because this module never imports latex2mathml
                # at top level; without it this branch raised NameError.
                import latex2mathml.converter
                formula_mathml = latex2mathml.converter.convert(formula_latex)

            formula_dict[formula_key] = (
                formula_text, formula_latex, formula_mathml, ftag.get('id')
            )
            if replace:
                ftag.replace_with(sp.new_string(f" {formula_key} "))
            elif formula_text != 'FORMULA':
                # replace with mathml text if available
                ftag.replace_with(sp.new_string(f" {formula_text} "))
        except AttributeError:
            # Best effort: a formula that cannot be inspected is skipped.
            continue

    return formula_dict
176
+
177
+
178
def parse_all_paragraphs_in_section(
        sec_tag,
        par_to_text: Callable = None,
        replace_formula=True
) -> List[Dict]:
    """Convert every ``<p>`` under ``sec_tag`` into a paragraph blob.

    Internal function. Assumes section has no nested tags.

    Each paragraph's text is expected to contain the string placeholders
    written by ``replace_xref_with_string_placeholders`` and
    ``replace_sup_sub_tags_with_string_placeholders``. The placeholders are
    removed from the text and turned into spans whose offsets refer to the
    cleaned text.

    :param sec_tag: tag whose ``<p>`` descendants are processed
    :param par_to_text: optional function converting a paragraph tag into a
        string; by default ``par_tag.text`` is used
    :param replace_formula: forwarded to ``parse_formulas`` as ``replace``
    :return: one dict per paragraph with keys ``text``, ``cite_spans``,
        ``fig_spans``, ``table_spans``, ``sup_spans``, ``sub_spans``,
        ``eq_spans`` and ``section`` (text of the first ``<title>`` found
        under ``sec_tag``, or ``""``)
    :raises NotImplementedError: if a paragraph contains a
        ``display-formula`` or ``formula`` tag
    """
    # The placeholder tokenizer is identical for every paragraph.
    placeholder_re = re.compile(
        r"(#!start#)|(#!sep#)|(#!end#)|(@!start@)|(@!end@)|(&!start&)|(&!end&)"
    )

    outputs = []
    sp = BeautifulSoup('', 'lxml')
    for par_tag in sec_tag.find_all("p", recursive=True):
        cite_spans = []
        fig_spans = []
        table_spans = []
        sup_spans = []
        sub_spans = []

        if par_tag.find('display-formula'):
            raise NotImplementedError('Display formula!')

        if par_tag.find('formula'):
            raise NotImplementedError('Formula!')

        formula_dict = parse_formulas(par_tag, sp, replace_formula)

        par_text = par_to_text(par_tag) if par_to_text else par_tag.text
        # replaces whitespace but not newline or tab
        par_text = re.sub(r"[^\S\n\t]", " ", par_text)
        # replaces two spaces w/ one (single pass, pair by pair)
        par_text = re.sub(r" {2}", " ", par_text)

        # Tokenize the text into normal text and special placeholder tokens.
        # re.split yields None/"" for groups that did not match; drop those.
        tokens = [tok for tok in placeholder_re.split(par_text) if tok]

        # To handle nested structures, use a shift-reduce algorithm to consume
        # the text. Placeholder tags are merged away, and related spans are
        # registered.
        stack = []
        text_parts = []
        pos = 0
        disable_count = False
        for token in tokens:
            if token in START_TOKENS:
                # Push the start marker, the offset where the mention begins,
                # and a separator so the mention text forms its own group.
                stack.append(token)
                stack.append(pos)
                stack.append(token.replace('start', 'sep'))
            elif token in SEP_TOKENS:
                assert stack
                stack.append(token)
                # What follows is xref metadata, not visible text.
                disable_count = True
            elif token in END_TOKENS:
                assert stack
                disable_count = False
                args = _reduce_args(stack, token)
                start_pos = args[0][0]
                text = "".join(args[1])
                assert len(args) == 2 or len(args) == 4
                if len(args) == 2:
                    ref_id, ref_type = None, None
                elif len(args) == 4:
                    ref_id = args[2] and args[2][0]
                    ref_type = args[3] and args[3][0]
                # Push the mention back so an enclosing placeholder sees it.
                stack.append(text)
                _add_spans(
                    token,
                    start_pos,
                    text,
                    ref_id,
                    ref_type,
                    cite_spans,
                    fig_spans,
                    table_spans,
                    sup_spans,
                    sub_spans,
                )
            else:  # just normal text
                stack.append(token)
                if not disable_count:  # skip metadata appearing after a separator
                    text_parts.append(token)
                    pos += len(token)

        full_text = "".join(text_parts)
        assert pos == len(full_text)

        title = sec_tag.find("title")
        title = title.text if title else ""

        # get all equation spans
        eq_spans = []
        for span in itertools.chain(
                re.finditer(r'(INLINEFORM\d+)', full_text),
                re.finditer(r'(DISPLAYFORM\d+)', full_text)
        ):
            matching_formula = formula_dict.get(span.group())
            if matching_formula is None:
                # Looks like a formula key but was not produced for this paragraph.
                continue
            eq_spans.append({
                "start": span.start(),
                "end": span.start() + len(span.group()),
                "text": matching_formula[0],
                "latex": matching_formula[1],
                "mathml": matching_formula[2],
                "ref_id": span.group()
            })

        outputs.append(
            {
                "text": full_text,
                'cite_spans': cite_spans,
                'fig_spans': fig_spans,
                'table_spans': table_spans,
                'sup_spans': sup_spans,
                'sub_spans': sub_spans,
                'eq_spans': eq_spans,
                "section": title,
            }
        )
    return outputs
s2orc-doc2json/doc2json/jats2json/pmc_utils/back_tag_utils.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, List
2
+
3
+
4
+ def _wrap_text(tag):
5
+ return tag.text if tag else ''
6
+
7
+
8
def parse_authors(authors_tag) -> List:
    """Parse the authors of a bibliography entry.

    The PMC XML has a slightly different format than authors listed in front tag.

    :param authors_tag: tag holding the reference's author names as direct
        ``<name>`` children, or a falsy value when the reference has none
    :return: one dict per ``<name>`` with keys ``first``, ``middle`` (list),
        ``last`` and ``suffix``; missing parts are ``''`` / ``[]``
    """
    if not authors_tag:
        return []

    authors = []
    for name_tag in authors_tag.find_all('name', recursive=False):
        surname = name_tag.find('surname')
        given_names_tag = name_tag.find('given-names')
        suffix = name_tag.find('suffix')
        # split() without an argument collapses runs of whitespace (including
        # newlines), so no empty name parts are produced.
        given_names = given_names_tag.text.split() if given_names_tag else []
        authors.append({
            'first': given_names[0] if given_names else '',
            'middle': given_names[1:],
            'last': surname.text if surname else '',
            'suffix': suffix.text if suffix else ''
        })
    return authors
26
+
27
+
28
def parse_bib_entries(back_tag) -> Dict:
    """Parse the reference list of a ``<back>`` tag into bibliography entries.

    :param back_tag: the article's back-matter tag
    :return: dict keyed by each ``<ref>``'s ``id`` attribute; empty when the
        back matter has no ``<ref-list>``
    """
    bib_entries = {}
    # TODO: PMC2778891 does not have 'ref-list' in its back_tag. do we even need this, or can directly .find_all('ref')?
    ref_list_tag = back_tag.find('ref-list')
    if not ref_list_tag:
        return bib_entries

    for ref_tag in ref_list_tag.find_all('ref'):
        # The ref ID and label are semantically swapped between CORD-19 and PMC
        ref_label = ref_tag['id']
        ref_id = ref_tag.find('label')
        authors_tag = ref_tag.find('person-group', {'person-group-type': 'author'})

        year = ref_tag.find('year')
        # Tolerate whitespace around the year; non-numeric years become None.
        year_text = year.text.strip() if year else ''

        fpage = ref_tag.find('fpage')
        lpage = ref_tag.find('lpage')
        if fpage and lpage:
            pages = f'{fpage.text}-{lpage.text}'
        elif fpage:
            # Single-page reference: keep the page that is available.
            pages = fpage.text
        else:
            pages = None

        dois = [tag.text for tag in ref_tag.find_all('pub-id', {'pub-id-type': 'doi'})]
        bib_entries[ref_label] = {
            'ref_id': _wrap_text(ref_id),
            'title': _wrap_text(ref_tag.find('article-title')),
            'authors': parse_authors(authors_tag),
            'year': int(year_text) if year_text.isdigit() else None,
            'venue': _wrap_text(ref_tag.find('source')),
            'volume': _wrap_text(ref_tag.find('volume')),
            # NOTE(review): the key is 'issn' but the value comes from the
            # <issue> tag; kept as-is because callers read this key.
            'issn': _wrap_text(ref_tag.find('issue')),
            'pages': pages,
            'other_ids': {
                'DOI': dois,
            }
        }
    return bib_entries
s2orc-doc2json/doc2json/jats2json/pmc_utils/extract_utils.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from typing import Dict
3
+
4
+ import bs4
5
+ from bs4 import BeautifulSoup
6
+
7
+ from doc2json.jats2json.pmc_utils.all_tag_utils import parse_all_paragraphs_in_section
8
+
9
+
10
def extract_fig_blobs(body_tag) -> Dict:
    """Remove every ``<fig>`` from ``body_tag`` and return their label/caption.

    Every figure is extracted from the tree. Only figures with an ``id``
    attribute are returned, mirroring how ``extract_table_blobs`` treats
    tables; previously a figure without an ``id`` raised ``KeyError``.

    :param body_tag: tag to search; modified in place
    :return: dict keyed by figure id with ``label`` and ``caption`` entries,
        after ``_update_fig_blobs`` has processed the captions
    """
    fig_blobs = {}
    for fig_tag in body_tag.find_all('fig'):
        fig = fig_tag.extract()
        fig_id = fig.get('id')
        if not fig_id:
            # Cannot be referenced from the text, so there is nothing to key on.
            continue
        label = fig.find('label')
        fig_blobs[fig_id] = {
            'label': label and label.text,
            'caption': fig.find('caption')
        }
    _update_fig_blobs(fig_blobs)
    return fig_blobs
21
+
22
+
23
def _update_fig_blobs(fig_blobs: Dict):
    """Replace each figure's caption tag with parsed paragraph blobs, in place.

    Entries whose caption is ``None`` are left untouched. The ``'section'``
    key is removed from every resulting paragraph blob.
    """
    for entry in fig_blobs.values():
        caption = entry['caption']
        if caption is None:
            continue

        # replace non-p tags w/ p tags in figure caption (mostly dealing with
        # title tags, which weren't being extracted before)
        for child in caption:
            if type(child) is bs4.element.Tag and child.name != 'p':
                child.name = 'p'

        paragraphs = parse_all_paragraphs_in_section(sec_tag=caption, replace_formula=False)
        for paragraph in paragraphs:
            del paragraph['section']
        entry['caption'] = paragraphs
35
+
36
+
37
def extract_table_blobs(body_tag) -> Dict:
    """Remove every ``<table-wrap>`` from ``body_tag`` and return their parts.

    :param body_tag: tag to search; modified in place
    :return: dict keyed by table id with ``label``, ``caption``, ``footnote``
        and ``xml`` entries, after ``_update_table_blobs`` has processed them
    """
    # note 1: footnotes dont always exist for each table; hence the fallback
    # note 2: we want to preserve the XML tags for tables, but also need to run it through the regex cleaner for xrefs and other spans
    #         hence, wrapping all of the table XML text into a fake <p> paragraph tag
    table_blobs = {}
    for table_tag in body_tag.find_all('table-wrap'):
        table = table_tag.extract()
        # TODO: currently restricting to tables with identifiers. might want to include unreferenced tables once we care more.
        table_id = table.get('id')
        if not table_id:
            continue

        label = table.find('label')
        footnote = table.find('table-wrap-foot')
        inner_table = table.find('table')
        # A table-wrap may hold no <table>; str(None) would otherwise put the
        # literal text "None" into the table XML.
        table_xml = str(inner_table) if inner_table is not None else ''

        table_blobs[table_id] = {
            'label': label and label.text,
            'caption': table.find('caption'),
            'footnote': footnote if footnote else BeautifulSoup('<p></p>', 'xml'),
            'xml': BeautifulSoup('<p>' + table_xml + '</p>', 'xml')
        }
    _update_table_blobs(table_blobs)
    return table_blobs
55
+
56
+
57
def _update_table_blobs(table_blobs: Dict):
    """Replace each table's caption, footnote and xml tags with paragraph blobs.

    Works in place. Parts that are ``None`` are left untouched, and the
    ``'section'`` key is removed from every resulting paragraph blob.
    """
    def _paragraphs_without_section(container, **parse_kwargs):
        # Parse the container's paragraphs and drop the section key from each.
        paragraphs = parse_all_paragraphs_in_section(
            sec_tag=container, replace_formula=False, **parse_kwargs
        )
        for paragraph in paragraphs:
            del paragraph['section']
        return paragraphs

    for entry in table_blobs.values():
        caption = entry['caption']
        if caption is not None:
            # replace non-p tags w/ p tags in table caption (mostly dealing
            # with title tags, which weren't being extracted before)
            for child in caption:
                if type(child) is bs4.element.Tag and child.name != 'p':
                    child.name = 'p'
            entry['caption'] = _paragraphs_without_section(caption)

        footnote = entry['footnote']
        if footnote is not None:
            entry['footnote'] = _paragraphs_without_section(footnote)

        # note: if we dont include `par_to_text` function, the parser will convert all <p> tags to text via `par_tag.text`
        #       which actually removes all XML tags we wanted to preserve in table.
        #       by passing in str(), we ensure to keep all of those tags
        table_xml = entry['xml']
        if table_xml is not None:
            entry['xml'] = _paragraphs_without_section(table_xml, par_to_text=str)
81
+
82
+
83
def extract_suppl_blobs(body_tag) -> Dict:
    """Remove every ``<supplementary-material>`` from ``body_tag``.

    :param body_tag: tag to search; modified in place
    :return: dict keyed by supplementary-material id with ``label`` and
        ``caption`` entries, after ``_update_suppl_blobs`` has processed them
    """
    suppl_blobs = {}
    for suppl_tag in body_tag.find_all('supplementary-material'):
        suppl = suppl_tag.extract()
        # We only care about supplementary material that can be referenced (like figures/tables)
        # for example, we dont care about PMC1139917 which has supplementary material but without an ID
        #
        # The attribute is read with .get(): `'id' in suppl` tests the tag's
        # children, not its attributes, so it never matched an id attribute.
        suppl_id = suppl.get('id')
        if not suppl_id:
            continue
        label = suppl.find('label')
        suppl_blobs[suppl_id] = {
            'label': label and label.text,
            'caption': suppl.find('caption')
        }
    _update_suppl_blobs(suppl_blobs)
    return suppl_blobs
97
+
98
+
99
def _update_suppl_blobs(suppl_blobs: Dict):
    """Replace each supplementary caption tag with paragraph blobs, in place.

    Entries whose caption is ``None`` are left untouched. The ``'section'``
    key is removed from every resulting paragraph blob.
    """
    for entry in suppl_blobs.values():
        caption = entry['caption']
        if caption is not None:
            paragraphs = parse_all_paragraphs_in_section(sec_tag=caption)
            for paragraph in paragraphs:
                del paragraph['section']
            entry['caption'] = paragraphs
s2orc-doc2json/doc2json/jats2json/pmc_utils/front_tag_utils.py ADDED
@@ -0,0 +1,381 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+
3
+ Functions for parsing specific `front_tag` soup tags
4
+
5
+ """
6
+
7
+ from typing import Dict, List, Optional
8
+
9
+ from collections import Counter
10
+
11
+ import re
12
+
13
+
14
+ from doc2json.jats2json.pmc_utils.all_tag_utils import recurse_parse_section, parse_all_paragraphs_in_section, \
15
+ replace_sup_sub_tags_with_string_placeholders, replace_xref_with_string_placeholders
16
+
17
+
18
class NoAuthorNamesError(Exception):
    """Raised when an author's ``<name>`` tag has no ``given-names`` entry.

    Known papers that trigger:
        - PMC3462967
    """
23
+
24
+
25
def parse_journal_id_tag(front_tag) -> str:
    """Return the most frequent ``<journal-id>`` text and remove those tags.

    front_tag.find_all('journal-id') returns, for example:
        [
            <journal-id journal-id-type="nlm-ta">Neurosci J</journal-id>,
            <journal-id journal-id-type="iso-abbrev">Neurosci J</journal-id>,
            <journal-id journal-id-type="publisher-id">NEUROSCIENCE</journal-id>
        ]
        [
            <journal-id journal-id-type="nlm-ta">BMC Biochem</journal-id>
            <journal-id journal-id-type="iso-abbrev">BMC Biochem</journal-id>
        ]

    :param front_tag: the article's front-matter tag; its ``journal-id``
        tags are decomposed
    :return: the most common journal id text, or ``''`` when the front
        matter has no ``journal-id`` tag (previously an ``IndexError``)
    """
    counts = Counter()
    for tag in front_tag.find_all('journal-id'):
        counts[tag.text] += 1
        tag.decompose()
    if not counts:
        return ''
    journal_id, _ = counts.most_common(1)[0]
    return journal_id
44
+
45
+
46
def parse_journal_name_tag(front_tag) -> str:
    """Extract the ``<journal-title>`` tag and return its text.

    Examples:
        # Paper 1
        <journal-title-group>
            <journal-title>BMC Biochemistry</journal-title>
        </journal-title-group>
        # Paper 2
        <journal-title-group>
            <journal-title>Neuroscience Journal</journal-title>
        </journal-title-group>

    But not all titles are contained within a `journal-title-group`. See PMC1079901
        <journal-meta>
            <journal-id journal-id-type="nlm-ta">
                Biomed Eng Online
            </journal-id>
            <journal-title>
                BioMedical Engineering OnLine
            </journal-title>
            ...

    :param front_tag: the article's front-matter tag; the title tag is
        removed from it
    :return: the journal title, or ``''`` when there is no
        ``journal-title`` tag (previously an ``AttributeError``)
    :raises Exception: if more than one ``journal-title`` tag is present
    """
    title_tags = front_tag.find_all('journal-title')
    if len(title_tags) > 1:
        raise Exception('Multiple journal titles?!')
    if not title_tags:
        return ''
    return title_tags[0].extract().text
71
+
72
+
73
def parse_pubmed_id_tag(front_tag) -> Optional[str]:
    """Extract the PMID ``<article-id>`` and return its text.

    Not every PMC paper has a PMID; ``None`` is returned in that case.
    """
    pmid_tag = front_tag.find('article-id', {'pub-id-type': 'pmid'})
    return None if pmid_tag is None else pmid_tag.extract().text
80
+
81
+
82
def parse_pmc_id_tag(front_tag) -> str:
    """Extract the PMC ``<article-id>`` and return its text prefixed with ``PMC``."""
    pmc_tag = front_tag.find('article-id', {'pub-id-type': 'pmc'})
    pmc_number = pmc_tag.extract().text
    return f"PMC{pmc_number}"
84
+
85
+
86
def parse_doi_tag(front_tag) -> Optional[str]:
    """Extract the DOI ``<article-id>`` and return its text.

    Not all papers have a DOI; ``None`` is returned in that case.
    """
    doi_tag = front_tag.find('article-id', {'pub-id-type': 'doi'})
    if doi_tag is None:
        return None
    return doi_tag.extract().text
93
+
94
+
95
def parse_title_tag(front_tag) -> str:
    """Extract the ``<title-group>`` and return its article title text.

    Examples:
        # Paper 1
        <title-group>
            <article-title>Role of the highly conserved G68 residue in the yeast phosphorelay protein Ypd1: implications for interactions between histidine phosphotransfer (HPt) and response regulator proteins</article-title>
        </title-group>
        # Paper 2
        <title-group>
            <article-title>Association of Strength and Physical Functions in People with Parkinson's Disease</article-title>
        </title-group>

    The lookup is restricted to `title-group` because the title sometimes
    also shows up in <notes> under self-citation.

    Raises ``Exception`` when the group holds more than one article title.
    """
    title_group = front_tag.find('title-group').extract()
    title_count = len(title_group.find_all('article-title'))
    if title_count > 1:
        raise Exception('Multiple article titles?!')
    article_title = title_group.find('article-title')
    return article_title.text
113
+
114
+
115
def parse_category_tag(front_tag) -> str:
    """Extract ``<article-categories>`` and return its subject text.

    Examples:
        # Paper 1
        <article-categories>
            <subj-group subj-group-type="heading">
                <subject>Research Article</subject>
            </subj-group>
        </article-categories>
        # Paper 2
        <article-categories>
            <subj-group subj-group-type="heading">
                <subject>Research Article</subject>
            </subj-group>
        </article-categories>

    :param front_tag: the article's front-matter tag; the categories tag is
        removed from it
    :return: the text of the single ``<subject>`` (a string, which is what
        the return annotation now states)
    :raises Exception: if more than one ``subj-group`` or ``subject`` exists
    """
    subject_groups = front_tag.find_all('subj-group')
    subjects = front_tag.find_all('subject')
    if len(subject_groups) > 1 or len(subjects) > 1:
        raise Exception('Multiple categories?!')
    article_categories = front_tag.find('article-categories').extract()
    return article_categories.find('subject').text
135
+
136
+
137
def parse_date_tag(front_tag) -> Dict:
    """Collect publication and history dates, removing their tags.

    Two sets of tags contain dates:
        <pub-date pub-type="collection">
            <year>2018</year>
        </pub-date>
        <pub-date pub-type="epub">
            <day>12</day>
            <month>12</month>
            <year>2018</year>
        </pub-date>
    And:
        <history>
            <date date-type="received">
                <day>15</day>
                <month>10</month>
                <year>2018</year>
            </date>
            <date date-type="accepted">
                <day>26</day>
                <month>11</month>
                <year>2018</year>
            </date>
        </history>

    PMC2557072 has `date` tag with no `day`, only `year` and `month`.

    Returns a dict mapping the tag's type attribute (or a ``MISSING_*``
    placeholder) to the available year/month/day parts joined with ``-``.
    All ``pub-date`` tags are handled before ``date`` tags are looked up.
    """
    sources = (
        ('pub-date', 'pub-type', 'MISSING_PUB_TYPE'),
        ('date', 'date-type', 'MISSING_DATE_TYPE'),
    )
    out = {}
    for tag_name, type_attr, fallback_key in sources:
        for date_tag in front_tag.find_all(tag_name):
            parts = [date_tag.find(part) for part in ('year', 'month', 'day')]
            joined = '-'.join([part.text for part in parts if part is not None])
            out[date_tag.get(type_attr, fallback_key)] = joined
            date_tag.decompose()
    return out
183
+
184
+
185
def parse_funding_groups(front_tag) -> List[Dict]:
    """Parse funding information from the front matter.

    Every ``funding-source`` / ``funding-statement`` tag that does not itself
    contain another such tag yields one dict.

    :param front_tag: the article's front-matter tag; institution and
        named-content children of funding tags are extracted from it
    :return: list of dicts with keys ``name``, ``doi`` and ``notes``
        (each ``None`` when unavailable)
    """
    outs = []
    for tag in front_tag.find_all():
        if tag.name not in ('funding-source', 'funding-statement'):
            continue
        # skip cases where the two tag types nest within each other; we only process the inner one
        if tag.find('funding-source') is not None or tag.find('funding-statement') is not None:
            continue

        out = {
            'name': None,
            'doi': None,
            'notes': None,
        }

        # handle institution
        institution_id_tag = tag.find('institution-id')
        if institution_id_tag:
            out['doi'] = institution_id_tag.extract().text.replace('http://dx.doi.org/', '')
        institution_tag = tag.find('institution')
        if institution_tag:
            out['name'] = institution_tag.extract().text

        # handle named content
        funder_name_tag = tag.find('named-content', {'content-type': 'funder-name'})
        if funder_name_tag:
            out['name'] = funder_name_tag.extract().text

        funder_id_tag = tag.find('named-content', {'content-type': 'funder-identifier'})
        if funder_id_tag:
            out['doi'] = funder_id_tag.extract().text.replace('http://dx.doi.org/', '')

        # handle urls
        if tag.get('xlink:href'):
            out['doi'] = tag['xlink:href']

        # fix DOIs with URLs in them
        if out['doi']:
            match = re.search(r'http(s?)://dx.doi.org/(.+)', out['doi'])
            if match:
                out['doi'] = match.group(2)

        # remainder text is either a name or a full statement; it is read
        # after the extractions above, so extracted children are not in it
        text = tag.text
        if tag.name == 'funding-statement' or ('fund' in text or 'support' in text or 'provide' in text):
            out['notes'] = text
        elif not out['name']:
            # what if something already in 'name'? observed it's typically empty string; so ignore.
            out['name'] = text

        # if DOI link is in the name, remove it and parse (PMC5407128)
        if out['name'] and not out['doi']:
            pattern = r'\s*http(s?)://dx.doi.org/(.+)$'
            match = re.search(pattern, out['name'])
            if match:
                out['doi'] = match.group(2)
                out['name'] = re.sub(pattern, r'', out['name'])

        outs.append(out)
    return outs
245
+
246
+
247
+ # TODO: didnt want to handle <collab> group names; seemed rare and inconsistent; focus on <contrib> with <name> and <aff>
248
def parse_authors(front_tag) -> List[Dict]:
    """Parse individual authors from the ``<contrib>`` tags of the front matter.

    Contribs that contain other contribs, and contribs without a ``<name>``,
    are skipped. Raises ``NoAuthorNamesError`` when a name has no
    ``given-names`` entry.

    Returns one dict per author with keys ``first``, ``middle``, ``last``,
    ``suffix``, ``email``, ``affiliation_ids``, ``corresponding``, ``orcid``.
    """
    authors = []
    for contrib_tag in front_tag.find_all('contrib'):

        # skip nesting; just process children (individual authors)
        if contrib_tag.find_all('contrib'):
            continue

        # skip contribs without a name; these should be ones that consist of <collab> tag
        if contrib_tag.find('name') is None:
            continue

        # corresponding tag
        is_corresp = bool(
            (contrib_tag.get('corresp') == 'yes')
            or (contrib_tag.find('xref', {'ref-type': 'corresp'}))
        )

        # orcid ID is sometimes a URL or just a number. standardize as hyphenized number.
        orcid_id = None
        contrib_id_tag = contrib_tag.find('contrib-id')
        if contrib_id_tag:
            orcid_id = contrib_id_tag.text
            url_match = re.search(r'http(s?)://orcid.org/(.+)', orcid_id)
            if url_match:
                orcid_id = url_match.group(2)
            # A very small number of articles have ID type CATS, which we don't handle. For example:
            # /disk2/gorpus/20200101/pmc/Change/PMC6176774.nxml
            if len(orcid_id) != 19:
                orcid_id = None

        # Email may or may not be present.
        email_tag = contrib_tag.find('email')
        email = email_tag.text if email_tag else None

        # Get the name info for the author.
        name_info = {}
        for name_part in contrib_tag.find('name').find_all():
            name_info[name_part.name] = name_part.text
        # TODO: PMC3462967 is an Erratum. It does not have ['given-names']. not sure we care about those, so try-catch for now
        try:
            given_names = name_info['given-names'].split(' ')
        except KeyError as e:
            raise NoAuthorNamesError

        affiliation_xrefs = contrib_tag.find_all('xref', {'ref-type': 'aff'})
        authors.append({
            'first': given_names[0] if given_names else None,
            'middle': given_names[1:] if given_names else None,
            'last': name_info['surname'],
            'suffix': name_info.get('suffix', ''),
            'email': email,
            'affiliation_ids': [xref_tag.get('rid') for xref_tag in affiliation_xrefs],
            'corresponding': is_corresp,
            'orcid': orcid_id
        })

    return authors
304
+
305
+
306
def parse_affiliations(front_tag) -> List[Dict]:
    """Parse every ``<aff>`` tag of the front matter.

    Sometimes affiliations is nested within '<contrib-group>' along with
    authors. Sometimes, they're not and listed outside as multiple tags.

    Not all <aff> have IDs. For example:
        <aff>St. Paul, Minnesota</aff>

    :param front_tag: the article's front-matter tag; label, sup and
        institution-id children of each affiliation are decomposed
    :return: one dict per affiliation with keys ``id`` (may be ``None``),
        ``other_ids`` (institution-id type -> value) and ``text``
    """
    # Function-scope import: this module does not import logging at top level.
    import logging

    outs = []
    for aff_tag in front_tag.find_all('aff'):
        # get rid of unused markers so `.text` is cleaner
        label_tag = aff_tag.find('label')
        if label_tag:
            label_tag.decompose()
        sup_tag = aff_tag.find('sup')
        if sup_tag:
            sup_tag.decompose()  # same treatment as label

        aff_id = aff_tag.get('id')

        # it looks like we want to go to the full affiliation surface form without worrying about all possible handlings of <named-content> and other fields
        # BUT, we do want to keep ISNI and GRID IDs when they occur.  They seem to occur typically within <institution-wrap>
        # so let's handle those if they exist; safely decompose the tags (because they dont contribute to surface form); then grab remaining affiliation surface form

        # implicit in this approach is that we dont need to actually handle <institution-wrap> tags because only one per affiliation.
        # A debugger breakpoint used to sit here; it would stall unattended
        # runs, so the unexpected case is reported and processing continues.
        if len(aff_tag.find_all('institution-wrap')) > 1:
            logging.getLogger(__name__).warning(
                'Affiliation %r has multiple <institution-wrap> tags; '
                'institution ids of the same type overwrite each other.',
                aff_id,
            )
        id_type_to_id = {}
        for institution_id_tag in aff_tag.find_all('institution-id'):
            id_type_to_id[institution_id_tag['institution-id-type']] = institution_id_tag.text
            institution_id_tag.decompose()

        # TODO: processing of text: there are a lot of random newline chars (cuz XML preserves page layout)
        # --> replace them with whitespace if there's preceding punctuation char
        # --> otherwise, replace them with comma
        text = aff_tag.text

        outs.append({
            'id': aff_id,
            'other_ids': id_type_to_id,
            'text': text
        })

    return outs
347
+
348
+
349
def parse_abstract_tag(front_tag, soup) -> List[Dict]:
    """Extract the ``<abstract>`` and return its paragraphs as blobs.

    Not every paper has an abstract; an empty list is returned in that case.

    Furthermore, not every abstract is structured into sections.
    Some abstracts (see PMC1914226) look like:
        <abstract>
            <p> ... </p>
            <p> ... </p>
        </abstract>

    Every returned blob has its ``'section'`` set to ``'Abstract'``.
    """
    # TODO: are there cases where <abstract> text <p> text </> </abstract> ?
    abstract: List[Dict] = []
    if not front_tag.find('abstract'):
        return abstract

    abstract_tag = front_tag.find('abstract').extract()

    # replace all xref tags with string placeholders
    replace_xref_with_string_placeholders(soup_tag=abstract_tag, soup=soup)

    # replace all sup/sub tags with string placeholders
    replace_sup_sub_tags_with_string_placeholders(soup_tag=abstract_tag, soup=soup)

    if abstract_tag.find('sec'):
        # Structured abstract: walk each top-level section.
        paragraph_blobs = []
        for section in abstract_tag.find_all('sec', recursive=False):
            paragraph_blobs.extend(recurse_parse_section(sec_tag=section))
    else:
        paragraph_blobs = parse_all_paragraphs_in_section(sec_tag=abstract_tag)

    for blob in paragraph_blobs:
        # these 'sections' typically show up as empty string
        blob['section'] = 'Abstract'
        abstract.append(blob)
    return abstract
s2orc-doc2json/doc2json/jats2json/pmc_utils/tests.py ADDED
@@ -0,0 +1,347 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ funding_tags_and_parsed_dicts = [
3
+ # <funding-group> is typically the top-level tag
4
+ #
5
+ # within, we see <funding-source> and <funding-statement> as containing the main information we want
6
+ #
7
+ # here, we see <funding-source> with an 'id' attribute. we can ignore these.
8
+ ("""<funding-group>
9
+ <award-group>
10
+ <funding-source id=\"CS200\">Wellcome Trust</funding-source>
11
+ </award-group>
12
+ </funding-group>""", None),
13
+ # sometimes, there are also <award-id> tags, but we can ignore these. they're funding-group specific.
14
+ ("""<funding-group>
15
+ <award-group>
16
+ <funding-source id=\"sp1\">US Department of Energy's Office of Science, Biological and Environmental Research Program</funding-source>
17
+ <award-id rid=\"sp1\">DE-AC02-05CH11231</award-id>
18
+ <award-id rid=\"sp1\">DE-AC52-07NA27344</award-id>
19
+ <award-id rid=\"sp1\">DE-AC02-06NA25396</award-id>
20
+ <award-id rid=\"sp1\">DE-AC05-00OR22725</award-id>
21
+ </award-group>
22
+ <award-group>
23
+ <funding-source id=\"sp2\">German Research Foundation</funding-source>
24
+ <award-id rid=\"sp2\">INST 599/1-2</award-id>
25
+ </award-group>
26
+ </funding-group>""", None),
27
+
28
+ # <funding-statement> is a less structured alternative to <funding-source>
29
+ ("""<funding-group>
30
+ <funding-statement>No sources of funding were used to assist in the preparation of this study.</funding-statement>
31
+ </funding-group>""", None),
32
+
33
+ # Rarely, there is nesting! ignore parents.
34
+ ("""<funding-group>
35
+ <funding-statement>
36
+ <funding-source>This work was supported by the Swedish Association for Sexuality Education (RFSU).</funding-source>
37
+ </funding-statement>
38
+ </funding-group>""", None),
39
+
40
+
41
+ # Sometimes both can occur, sort of duplicating the same information.
42
+ # For example "Cornell" is mentioned as both a <funding-source> and a <funding-statement>
43
+ ("""<funding-group>
44
+ <award-group>
45
+ <funding-source>
46
+ <named-content content-type=\"funder-name\">Cornell University Institute for the Social Sciences</named-content>
47
+ </funding-source>
48
+ </award-group>
49
+ <funding-statement>The research was supported by a grant from the Cornell University Institute for the Social Sciences.</funding-statement>
50
+ </funding-group>""", None),
51
+
52
+ # many <funding-source>
53
+ ("""<funding-group>
54
+ <award-group id=\"sp1\">
55
+ <funding-source>Brien Holden Vision Institute</funding-source>
56
+ </award-group>
57
+ <award-group id=\"sp2\">
58
+ <funding-source>Australian Federal Government</funding-source>
59
+ </award-group>
60
+ <award-group id=\"sp3\">
61
+ <funding-source>International Postgraduate Research Scholarship (Cathleen Fedtke)</funding-source>
62
+ </award-group>
63
+ <award-group id=\"sp4\">
64
+ <funding-source>University of New South Wales, Australia</funding-source>
65
+ </award-group>
66
+ <award-group id=\"sp5\">
67
+ <funding-source>National Institutes of Health</funding-source>
68
+ <award-id>P30EY14801</award-id>
69
+ </award-group>
70
+ <award-group id=\"sp6\">
71
+ <funding-source>Florida Lions Eye Bank</funding-source>
72
+ </award-group>
73
+ <award-group id=\"sp7\">
74
+ <funding-source>Bascom Palmer Eye Institute</funding-source>
75
+ </award-group>
76
+ </funding-group>""", None),
77
+
78
+ # institutions can optionally occur within <funding-source>
79
+ # 'institution-id-type' is common, but also optional
80
+ # regardless of the institution ID type, it looks like the ID is always a DOI (or URL to a DOI)
81
+ ("""<funding-group>
82
+ <award-group>
83
+ <funding-source>
84
+ <institution-wrap>
85
+ <institution-id institution-id-type=\"FundRef\">http://dx.doi.org/10.13039/100000025</institution-id>
86
+ <institution>National Institute of Mental Health</institution>
87
+ </institution-wrap>
88
+ </funding-source>
89
+ <award-id>R01MH107333</award-id>
90
+ <principal-award-recipient>
91
+ <name><surname>Kim</surname><given-names>Woong-Ki</given-names></name>
92
+ </principal-award-recipient>
93
+ </award-group>
94
+ </funding-group>""", None),
95
+ ("""<funding-group specific-use=\"FundRef\">
96
+ <award-group>
97
+ <funding-source>
98
+ <institution-wrap>
99
+ <institution>Deutsche Forschungsgemeinschaft</institution>
100
+ <institution-id>http://search.crossref.org/fundref?q=501100001659</institution-id>
101
+ </institution-wrap>
102
+ </funding-source>
103
+ <award-id>Re 628/16-1</award-id>
104
+ <award-id>GRK 1216</award-id>
105
+ </award-group>
106
+ </funding-group>""", None),
107
+ ("""<funding-group>
108
+ <award-group id=\"funding-1\">
109
+ <funding-source>
110
+ <institution-wrap>
111
+ <institution>National Institutes of Health </institution>
112
+ <institution-id institution-id-type=\"open-funder-registry\">10.13039/100000002</institution-id>
113
+ </institution-wrap>
114
+ </funding-source>
115
+ </award-group>
116
+ </funding-group>""", None),
117
+
118
+ # handling <named-content>
119
+ ("""<funding-group>
120
+ <award-group>
121
+ <funding-source>
122
+ <named-content content-type=\"funder-name\">Austrian Science Fund</named-content>
123
+ <named-content content-type=\"funder-identifier\">10.13039/501100002428</named-content>
124
+ </funding-source>
125
+ <award-id>P 27625</award-id>
126
+ </award-group>
127
+ <funding-statement>This work was supported by Austrian Science Fund [grant number P 27625].</funding-statement>
128
+ </funding-group>""", None),
129
+
130
+ # handling xlink:href attributes
131
+ ("""<funding-group>
132
+ <award-group>
133
+ <funding-source xlink:href=\"http://dx.doi.org/10.13039/501100000269\">Economic and Social Research Council</funding-source>
134
+ <award-id>RES-360-25-0032</award-id>
135
+ </award-group>
136
+ <award-group>
137
+ <funding-source xlink:href=\"http://dx.doi.org/10.13039/100004440\">Wellcome Trust</funding-source>
138
+ <award-id>106542/Z/14/Z</award-id>
139
+ </award-group>
140
+ </funding-group>""", None)
141
+ ]
142
+
143
+ acknowledgement_tags_and_parsed_dicts = [
144
+ # variants with <ack id> may/may not have a <title>. always have <p> but may/may not have <p id>. <title> never has attributes.
145
+ # the <p> text might contain <funding-source> or <ext-link> tags.
146
+ # the <ext-link> tags have required attributes 'ext-link-type' and 'xlink:href', and optional attribute 'id'. all the <ext-links> are URLs.
147
+ ("""<ack id=\"ack0005\">
148
+ <title>Acknowledgements</title>
149
+ <p>The authors thank the <funding-source id=\"gs0005\">BBSRC</funding-source> (Project Grants BB/M025349/1 and BB/P011969/1) for its continued support, and appreciate the helpful comments of Dr Rob Young, Cardiff University School of Optometry and Vision Sciences.</p>
150
+ </ack>""", {
151
+ 'text': 'The authors thank the BBSRC (Project Grants BB/M025349/1 and BB/P011969/1) for its continued support, and appreciate the helpful comments of Dr Rob Young, Cardiff University School of Optometry and Vision Sciences.',
152
+ 'funding': [{'text': 'BBSRC', 'id': 'gs0005'}],
153
+ 'url': None}),
154
+ ("""<ack id=\"S27\">
155
+ <p>Supported by AA-11431 and AA-12908 from the National Institutes of Health and the Tobacco-Related Disease Research Program Grant 17RT-0171.</p>
156
+ </ack>""", {
157
+ 'text': 'Supported by AA-11431 and AA-12908 from the National Institutes of Health and the Tobacco-Related Disease Research Program Grant 17RT-0171.',
158
+ 'funding': [],
159
+ 'url': None}),
160
+ ("""<ack id=\"S11\">
161
+ <title>Acknowledgements</title>
162
+ <p id=\"P33\">This work was supported by the National Institutes of Health,National Cancer Institute grants R01CA196967 and R01CA209886.</p>
163
+ </ack>""", {
164
+ 'text': 'This work was supported by the National Institutes of Health,National Cancer Institute grants R01CA196967 and R01CA209886.',
165
+ 'funding': [],
166
+ 'url': None}),
167
+ ("""<ack id=\"mee312535-sec-0015\">
168
+ <title>Data accessibility</title>
169
+ <p>The data used is included in the RepeatABEL package available at <ext-link ext-link-type=\"uri\" xlink:href=\"https://cran.r-project.org/web/packages/RepeatABEL\">https://cran.r-project.org/web/packages/RepeatABEL</ext-link>.</p>
170
+ </ack>""", {
171
+ 'text': 'The data used is included in the RepeatABEL package available at https://cran.r-project.org/web/packages/RepeatABEL.',
172
+ 'funding': [],
173
+ 'url': 'https://cran.r-project.org/web/packages/RepeatABEL'}),
174
+ # variants with <ack> are similar to the above.
175
+ ("""<ack>
176
+ <title>Acknowledgments</title>
177
+ <p>D.B.K. thanks Prof. Nigel Harper for a very useful discussion. We also thank the referees and the journal editors for exceptionally careful and thoughtful reviews that helped improve the manuscript considerably.</p>
178
+ </ack>""", {
179
+ 'text': 'D.B.K. thanks Prof. Nigel Harper for a very useful discussion. We also thank the referees and the journal editors for exceptionally careful and thoughtful reviews that helped improve the manuscript considerably.',
180
+ 'funding': [],
181
+ 'url': None}),
182
+ ("""<ack>
183
+ <title>Conflict of interest</title>
184
+ <p>The authors declare there is no conflict of interest associated with this manuscript.</p>
185
+ </ack>""", {
186
+ 'text': 'The authors declare there is no conflict of interest associated with this manuscript.',
187
+ 'funding': [],
188
+ 'url': None})
189
+ ]
190
+
191
+ affiliation_tags_and_parsed_dicts = [
192
+ # mix of <aff> tags with and without IDs
193
+ ("""<aff>Department of Internal Medicine, Division of Cardiology, Inha University Hospital, Incheon, South Korea</aff>""", None),
194
+ ("""<aff id=\"aff1\"><label>1</label>Department of Cardiology, Atatürk Chest Diseases and Chest Surgery Training and Research Hospital; Ankara-Turkey</aff>""", None),
195
+ # there can exist a <label> tag with/without IDs
196
+ ("""<aff><label>3</label>Center for Medical Education, Sapporo Medical University, <addr-line>Sapporo, Japan</addr-line></aff>""", None),
197
+ # sometimes, the marker used in paper is kept also. for example, `1` in superscript.
198
+ # this can exist with/without the <label> tag. as in, it's inconsistent whether the marker is encapsulated in <label> or kept as string
199
+ ("""<aff id=\"I1\">\n<sup>1</sup>Department of Orthodontics, College of Dentistry, King Khalid University, Abha, Saudi Arabia</aff>""", None),
200
+ ("""<aff id=\"hic312304-aff-0001\"><label><sup>1</sup></label><institution>University of Dundee</institution></aff>""", None),
201
+ # <institution> tags can be straightforward; just ignore and grab text
202
+ ("""<aff id=\"AF02477-1\"><label>1</label><institution>School of Chemistry, The University of Manchester, Manchester, United Kingdom</institution>""", None),
203
+ # sometimes <institution> tags can have SIBLING tags, like <addr-line> or <country>
204
+ ("""<aff id=\"aff002\"><label>2</label>Sr. Consultant &amp; Head, Dept. of Neurology, <institution>National Neurosciences Centre, Peerless Hospital</institution>, <addr-line>Kolkata, India</addr-line></aff>""", None),
205
+ ("""<aff id=\"aff2\"><label><sup>2</sup></label>Institute for Transplantation Diagnostics and Cell Therapeutics, <institution>Heinrich Heine University Düsseldorf</institution>, Düsseldorf, <country>Germany</country>.</aff>""", None),
206
+ # <named-content> is also a common CHILD tag; these can be either entirely structured affiliation entries (not intended for tag.text)
207
+ ("""<aff id=\"embr201642857-aff-0007\">
208
+ <label><sup>7</sup></label>
209
+ <institution>VIB</institution>
210
+ <named-content content-type=\"city\">Zwijnaarde</named-content>
211
+ <country country=\"BE\">Belgium</country>
212
+ </aff>""", None),
213
+ # or overlaid on a single affiliation string (comma-separated if you call tag.text)
214
+ ("""<aff id=\"AFF0005\">
215
+ <label><sup>e</sup></label>
216
+ <institution>
217
+ <named-content content-type=\"department\">School of Public Health &amp; Health Systems</named-content>, <named-content content-type=\"institution-name\">University of Waterloo</named-content>
218
+ </institution>
219
+ </aff>""", None),
220
+ # example of a nonsense one that has TWO <named-content> tags, whitespaces, the <sup> tag WITHIN <label>
221
+ ("""<aff id=\"ejn14074-aff-0007\">\n
222
+ <label><sup>7</sup></label>\n
223
+ <named-content content-type=\"organisation-division\">Brain Research Institute</named-content>\n
224
+ <institution>University of Zürich</institution>\n
225
+ <named-content content-type=\"city\">Zürich</named-content>\n
226
+ <country country=\"CH\">Switzerland</country>\n</aff>""", None),
227
+ # most common content-type within <named-content> are: 'department', 'organisation-division', 'city', 'institution-name', 'postal-code', 'country-part', etc.
228
+
229
+ # <institution-wrap> is the other popular way to surface <institution> tags.
230
+ # They seem to always come with 1+ <institution-id> as children.
231
+
232
+ # finally, these wrappers can wrap multiple <institution> tags.
233
+ # in this example, see how the COMMA is awkwardly encapsulated within <institution> tags? Also, notice how the country is untagged outside of <institution-wrap>
234
+ # basically, everything is weird.
235
+ ("""<aff id=\"Aff10\">
236
+ <label>10</label>
237
+ <institution-wrap>
238
+ <institution-id institution-id-type=\"ISNI\">0000000123222966</institution-id>
239
+ <institution-id institution-id-type=\"GRID\">grid.6936.a</institution-id>
240
+ <institution>Institute of Experimental Genetics, Life and Food Science Center Weihenstephan, </institution>
241
+ <institution>Technische Universität München, </institution>
242
+ </institution-wrap>Freising-Weihenstephan, Germany </aff>""", None)
243
+ ]
244
+
245
+ author_tags_and_parsed_dicts = [
246
+ # every author seems to be in a <contrib> tag.
247
+ # all <contrib> tags seem to have a 'contrib-type' attribute, which often equals 'author' and sometimes equals 'collab'
248
+
249
+ # below is an 'author' that has <name>, <address>, and <bio> child tags. Also XREF to affiliation (can have multiple).
250
+ ("""<contrib contrib-type=\"author\">
251
+ <name><surname>Sandström</surname><given-names>Annica</given-names></name>
252
+ <address><email>annica.sandstrom@ltu.se</email></address>
253
+ <xref ref-type=\"aff\" rid=\"Aff2\"/>
254
+ <bio><sec id=\"d30e226\"><title>Annica Sandström</title><p>is an Associate Professor in Political Science at Luleå University of Technology. Working foremost within the field of environmental policy and management, her publications include empirical studies on the socio-political complexities of natural resource governance as well as theory-driven pieces on collaborative management, adaptive management, and policy networks.</p></sec></bio>
255
+ </contrib>""", None),
256
+ ("""<contrib contrib-type="author">
257
+ <name><surname>Cassidy</surname><given-names>John W.</given-names></name>
258
+ <xref ref-type="aff" rid="A1">1</xref>
259
+ <xref ref-type="aff" rid="A2">2</xref>
260
+ </contrib>""", None),
261
+
262
+ # below is an 'author' that contains a <collab> child tag. We can see sometimes there's other tags like an XREF to affiliation which can probably be .decomposed()
263
+ ("""<contrib contrib-type=\"author\">
264
+ <collab>The HIV Neurobehavioral Research Programs (HNRP) Group</collab>
265
+ </contrib>""", None),
266
+ ("""<contrib contrib-type=\"author\">
267
+ <collab>JET EFDA contributors</collab>
268
+ <xref ref-type=\"aff\" rid=\"aff1\">a</xref><xref ref-type=\"fn\" rid=\"fn3\">3</xref>
269
+ </contrib>""", None),
270
+
271
+ # below is a 'collab' that also contains nested <contrib> tags wrapped by <contrib-group>. Yikes!
272
+ # luckily, it seems <contrib-group> is rare and always nested within an ultimate parent <contrib>
273
+ # --> these are more like affiliations
274
+ ("""<contrib contrib-type=\"collab\">
275
+ <collab>UK Biobank Eye and Vision Consortium\n
276
+ <contrib-group>
277
+ <contrib contrib-type=\"collab\">
278
+ <name><surname>Aslam</surname><given-names>Tariq</given-names></name>
279
+ </contrib>
280
+ <contrib contrib-type=\"collab\">
281
+ <name><surname>Bishop</surname><given-names>Paul</given-names></name>
282
+ </contrib>
283
+ <contrib contrib-type=\"collab\">
284
+ <name><surname>Barman</surname><given-names>Sarah</given-names></name>
285
+ </contrib>
286
+ </contrib-group>
287
+ </collab>
288
+ </contrib>
289
+ """, None),
290
+ ("""<contrib contrib-type="author">
291
+ <collab>WERF EPHect Working Group
292
+ <contrib-group>
293
+ <contrib contrib-type="author"><name><surname>Adamson</surname><given-names>G.D.</given-names></name></contrib>
294
+ <contrib contrib-type="author"><name><surname>Allaire</surname><given-names>C.</given-names></name></contrib>
295
+ </contrib-group>
296
+ </collab>
297
+ </contrib>""", None),
298
+
299
+ # there are optional <aff> tags instead of an <xref ref-type=\"aff\">
300
+ ("""<contrib contrib-type=\"author\">
301
+ <name><surname>Beedle</surname><given-names>Aaron M</given-names></name>
302
+ <aff id=\"A1\">Department of Pharmaceutical and Biomedical Sciences, University of Georgia College of Pharmacy, Athens, GA 30602 USA</aff>
303
+ </contrib>""", None),
304
+
305
+ # corresponding authors are indicated in two ways: (i) within <contrib> as a 'corresp=yes' attribute, (ii) within <xref> as a 'ref-type=corresp' attribute
306
+ ("""<contrib contrib-type=\"author\" corresp=\"yes\">
307
+ <name><surname>Kim</surname><given-names>Woong-Ki</given-names></name>
308
+ <address><email>kimw@evms.edu</email></address>
309
+ <xref ref-type=\"aff\" rid=\"Aff1\">1</xref>
310
+ </contrib>""", None),
311
+ ("""<contrib contrib-type=\"author\">
312
+ <name><surname>Suero Molina</surname><given-names>Eric</given-names></name>
313
+ <degrees>MD, MBA</degrees>
314
+ <!--<email>eric.suero@ukmuenster.de</email>-->
315
+ <xref ref-type=\"aff\" rid=\"aff1\"/>
316
+ <xref ref-type=\"corresp\" rid=\"cor1\"/>
317
+ </contrib>""", None),
318
+ # note that contrib-type 'editor' is also present, and seems to accompany <role> tag and 'corresp=no' attribute
319
+ ("""<contrib contrib-type=\"editor\" corresp=\"no\">
320
+ <name><surname>Greene</surname><given-names>Robert L.</given-names></name>
321
+ <role>Editor</role>
322
+ </contrib>""", None),
323
+
324
+ # within <contrib> are optional child tags <contrib-id>
325
+ # the 'contrib-id-type' seems to always be 'orcid'
326
+ # authentication seems optional
327
+ ("""<contrib contrib-type=\"author\" corresp=\"yes\">
328
+ <contrib-id authenticated=\"false\" contrib-id-type=\"orcid\">https://orcid.org/0000-0002-9987-6824</contrib-id>
329
+ <name><surname>Sandeepa</surname><given-names>N. C.</given-names></name>
330
+ <email>drsandeepanc@gmail.com</email>
331
+ <xref ref-type=\"aff\" rid=\"I2\">\n<sup>2</sup>\n</xref>
332
+ </contrib>""", None),
333
+ ("""<contrib contrib-type=\"author\" corresp=\"yes\">
334
+ <contrib-id contrib-id-type=\"orcid\">http://orcid.org/0000-0003-1079-4775</contrib-id>
335
+ <name><surname>West</surname><given-names>Ann H.</given-names></name>
336
+ <address><email>awest@ou.edu</email></address>
337
+ <xref ref-type=\"aff\" rid=\"Aff1\">1</xref>
338
+ </contrib>""", None),
339
+
340
+ # more edge cases; a <contrib> tag with no <name> --> probably just remove
341
+ ("""<contrib contrib-type="author">
342
+ <collab>on behalf of the National Advisory Committee on Blood and Blood Products
343
+ <xref ref-type="author-notes" rid="fn1">*</xref>
344
+ </collab>
345
+ </contrib>""", None),
346
+
347
+ ]
s2orc-doc2json/doc2json/jats2json/process_jats.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import argparse
4
+ import time
5
+ from typing import Optional
6
+
7
+ from doc2json.jats2json.jats_to_json import convert_jats_xml_to_s2orc_json
8
+
9
+
10
# Default working directories (relative paths) used as parameter defaults below.
BASE_TEMP_DIR = "temp"      # where incoming streams are written
BASE_OUTPUT_DIR = "output"  # where the converted JSON files are written
BASE_LOG_DIR = "log"        # handed to the JATS converter as its log dir
13
+
14
+
15
def process_jats_stream(
        fname: str,
        stream: bytes,
        temp_dir: str=BASE_TEMP_DIR
):
    """
    Write a JATS byte stream to disk, convert it, and return the parsed JSON.

    The bytes are saved as ``<temp_dir>/input/<fname>`` and that file is
    handed to ``process_jats_file`` (with its default output and log dirs).
    :param fname: file name to store the stream under
    :param stream: raw bytes of the JATS XML document
    :param temp_dir: base directory for the temporary input file
    :return: the loaded contents of the output JSON file, or an empty list
        if the output file does not exist
    """
    input_dir = os.path.join(temp_dir, 'input')
    input_file = os.path.join(input_dir, fname)

    for directory in (temp_dir, input_dir):
        os.makedirs(directory, exist_ok=True)

    with open(input_file, 'wb') as handle:
        handle.write(stream)

    output_file = process_jats_file(input_file)

    if not os.path.exists(output_file):
        return []

    with open(output_file, 'r') as handle:
        return json.load(handle)
44
+
45
+
46
def process_jats_file(
        jats_file: str,
        output_dir: str=BASE_OUTPUT_DIR,
        log_dir: str=BASE_LOG_DIR,
) -> Optional[str]:
    """
    Convert one JATS XML file to its S2ORC JSON representation on disk.

    The paper id is the input file's base name without its extension, and
    the result is written to ``<output_dir>/<paper_id>.json``. An existing
    output file is reported on stdout and then overwritten.
    :param jats_file: path to the input JATS XML file
    :param output_dir: directory the JSON file is written to (created if missing)
    :param log_dir: directory passed to the converter for logs (created if missing)
    :return: path of the JSON file that was written
    :raises FileNotFoundError: if ``jats_file`` does not exist
    """
    # validate the input first so a bad path leaves no stray directories behind
    if not os.path.exists(jats_file):
        raise FileNotFoundError(f"{jats_file} doesn't exist")

    # create directories
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)

    # get paper id as the name of the file; os.path.basename understands the
    # platform's separators, unlike a hard-coded split on '/'
    paper_id = os.path.splitext(os.path.basename(jats_file))[0]
    output_file = os.path.join(output_dir, f'{paper_id}.json')

    if os.path.exists(output_file):
        print(f'{output_file} already exists!')

    # convert to S2ORC
    paper = convert_jats_xml_to_s2orc_json(jats_file, log_dir)

    # write to file
    with open(output_file, 'w') as outf:
        json.dump(paper.release_json("jats"), outf, indent=4, sort_keys=False)

    return output_file
80
+
81
+
82
if __name__ == '__main__':
    # Command-line entry point: convert a single JATS XML file to S2ORC JSON.
    parser = argparse.ArgumentParser(description="Run S2ORC JATS2JSON")
    parser.add_argument("-i", "--input", default=None, help="path to the input JATS XML file")
    parser.add_argument("-o", "--output", default='output', help="path to the output dir for putting json files")
    parser.add_argument("-l", "--log", default='log', help="path to the log dir")
    # Still accepted so existing command lines keep working, but unused:
    # process_jats_file has no parameter for it.
    parser.add_argument("-k", "--keep", default=False, help="keep temporary files")

    args = parser.parse_args()

    input_path = args.input
    output_path = args.output
    log_path = args.log

    # fail with a usage message instead of a TypeError deep inside os.path
    if input_path is None:
        parser.error("an input JATS XML file is required (-i/--input)")

    start_time = time.time()

    os.makedirs(output_path, exist_ok=True)

    # process_jats_file accepts (jats_file, output_dir, log_dir); passing the
    # keep flag as a fourth positional argument raised TypeError on every run
    process_jats_file(input_path, output_path, log_path)

    runtime = round(time.time() - start_time, 3)
    print("runtime: %s seconds " % (runtime))
    print('done.')
s2orc-doc2json/doc2json/s2orc.py ADDED
@@ -0,0 +1,527 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ S2ORC classes
3
+ """
4
+
5
+ from datetime import datetime
6
+ from typing import Dict, List, Optional
7
+ from doc2json.config import *
8
+
9
+
10
# Key renames: source key -> key/attribute name used by the classes here.
# NOTE(review): presumably applied when loading older JSON -- confirm at the usage site.
CORRECT_KEYS = dict(issn="issue", type="type_str")

# Keys that are dropped rather than carried over.
SKIP_KEYS = {"bib_id", "link"}

# Per reference type: the attributes ReferenceEntry.as_json() emits.
REFERENCE_OUTPUT_KEYS = {
    "figure": {"text", "type_str", "uris", "num"},
    "table": {"text", "type_str", "content", "num", "html"},
    "footnote": {"text", "type_str", "num"},
    "section": {"text", "type_str", "num", "parent"},
    "equation": {"text", "type_str", "latex", "mathml", "num"},
}

# Names of the paper-level metadata fields.
METADATA_KEYS = {
    "title",
    "authors",
    "year",
    "venue",
    "identifiers",
}
31
+
32
+
33
class ReferenceEntry:
    """
    A non-bibliographic reference target in an S2ORC paper (figure, table,
    footnote, section, equation, ...).

    ``as_json`` emits only the attributes listed in REFERENCE_OUTPUT_KEYS for
    the entry's ``type_str``; for any other type it emits every field, with
    the type stored under the key "type".

    An example json representation (values are examples, not accurate):

    {
        "FIGREF0": {
            "text": "FIG. 2. Depth profiles of...",
            "type_str": "figure",
            "uris": null,
            "num": null
        },
        "TABREF2": {
            "text": "Diversity indices of...",
            "type_str": "table",
            "content": "",
            "html": "",
            "num": null
        }
    }
    """
    def __init__(
            self,
            ref_id: str,
            text: str,
            type_str: str,
            latex: Optional[str] = None,
            mathml: Optional[str] = None,
            content: Optional[str] = None,
            html: Optional[str] = None,
            uris: Optional[List[str]] = None,
            num: Optional[str] = None,
            parent: Optional[str] = None
    ):
        # identity and caption
        self.ref_id = ref_id
        self.text = text
        self.type_str = type_str
        # optional payloads; which ones get serialised depends on type_str
        self.latex = latex
        self.mathml = mathml
        self.content = content
        self.html = html
        self.uris = uris
        self.num = num
        self.parent = parent

    def as_json(self):
        """Return the dict form of this entry (see the class docstring)."""
        wanted = REFERENCE_OUTPUT_KEYS.get(self.type_str, None)
        if not wanted:
            # unknown type: dump everything, type under the key "type"
            return {
                "text": self.text,
                "type": self.type_str,
                "latex": self.latex,
                "mathml": self.mathml,
                "content": self.content,
                "html": self.html,
                "uris": self.uris,
                "num": self.num,
                "parent": self.parent
            }
        return {field: getattr(self, field) for field in wanted}
96
+
97
+
98
class BibliographyEntry:
    """
    One parsed bibliography entry of an S2ORC paper.

    ``bib_id`` is kept on the object but is not part of ``as_json`` output.

    An example json representation (values are examples, not accurate):

    {
        "title": "Mobility Reports...",
        "authors": [
            {
                "first": "A",
                "middle": ["A"],
                "last": "Haija",
                "suffix": ""
            }
        ],
        "year": 2015,
        "venue": "IEEE Wireless Commun. Mag",
        "volume": "42",
        "issue": "9",
        "pages": "80--92",
        "other_ids": {
            "doi": [
                "10.1109/TWC.2014.2360196"
            ]
        }
    }

    """
    # attribute names serialised by as_json, in output order
    _JSON_FIELDS = (
        "ref_id", "title", "authors", "year", "venue", "volume", "issue",
        "pages", "other_ids", "num", "urls", "raw_text", "links",
    )

    def __init__(
            self,
            bib_id: str,
            title: str,
            authors: List[Dict[str, str]],
            ref_id: Optional[str] = None,
            year: Optional[int] = None,
            venue: Optional[str] = None,
            volume: Optional[str] = None,
            issue: Optional[str] = None,
            pages: Optional[str] = None,
            other_ids: Dict[str, List] = None,
            num: Optional[int] = None,
            urls: Optional[List] = None,
            raw_text: Optional[str] = None,
            links: Optional[List] = None
    ):
        # identifiers
        self.bib_id = bib_id
        self.ref_id = ref_id
        # citation fields
        self.title = title
        self.authors = authors
        self.year = year
        self.venue = venue
        self.volume = volume
        self.issue = issue
        self.pages = pages
        self.other_ids = other_ids
        # extras
        self.num = num
        self.urls = urls
        self.raw_text = raw_text
        self.links = links

    def as_json(self):
        """Return every field except ``bib_id`` as a plain dict."""
        return {field: getattr(self, field) for field in self._JSON_FIELDS}
176
+
177
+
178
class Affiliation:
    """
    An author affiliation: laboratory, institution and a location dict.

    Example:
        {
            "laboratory": "Key Laboratory of Urban Environment and Health",
            "institution": "Chinese Academy of Sciences",
            "location": {
                "postCode": "361021",
                "settlement": "Xiamen",
                "country": "People's Republic of China"
            }
        }
    """
    def __init__(
            self,
            laboratory: str,
            institution: str,
            location: Dict
    ):
        self.laboratory = laboratory
        self.institution = institution
        self.location = location

    def as_json(self):
        """Return the three fields as a plain dict."""
        fields = ("laboratory", "institution", "location")
        return {field: getattr(self, field) for field in fields}
208
+
209
+
210
class Author:
    """
    A paper author: name parts, an optional affiliation and an email.

    Example:

        {
            "first": "Anyi",
            "middle": [],
            "last": "Hu",
            "suffix": "",
            "affiliation": {
                "laboratory": "Key Laboratory of Urban Environment and Health",
                "institution": "Chinese Academy of Sciences",
                "location": {
                    "postCode": "361021",
                    "settlement": "Xiamen",
                    "country": "People's Republic of China"
                }
            },
            "email": ""
        }
    """
    def __init__(
            self,
            first: str,
            middle: List[str],
            last: str,
            suffix: str,
            affiliation: Optional[Dict] = None,
            email: Optional[str] = None
    ):
        self.first = first
        self.middle = middle
        self.last = last
        self.suffix = suffix
        # a non-empty dict is expanded into an Affiliation; anything falsy
        # is stored as an empty dict
        if affiliation:
            self.affiliation = Affiliation(**affiliation)
        else:
            self.affiliation = {}
        self.email = email

    def as_json(self):
        """Return the author as a plain dict, with the affiliation serialised too."""
        if self.affiliation:
            affiliation_json = self.affiliation.as_json()
        else:
            affiliation_json = {}
        return {
            "first": self.first,
            "middle": self.middle,
            "last": self.last,
            "suffix": self.suffix,
            "affiliation": affiliation_json,
            "email": self.email
        }
258
+
259
+
260
class Metadata:
    """
    Paper-level metadata: title, authors, year, venue and identifiers.

    Example:
        {
            "title": "Niche Partitioning...",
            "authors": [
                {
                    "first": "Anyi",
                    "middle": [],
                    "last": "Hu",
                    "suffix": "",
                    "affiliation": {
                        "laboratory": "Key Laboratory of Urban Environment and Health",
                        "institution": "Chinese Academy of Sciences",
                        "location": {
                            "postCode": "361021",
                            "settlement": "Xiamen",
                            "country": "People's Republic of China"
                        }
                    },
                    "email": ""
                }
            ],
            "year": "2011-11"
        }
    """
    def __init__(
            self,
            title: str,
            authors: List[Dict],
            year: Optional[str] = None,
            venue: Optional[str] = None,
            identifiers: Optional[Dict] = None
    ):
        """
        :param title: paper title
        :param authors: list of dicts, each expanded into an Author via keyword arguments
        :param year: publication date string (the example above uses "2011-11")
        :param venue: publication venue
        :param identifiers: dict of identifiers; omitted or None gives a new empty dict
        """
        self.title = title
        self.authors = [Author(**author) for author in authors]
        self.year = year
        self.venue = venue
        # A `{}` literal as the default would be a single dict shared by every
        # instance built without identifiers; create a fresh one per instance.
        self.identifiers = {} if identifiers is None else identifiers

    def as_json(self):
        """Return the metadata as a plain dict, with each author serialised via as_json()."""
        return {
            "title": self.title,
            "authors": [author.as_json() for author in self.authors],
            "year": self.year,
            "venue": self.venue,
            "identifiers": self.identifiers
        }
310
+
311
+
312
class Paragraph:
    """
    Class for representing a parsed paragraph from Grobid xml
    All xml tags are removed from the paragraph text, all figures, equations, and tables are replaced
    with a special token that maps to a reference identifier
    Citation mention spans and section header are extracted

    An example json representation (values are examples, not accurate):

    {
        "text": "Formal language techniques BID1 may be used to study FORMULA0 (see REF0)...",
        "cite_spans": [
            {
                "start": 27,
                "end": 31,
                "text": "[1]"
            }
        ],
        "ref_spans": [
            {
                "start": 67,
                "end": 71,
                "text": "Fig. 1"
            }
        ],
        "eq_spans": [
            {
                "start": 53,
                "end": 61,
                "text": "α = 1",
                "latex": "\\alpha = 1",
                "ref_id": null
            }
        ],
        "section": "Abstract",
        "sec_num": null
    }
    """
    def __init__(
            self,
            text: str,
            cite_spans: List[Dict],
            ref_spans: List[Dict],
            eq_spans: Optional[List[Dict]] = None,
            section: Optional = None,
            sec_num: Optional = None
    ):
        """
        :param text: paragraph text
        :param cite_spans: citation mention spans
        :param ref_spans: figure/table/etc. reference spans
        :param eq_spans: equation spans; omitted or None gives a new empty list
        :param section: either a '::'-joined string of section names
            (outermost first) or an already-built list of [number, name] pairs
        :param sec_num: section number, attached to the innermost section
            when ``section`` is given as a non-empty string
        """
        self.text = text
        self.cite_spans = cite_spans
        self.ref_spans = ref_spans
        # A `[]` literal as the default would be one list shared by every
        # paragraph built without eq_spans; create a fresh one per instance.
        self.eq_spans = [] if eq_spans is None else eq_spans
        if isinstance(section, str):
            if section:
                # "A::B" -> [[None, "A"], [None, "B"]]
                section_list = [[None, sec_name] for sec_name in section.split('::')]
            else:
                section_list = None
            if section_list and sec_num:
                # only the innermost (last) section carries the number
                section_list[-1][0] = sec_num
        else:
            # not a string: stored as given (sec_num is not applied here)
            section_list = section
        self.section = section_list

    def as_json(self):
        """Return the paragraph as a plain dict; section names are re-joined with '::'."""
        return {
            "text": self.text,
            "cite_spans": self.cite_spans,
            "ref_spans": self.ref_spans,
            "eq_spans": self.eq_spans,
            "section": '::'.join([sec[1] for sec in self.section]) if self.section else "",
            "sec_num": self.section[-1][0] if self.section else None
        }
382
+
383
+
384
class Paper:
    """
    Class for representing a parsed S2ORC paper
    """
    def __init__(
            self,
            paper_id: str,
            pdf_hash: str,
            metadata: Dict,
            abstract: List[Dict],
            body_text: List[Dict],
            back_matter: List[Dict],
            bib_entries: Dict,
            ref_entries: Dict
    ):
        """
        :param paper_id: paper identifier
        :param pdf_hash: hash of the source PDF (may be None)
        :param metadata: keyword arguments for ``Metadata``
        :param abstract: list of keyword-argument dicts for ``Paragraph``
        :param body_text: list of keyword-argument dicts for ``Paragraph``
        :param back_matter: list of keyword-argument dicts for ``Paragraph``
        :param bib_entries: maps bib id -> entry dict; keys listed in SKIP_KEYS
            are dropped and keys found in CORRECT_KEYS are renamed before the
            dict is passed to ``BibliographyEntry``
        :param ref_entries: maps ref id -> entry dict; a ``'ref_id'`` key is
            dropped and keys found in CORRECT_KEYS are renamed before the dict
            is passed to ``ReferenceEntry``
        """
        self.paper_id = paper_id
        self.pdf_hash = pdf_hash
        self.metadata = Metadata(**metadata)
        self.abstract = [Paragraph(**para) for para in abstract]
        self.body_text = [Paragraph(**para) for para in body_text]
        self.back_matter = [Paragraph(**para) for para in back_matter]
        self.bib_entries = [
            BibliographyEntry(
                bib_id=key,
                **{CORRECT_KEYS[k] if k in CORRECT_KEYS else k: v for k, v in bib.items() if k not in SKIP_KEYS}
            ) for key, bib in bib_entries.items()
        ]
        self.ref_entries = [
            ReferenceEntry(
                ref_id=key,
                **{CORRECT_KEYS[k] if k in CORRECT_KEYS else k: v for k, v in ref.items() if k != 'ref_id'}
            ) for key, ref in ref_entries.items()
        ]

    def _parse_json(self) -> Dict:
        """
        Build the parsed-content fields shared by as_json() and release_json()
        :return:
        """
        return {
            "abstract": [para.as_json() for para in self.abstract],
            "body_text": [para.as_json() for para in self.body_text],
            "back_matter": [para.as_json() for para in self.back_matter],
            "bib_entries": {bib.bib_id: bib.as_json() for bib in self.bib_entries},
            "ref_entries": {ref.ref_id: ref.as_json() for ref in self.ref_entries}
        }

    def as_json(self):
        """
        Serialize the paper to a plain dict
        :return:
        """
        return {
            "paper_id": self.paper_id,
            "pdf_hash": self.pdf_hash,
            "metadata": self.metadata.as_json(),
            **self._parse_json()
        }

    @property
    def raw_abstract_text(self) -> str:
        """
        Get all the abstract text joined by a newline
        :return:
        """
        return '\n'.join([para.text for para in self.abstract])

    @property
    def raw_body_text(self) -> str:
        """
        Get all the body text joined by a newline
        :return:
        """
        return '\n'.join([para.text for para in self.body_text])

    def release_json(self, doc_type: str="pdf"):
        """
        Return in release JSON format
        :param doc_type: prefix of the parse key, i.e. the parse is stored under f"{doc_type}_parse"
        :return:
        """
        # local import: only the `datetime` class is known to be in module scope
        from datetime import timezone

        # TODO: not fully implemented; metadata format is not right; extra keys in some places
        release_dict = {"paper_id": self.paper_id}
        release_dict["header"] = {
            "generated_with": f'{S2ORC_NAME_STRING} {S2ORC_VERSION_STRING}',
            # the trailing 'Z' denotes UTC, so the timestamp must be taken in UTC
            # rather than in the machine's local timezone
            "date_generated": datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%S.%fZ')
        }
        # metadata fields are flattened into the top level of the release dict
        release_dict.update(self.metadata.as_json())
        release_dict["abstract"] = self.raw_abstract_text
        release_dict[f"{doc_type}_parse"] = {
            "paper_id": self.paper_id,
            "_pdf_hash": self.pdf_hash,
            **self._parse_json()
        }
        return release_dict
471
+
472
+
473
def load_s2orc(paper_dict: Dict) -> Paper:
    """
    Load release S2ORC into Paper class

    Supported layouts:
      - 2019 gorc parses: content under "grobid_parse", metadata at the top level
      - current and 2020 s2orc release_json: content under "pdf_parse", or
        directly at the top level (detected through a non-empty "body_text")

    :param paper_dict: one S2ORC record; it is not modified
    :return:
    :raises NotImplementedError: if none of the supported layouts is detected
    """
    paper_id = paper_dict['paper_id']
    pdf_hash = paper_dict.get('_pdf_hash', paper_dict.get('s2_pdf_hash', None))

    if paper_dict.get("grobid_parse"):
        # 2019 gorc parses
        parse = paper_dict["grobid_parse"]
        raw_metadata = paper_dict.get("metadata")
    elif paper_dict.get("pdf_parse") or paper_dict.get("body_text"):
        # current and 2020 s2orc release_json; an empty/None "pdf_parse" next
        # to a populated top-level "body_text" falls back to the top level
        parse = paper_dict.get("pdf_parse") or paper_dict
        raw_metadata = parse.get("metadata")
    else:
        raise NotImplementedError(f"Unknown S2ORC file type! (paper_id: {paper_id})")

    if raw_metadata:
        metadata = {k: v for k, v in raw_metadata.items() if k in METADATA_KEYS}
    else:
        # 2020 s2orc releases (metadata is separate)
        metadata = {
            "title": None,
            "authors": [],
            "year": None
        }

    # release_json() nests the hash inside the parse dict
    if pdf_hash is None:
        pdf_hash = parse.get('_pdf_hash')

    # older parses carry a single 'link'; expose it as a one-element 'links'
    # list on a copy so the caller's dict is left untouched
    bib_entries = {
        key: ({**bib, 'links': [bib['link']]} if 'link' in bib else bib)
        for key, bib in parse.get("bib_entries", {}).items()
    }

    return Paper(
        paper_id=paper_id,
        pdf_hash=pdf_hash,
        metadata=metadata,
        abstract=parse.get("abstract", []),
        body_text=parse.get("body_text", []),
        back_matter=parse.get("back_matter", []),
        bib_entries=bib_entries,
        ref_entries=parse.get("ref_entries", {})
    )
s2orc-doc2json/doc2json/spp2json/__init__.py ADDED
File without changes
s2orc-doc2json/doc2json/spp2json/process_pdf.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import argparse
4
+ import time
5
+ from typing import Dict
6
+
7
+ from doc2json.spp2json.spp.spp_client import SppClient
8
+ from doc2json.spp2json.spp.spp_json_to_s2orc_json import convert_spp_json_to_s2orc_json
9
+
10
+
11
+
12
def process_pdf_file(input_file: str, temp_dir: str, output_dir: str) -> str:
    """
    Process a PDF file and get JSON representation
    :param input_file: path to the input PDF
    :param temp_dir: directory where the intermediate SPP JSON is expected
    :param output_dir: directory where the S2ORC JSON is written
    :return: path of the written JSON file
    :raises FileNotFoundError: if the input PDF or the intermediate SPP JSON is missing
    :raises Warning: if the output file already exists
    """
    # get paper id as the name of the file (without directory and extension)
    paper_id = os.path.splitext(os.path.basename(input_file))[0]
    spp_json_file = os.path.join(temp_dir, f'{paper_id}.json')
    output_file = os.path.join(output_dir, f'{paper_id}.json')

    # check if input file exists and output file doesn't
    if not os.path.exists(input_file):
        raise FileNotFoundError(f"{input_file} doesn't exist")
    if os.path.exists(output_file):
        raise Warning(f'{output_file} already exists!')

    # process PDF through SPP -> SPP JSON
    client = SppClient()
    # TODO: compute PDF hash
    client.process(input_file, temp_dir)

    # process SPP JSON -> S2ORC JSON
    # explicit check instead of `assert`, which is stripped under `python -O`
    if not os.path.exists(spp_json_file):
        raise FileNotFoundError(f"{spp_json_file} doesn't exist")
    with open(spp_json_file, 'r') as f_in:
        spp_json = json.load(f_in)
    paper = convert_spp_json_to_s2orc_json(spp_json=spp_json)

    # write to file
    with open(output_file, 'w') as outf:
        json.dump(paper.release_json(), outf, indent=4, sort_keys=False)

    return output_file
47
+
48
+
49
if __name__ == '__main__':
    # command line entry point: convert a single PDF into S2ORC JSON
    parser = argparse.ArgumentParser(description="Run S2ORC PDF2JSON")
    parser.add_argument(
        "-i", "--input", default=None,
        help="path to the input PDF file"
    )
    parser.add_argument(
        "-t", "--temp", default='temp/',
        help="path to the temp dir for putting tei xml files"
    )
    parser.add_argument(
        "-o", "--output", default='output/',
        help="path to the output dir for putting json files"
    )
    parser.add_argument("-k", "--keep", action='store_true')
    args = parser.parse_args()

    input_path, temp_path, output_path = args.input, args.temp, args.output
    # parsed for CLI compatibility; not consumed by process_pdf_file
    keep_temp = args.keep

    start_time = time.time()

    for directory in (temp_path, output_path):
        os.makedirs(directory, exist_ok=True)

    process_pdf_file(input_path, temp_path, output_path)

    runtime = round(time.time() - start_time, 3)
    print("runtime: %s seconds " % (runtime))
    print('done.')
s2orc-doc2json/doc2json/spp2json/spp/__init__.py ADDED
File without changes
s2orc-doc2json/doc2json/spp2json/spp/spp_client.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import io
3
+ import json
4
+ import argparse
5
+ import time
6
+ import glob
7
+ import ntpath
8
+ from typing import List
9
+
10
+
11
class SppClient:
    """
    Placeholder client for ScienceParsePlus (SPP) services; processing is not implemented
    """
    def process(self, input: str, output: str):
        """
        Process the given input and put the results at the given output location
        :param input:
        :param output:
        :raises NotImplementedError: always
        """
        raise NotImplementedError()
14
+
15
+
16
if __name__ == "__main__":
    # command line entry point: run the SPP client on one input/output pair
    parser = argparse.ArgumentParser(description="Client for ScienceParsePlus (SPP) services")
    for flag, help_text in (
            ("--input", "path to the directory containing PDF to process"),
            ("--output", "path to the directory where to put the results"),
    ):
        parser.add_argument(flag, default=None, help=help_text)
    args = parser.parse_args()

    input_path, output_path = args.input, args.output

    client = SppClient()

    start_time = time.time()
    client.process(input_path, output_path)
    runtime = round(time.time() - start_time, 3)

    print("runtime: %s seconds " % (runtime))
s2orc-doc2json/doc2json/spp2json/spp/spp_json_to_s2orc_json.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ from typing import *
2
+
3
+ from doc2json.s2orc import Paper
4
+
5
+
6
def convert_spp_json_to_s2orc_json(spp_json: Dict) -> Paper:
    """
    Convert SPP JSON into an S2ORC Paper (not implemented yet)
    :param spp_json:
    :raises NotImplementedError: always
    """
    raise NotImplementedError()
s2orc-doc2json/doc2json/tex2json/__init__.py ADDED
File without changes
s2orc-doc2json/doc2json/tex2json/process_tex.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import argparse
4
+ import time
5
+ from typing import Optional, Dict
6
+
7
+ from doc2json.tex2json.tex_to_xml import convert_latex_to_s2orc_json
8
+ from doc2json.tex2json.xml_to_json import convert_latex_xml_to_s2orc_json
9
+
10
+
11
# default working directories (relative to the current working directory),
# used as the default values of the directory parameters in this module
BASE_TEMP_DIR, BASE_OUTPUT_DIR, BASE_LOG_DIR = 'temp', 'output', 'log'
14
+
15
+
16
def process_tex_stream(
        fname: str,
        stream: bytes,
        temp_dir: str=BASE_TEMP_DIR,
        keep_flag: bool=False,
        grobid_config: Optional[Dict] = None
):
    """
    Process a gz file stream
    :param fname: file name under which the stream is stored in <temp_dir>/input
    :param stream: raw bytes of the uploaded file
    :param temp_dir: base directory for temporary files
    :param keep_flag: passed through to process_tex_file
    :param grobid_config: passed through to process_tex_file
    :return: the parsed contents of the output JSON file, or an empty list when
        no output file was produced
    """
    temp_input_dir = os.path.join(temp_dir, 'input')
    temp_input_file = os.path.join(temp_input_dir, fname)

    # creates temp_dir as well, since it is the parent of temp_input_dir
    os.makedirs(temp_input_dir, exist_ok=True)

    with open(temp_input_file, 'wb') as outf:
        outf.write(stream)

    output_file = process_tex_file(
        temp_input_file, temp_dir=temp_dir, keep_flag=keep_flag, grobid_config=grobid_config
    )

    # process_tex_file returns None when conversion fails; os.path.exists(None)
    # would raise TypeError, so test for it explicitly first
    if not output_file or not os.path.exists(output_file):
        return []

    with open(output_file, 'r') as f:
        return json.load(f)
51
+
52
+
53
def process_tex_file(
        input_file: str,
        temp_dir: str=BASE_TEMP_DIR,
        output_dir: str=BASE_OUTPUT_DIR,
        log_dir: str=BASE_LOG_DIR,
        keep_flag: bool=False,
        grobid_config: Optional[Dict]=None
) -> Optional[str]:
    """
    Convert one TEX archive into an S2ORC JSON file
    :param input_file: path to the TEX archive
    :param temp_dir: directory for intermediate files
    :param output_dir: directory receiving <paper_id>.json
    :param log_dir: directory for log files
    :param keep_flag: when False, intermediate files are cleaned up by the LaTeX conversion step
    :param grobid_config: forwarded to the XML -> S2ORC conversion
    :return: path of the written JSON file, or None when the LaTeX conversion yields no XML
    :raises FileNotFoundError: if input_file does not exist
    """
    # make sure every working directory is there
    for directory in (temp_dir, output_dir, log_dir):
        os.makedirs(directory, exist_ok=True)

    # the paper id is the file name without its (last) extension
    input_stem, _ = os.path.splitext(input_file)
    paper_id = input_stem.rpartition('/')[2]
    output_file = os.path.join(output_dir, f'{paper_id}.json')

    # the input must exist; an existing output is only reported (it gets overwritten below)
    if not os.path.exists(input_file):
        raise FileNotFoundError(f"{input_file} doesn't exist")
    if os.path.exists(output_file):
        print(f'{output_file} already exists!')

    # LaTeX -> XML; clean up intermediates unless asked to keep them
    xml_file = convert_latex_to_s2orc_json(input_file, temp_dir, not keep_flag)
    if not xml_file:
        return None

    # XML -> S2ORC paper -> JSON on disk
    paper = convert_latex_xml_to_s2orc_json(xml_file, log_dir, grobid_config=grobid_config)
    with open(output_file, 'w') as outf:
        json.dump(paper.release_json("latex"), outf, indent=4, sort_keys=False)

    return output_file
100
+
101
+
102
if __name__ == '__main__':
    def _parse_keep(value: str) -> bool:
        """Interpret an explicit -k value; only common 'false' spellings disable keeping."""
        return value.strip().lower() not in {'', '0', 'false', 'no', 'n', 'off'}

    parser = argparse.ArgumentParser(description="Run S2ORC TEX2JSON")
    parser.add_argument("-i", "--input", default=None, help="path to the input TEX zip file")
    parser.add_argument("-t", "--temp", default='temp', help="path to a temp dir for partial files")
    parser.add_argument("-o", "--output", default='output', help="path to the output dir for putting json files")
    parser.add_argument("-l", "--log", default='log', help="path to the log dir")
    # Usable as a bare switch (`-k`) and, for backward compatibility, with a
    # value (`-k true`). Without `nargs='?'` the option demanded a value, and
    # any non-empty string - including "False" - counted as true.
    parser.add_argument(
        "-k", "--keep", nargs='?', const=True, default=False, type=_parse_keep,
        help="keep temporary files"
    )

    args = parser.parse_args()

    input_path = args.input
    temp_path = args.temp
    output_path = args.output
    log_path = args.log
    keep_temp = args.keep

    start_time = time.time()

    os.makedirs(temp_path, exist_ok=True)
    os.makedirs(output_path, exist_ok=True)

    process_tex_file(input_path, temp_path, output_path, log_path, keep_temp)

    runtime = round(time.time() - start_time, 3)
    print("runtime: %s seconds " % (runtime))
    print('done.')
s2orc-doc2json/doc2json/tex2json/tex_to_xml.py ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Process all the files in a LaTeX zip file to extract paper content
3
+
4
+ 1. Unzips LaTeX ZIP file
5
+ 2. Identifies primary TEX file
6
+ 3. Expands other TEX files into main TEX file using latexpand
7
+ 4. Expands BBL file into main TEX file
8
+ 5. Convert TEX file into XML using tralics
9
+ 6. Extract content of XML into S2ORC JSON
10
+
11
+ """
12
+
13
+ import os
14
+ import gzip
15
+ import tarfile
16
+ import zipfile
17
+ import shutil
18
+ from typing import Optional
19
+
20
+ from doc2json.utils.latex_util import normalize, latex_to_xml
21
+
22
+
23
def _is_gzip_file(fpath):
    """
    Check whether a file starts with the two gzip magic bytes
    :param fpath: path of the file to inspect
    :return: True if the first two bytes are 0x1f 0x8b
    """
    gzip_magic = b'\x1f\x8b'
    with open(fpath, 'rb') as handle:
        header = handle.read(len(gzip_magic))
    return header == gzip_magic
26
+
27
+
28
def extract_latex(zip_file: str, latex_dir: str, cleanup=True):
    """
    Unzip latex zip into temp directory
    :param zip_file: path to a .gz, .zip or .tar file
    :param latex_dir: directory under which <file_id>/ is created
    :param cleanup: when true, the input archive is deleted after extraction
    :return: the directory holding the extracted files, or None when the file
        is neither a tar, gzip nor zip archive
    """
    assert os.path.exists(zip_file)
    assert zip_file.endswith(('.gz', '.zip', '.tar'))

    # get name of zip file
    file_id = os.path.splitext(zip_file)[0].split('/')[-1]

    tar_dir = os.path.join(latex_dir, file_id)
    os.makedirs(tar_dir, exist_ok=True)

    # NOTE(review): archives may come from untrusted sources; extractall() is
    # called with the interpreter's default extraction behaviour. Consider
    # passing an extraction filter where the running Python supports it.

    # check if tar file -> untar
    if tarfile.is_tarfile(zip_file):
        with tarfile.open(zip_file) as tar:
            tar.extractall(tar_dir)
    # check if gzip file -> un-gz and/or untar
    elif _is_gzip_file(zip_file):
        tar_file = os.path.join(latex_dir, f'{file_id}.tar')
        # stream the decompressed data instead of holding it all in memory
        with gzip.open(zip_file, 'rb') as in_f, open(tar_file, 'wb') as out_f:
            shutil.copyfileobj(in_f, out_f)
        if tarfile.is_tarfile(tar_file):
            with tarfile.open(tar_file) as tar:
                tar.extractall(tar_dir)
            os.remove(tar_file)
        else:
            # a gzipped single file: treat the payload as the tex source
            os.replace(tar_file, os.path.join(tar_dir, f'{file_id}.tex'))
    # check if zip file -> unzip
    elif zipfile.is_zipfile(zip_file):
        with zipfile.ZipFile(zip_file, 'r') as in_f:
            in_f.extractall(tar_dir)
    else:
        return None

    # clean up if needed
    if cleanup:
        os.remove(zip_file)

    # returns directory
    return tar_dir
79
+
80
+
81
def normalize_latex(latex_dir: str, norm_dir: str, norm_log_file: str, cleanup=True) -> Optional[str]:
    """
    Run latex normalization on one extracted latex directory
    :param latex_dir: directory with the extracted latex files; its last path
        component is used as the file id
    :param norm_dir: base directory; output goes to <norm_dir>/<file_id>
    :param norm_log_file: file to which the file id is appended when normalization raises TypeError
    :param cleanup: when true, latex_dir is deleted afterwards
    :return: the output folder path, or None when the file id is 'skipped'.
        Note that after a TypeError the output folder has been removed again,
        yet its path is still returned.
    """
    file_id = latex_dir.strip('/').rpartition('/')[2]
    if file_id == 'skipped':
        return None

    target_folder = os.path.join(norm_dir, file_id)
    os.makedirs(target_folder, exist_ok=True)

    try:
        normalize(latex_dir, target_folder)
    except TypeError:
        # discard the partial output and record the failing id
        shutil.rmtree(target_folder)
        with open(norm_log_file, 'a+') as failures:
            failures.write(f'{file_id}\n')

    # delete latex directory if cleanup
    if cleanup:
        shutil.rmtree(latex_dir)

    return target_folder
108
+
109
+
110
def norm_latex_to_xml(norm_dir: str, xml_dir: str, xml_err_file: str, xml_log_file: str, cleanup=True) -> Optional[str]:
    """
    Convert LaTeX to XML using tralics
    :param norm_dir: directory with the normalized latex; expected to contain <file_id>.tex
    :param xml_dir: base directory; output goes to <xml_dir>/<file_id>/<file_id>.xml
    :param xml_err_file: error file passed to latex_to_xml
    :param xml_log_file: log file passed to latex_to_xml
    :param cleanup: when true, norm_dir is deleted afterwards
    :return: path of the XML file, or None when norm_dir is missing or no XML was produced
    """
    # An earlier stage may hand over None or a directory it has already
    # removed after a failure; there is nothing to convert (or clean up) then.
    if not norm_dir or not os.path.isdir(norm_dir):
        return None

    file_id = norm_dir.strip('/').split('/')[-1]
    norm_tex_file = os.path.join(norm_dir, f'{file_id}.tex')
    xml_output_dir = os.path.join(xml_dir, file_id)
    xml_file = os.path.join(xml_output_dir, f'{file_id}.xml')
    os.makedirs(xml_output_dir, exist_ok=True)

    latex_to_xml(
        tex_file=norm_tex_file,
        out_dir=xml_output_dir,
        out_file=xml_file,
        err_file=xml_err_file,
        log_file=xml_log_file
    )

    # delete norm directory if cleanup
    if cleanup:
        shutil.rmtree(norm_dir)

    if os.path.exists(xml_file):
        return xml_file
    return None
140
+
141
+
142
def convert_latex_to_xml(
        zip_file: str, latex_dir: str, norm_dir: str, xml_dir: str, log_dir: str, cleanup=True
) -> Optional[str]:
    """
    Run expansion, normalization, xml conversion on latex
    :param zip_file: path to the latex archive
    :param latex_dir: directory for the extracted latex
    :param norm_dir: directory for the normalized latex
    :param xml_dir: directory for the XML output
    :param log_dir: directory receiving norm_error.log, xml_error.log and xml_skip.log
    :param cleanup: passed to every stage
    :return: path of the XML file, or None when any stage produced nothing
    """
    # extract zip file; None means the archive format was not recognized
    latex_output_dir = extract_latex(zip_file, latex_dir, cleanup)
    if not latex_output_dir:
        return None

    # normalize latex; None means the entry was skipped
    norm_log_file = os.path.join(log_dir, 'norm_error.log')
    norm_output_dir = normalize_latex(latex_output_dir, norm_dir, norm_log_file, cleanup)
    if not norm_output_dir:
        return None

    # convert to xml
    xml_error_file = os.path.join(log_dir, 'xml_error.log')
    xml_log_file = os.path.join(log_dir, 'xml_skip.log')
    return norm_latex_to_xml(norm_output_dir, xml_dir, xml_error_file, xml_log_file, cleanup)
168
+
169
+
170
def convert_latex_to_s2orc_json(
        latex_zip: str,
        base_temp_dir: str,
        cleanup_after: bool=True
) -> Optional[str]:
    """
    Convert a LaTeX zip file to XML, the intermediate step towards S2ORC JSON

    Despite its name, this function stops at the XML stage: it returns whatever
    convert_latex_to_xml returns.

    :param latex_zip: path to the LaTeX archive
    :param base_temp_dir: base directory; the 'latex', 'norm', 'xml' and 'log'
        working directories are created below it
    :param cleanup_after: passed on as the cleanup flag of the conversion
    :return: path of the XML file, or None when the conversion produced none
    :raises FileNotFoundError: if latex_zip does not exist
    """
    if not os.path.exists(latex_zip):
        raise FileNotFoundError("Input LaTeX ZIP file doesn't exist")

    # temp directories (makedirs also creates base_temp_dir itself)
    latex_expand_dir = os.path.join(base_temp_dir, 'latex')
    latex_norm_dir = os.path.join(base_temp_dir, 'norm')
    latex_xml_dir = os.path.join(base_temp_dir, 'xml')
    latex_log_dir = os.path.join(base_temp_dir, 'log')
    for directory in (latex_expand_dir, latex_norm_dir, latex_xml_dir, latex_log_dir):
        os.makedirs(directory, exist_ok=True)

    # convert to XML
    return convert_latex_to_xml(
        latex_zip, latex_expand_dir, latex_norm_dir, latex_xml_dir, latex_log_dir, cleanup_after
    )
s2orc-doc2json/doc2json/tex2json/xml_to_json.py ADDED
@@ -0,0 +1,1396 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import itertools
4
+ import bs4
5
+ from bs4 import BeautifulSoup, NavigableString
6
+ from typing import List, Dict, Tuple, Optional
7
+ import copy
8
+ import latex2mathml.converter
9
+
10
+ from doc2json.grobid2json.grobid.grobid_client import GrobidClient
11
+ from doc2json.utils.grobid_util import parse_bib_entry, get_author_data_from_grobid_xml
12
+ from doc2json.s2orc import Paper, Paragraph
13
+
14
+
15
# names of layout-only tags (page breaks, color pools, table of contents)
SKIP_TAGS = {'clearpage', 'colorpool', 'newpage', 'tableofcontents'}

# names of tags carrying paragraph-like text
TEXT_TAGS = {'p', 'proof', 'caption'}
27
+
28
+
29
def normalize_latex_id(latex_id: str):
    """
    Normalize a latex id: upper-case it, drop underscores and map the
    BID / CID / FORMULA prefixes to BIBREF / SECREF / EQREF
    :param latex_id:
    :return: the normalized id string
    """
    canonical = latex_id.upper().replace('_', '')
    prefix_map = (
        ('BID', 'BIBREF'),
        ('CID', 'SECREF'),
        ('FORMULA', 'EQREF'),
    )
    for prefix, substitute in prefix_map:
        if canonical.startswith(prefix):
            # every occurrence of the prefix text is substituted
            return canonical.replace(prefix, substitute)
    return canonical
38
+
39
+
40
def process_author(
        author_text: str,
        grobid_client: GrobidClient,
        logfile: str
) -> List[Dict]:
    """
    Parse an author string into author entries with the help of Grobid
    :param author_text: raw author text
    :param grobid_client: client whose process_header_names is called with the text
    :param logfile: log file passed to the client
    :return: the entries extracted from the Grobid XML; when the text is empty
        or Grobid returns nothing, a single entry holding the whole text as last name
    """
    author_xml_str = None
    if author_text:
        author_xml_str = grobid_client.process_header_names(author_text, logfile)

    if author_xml_str:
        return get_author_data_from_grobid_xml(BeautifulSoup(author_xml_str, 'xml'))

    # fallback: keep the unparsed text as the last name
    fallback_author = {
        "first": "",
        "middle": [],
        "last": author_text,
        "suffix": "",
        "affiliation": {},
        "email": ""
    }
    return [fallback_author]
67
+
68
+
69
def process_bibentry(bib_text: str, grobid_client: GrobidClient, logfile: str):
    """
    Process one bib entry text into title, authors, etc
    :param bib_text: raw text of one bibliography entry
    :param grobid_client: client whose process_citation is called with the cleaned text
    :param logfile: log file passed to the client
    :return: the parsed bib entry, or None when the text is empty or Grobid returns nothing
    """
    if not bib_text:
        return None

    # per line: collapse whitespace runs to single spaces and trim, then join the lines with spaces
    cleaned_lines = [
        re.sub(r'\s+', ' ', raw_line).strip()
        for raw_line in bib_text.split('\n')
    ]
    bib_string = ' '.join(cleaned_lines)

    xml_str = grobid_client.process_citation(bib_string, logfile)
    if not xml_str:
        return None

    bib_entry = parse_bib_entry(BeautifulSoup(xml_str, 'lxml'))
    # fall back to the cleaned input when the parse carries no raw text
    if not bib_entry['raw_text']:
        bib_entry['raw_text'] = bib_string
    return bib_entry
91
+
92
+
93
def replace_ref_tokens(sp: BeautifulSoup, el: bs4.element.Tag, ref_map: Dict):
    """
    Replace all references in element with special tokens
    :param sp: soup used to create the replacement strings
    :param el: element whose 'cit' and 'ref' descendants are replaced in place
    :param ref_map: reference map; a 'uid' target is resolved to the first of
        FIGREF / TABREF / EQREF / FOOTREF / SECREFU whose substituted id is a key of this map
    :return: the same element
    """
    # citations: swap each <cit> for its BIBREF token
    for cite_tag in el.find_all('cit'):
        try:
            bib_token = cite_tag.ref.get('target').replace('bid', 'BIBREF')
            cite_tag.replace_with(sp.new_string(f" {bib_token} "))
        except AttributeError:
            print('Attribute error: ', cite_tag)
            continue

    # remaining (non citation) references
    uid_kinds = ('FIGREF', 'TABREF', 'EQREF', 'FOOTREF', 'SECREFU')
    for ref_tag in el.find_all('ref'):
        try:
            raw_target = ref_tag.get('target')
            # refs without target and bibliography refs are left as they are
            if not raw_target or raw_target.startswith('bid'):
                continue

            if raw_target.startswith('cid'):
                token = raw_target.replace('cid', 'SECREF')
            elif raw_target.startswith('uid'):
                # first known kind wins; unknown uids are just upper-cased
                token = raw_target.upper()
                for kind in uid_kinds:
                    candidate = raw_target.replace('uid', kind)
                    if candidate in ref_map:
                        token = candidate
                        break
            else:
                print('Weird ID!')
                token = raw_target.upper()

            ref_tag.replace_with(sp.new_string(f" {token} "))
        except AttributeError:
            print('Attribute error: ', ref_tag)
            continue

    return el
138
+
139
+
140
def _shift_spans(spans, offset: int):
    """Return copies of the span dicts with 'start' and 'end' moved by offset; all other fields are kept."""
    return [
        {**span, "start": span['start'] + offset, "end": span['end'] + offset}
        for span in (spans or [])
    ]


def process_list_el(sp: BeautifulSoup, list_el: bs4.element.Tag, section_info: List, bib_map: Dict, ref_map: Dict):
    """
    Process list element
    :param sp: soup the element belongs to
    :param list_el: list element whose 'item' descendants are processed
    :param section_info: section info passed to process_paragraph
    :param bib_map: bibliography map passed to process_paragraph
    :param ref_map: reference map passed to process_paragraph
    :return: list of Paragraph objects, one per kept item
    """
    # TODO: currently parsing list as a list of paragraphs (append numbers to start of each entry in ordered lists)
    list_items = []
    for item in list_el.find_all('item'):
        # skip itemize settings
        item_text = item.text.strip()
        if item_text.startswith('[') and item_text.endswith(']'):
            continue
        # try processing as paragraph
        list_num = item.get('id-text', None)
        item_as_para = process_paragraph(sp, item, section_info, bib_map, ref_map)
        # append list number if ordered
        if list_num:
            list_num_str = f'{list_num}. '
            # The number is prepended to the text, so every span moves right
            # by its length. Spans are copied whole so that fields beyond
            # start/end/text (e.g. ref_id) survive the shift.
            offset = len(list_num_str)
            new_para = Paragraph(
                text=list_num_str + item_as_para.text,
                cite_spans=_shift_spans(item_as_para.cite_spans, offset),
                ref_spans=_shift_spans(item_as_para.ref_spans, offset),
                eq_spans=_shift_spans(item_as_para.eq_spans, offset),
                section=item_as_para.section
            )
        else:
            new_para = item_as_para
        list_items.append(new_para)
    return list_items
199
+
200
+
201
def process_navstring(str_el: NavigableString, section_info: List):
    """
    Process one NavigableString
    :param str_el: the string; whitespace runs are collapsed to single spaces
    :param section_info: section info stored on the resulting paragraph
    :return: Paragraph with cite spans for BIBREF tokens, ref spans for
        FIGREF / TABREF / EQREF / FOOTREF / SECREF / SECREFU tokens and no equation spans
    """
    # substitute space characters
    text = re.sub(r'\s+', ' ', str_el)

    def token_spans(pattern):
        # one span per token occurrence, in order of appearance
        return [
            {"start": match.start(), "end": match.end(), "ref_id": match.group()}
            for match in re.finditer(pattern, text)
        ]

    # get all cite spans
    all_cite_spans = token_spans(r'(BIBREF\d+)')

    # get all ref spans, grouped by token kind in this fixed order
    all_ref_spans = []
    for ref_pattern in (
            r'(FIGREF\d+)',
            r'(TABREF\d+)',
            r'(EQREF\d+)',
            r'(FOOTREF\d+)',
            r'(SECREF\d+)',
            r'(SECREFU\d+)',
    ):
        all_ref_spans.extend(token_spans(ref_pattern))

    # assert all align
    for span in all_cite_spans + all_ref_spans:
        assert text[span['start']:span['end']] == span['ref_id']

    return Paragraph(
        text=text,
        cite_spans=all_cite_spans,
        ref_spans=all_ref_spans,
        eq_spans=[],
        section=section_info
    )
253
+
254
+
255
def process_paragraph(sp: BeautifulSoup, para_el: bs4.element.Tag, section_info: List, bib_map: Dict, ref_map: Dict):
    """
    Convert one paragraph-like element into a Paragraph
    :param sp: soup used to create replacement strings
    :param para_el: element to convert (modified in place)
    :param section_info: stored unchanged as the paragraph's ``section``
    :param bib_map: BIBREF token -> bib entry; supplies ``num`` for cite spans
    :param ref_map: ref token -> entry; supplies ``num`` for ref spans
    :return: Paragraph with text plus cite, ref and equation spans
    """
    # replace all ref tokens with special tokens
    para_el = replace_ref_tokens(sp, para_el, ref_map)

    # swap every formula for an INLINEFORM<n> / DISPLAYFORM<n> token and keep its content
    formulas = {}
    n_inline = 0
    n_display = 0
    for formula in para_el.find_all('formula'):
        try:
            if formula.get('id'):
                # a formula that carries an id is treated as a display formula
                token = f'DISPLAYFORM{n_display}'
                eq_ref = formula.get('id').replace('uid', 'EQREF')
                n_display += 1
            else:
                token = f'INLINEFORM{n_inline}'
                eq_ref = None
                n_inline += 1
            try:
                mathml = latex2mathml.converter.convert(formula.texmath.text)
            except Exception:
                # conversion is best effort; an empty string marks "no MathML"
                mathml = ""
            formulas[token] = (formula.math.text, formula.texmath.text, mathml, eq_ref)
            formula.replace_with(sp.new_string(f" {token} "))
        except AttributeError:
            # formula without the expected <math>/<texmath> children: leave it alone
            continue

    # floats and notes are not expected here any more; drop any that remain
    for leftover in para_el.find_all('float'):
        print('Warning: still has <float/>!')
        leftover.decompose()
    for leftover in para_el.find_all('note'):
        print('Warning: still has <note/>!')
        leftover.decompose()

    # substitute space characters
    text = re.sub(r'\s', ' ', re.sub(r'\s+', ' ', para_el.text))

    def _hits(patterns):
        # all matches, grouped pattern by pattern
        return [hit for pattern in patterns for hit in re.finditer(pattern, text)]

    cite_spans = [
        {
            "start": hit.start(),
            "end": hit.start() + len(hit.group()),
            "text": bib_map[hit.group()]['num'] if hit.group() in bib_map else None,
            "ref_id": hit.group()
        }
        for hit in _hits([r'(BIBREF\d+)'])
    ]

    ref_spans = [
        {
            "start": hit.start(),
            "end": hit.start() + len(hit.group()),
            "text": ref_map[hit.group()]['num'] if hit.group() in ref_map else None,
            "ref_id": hit.group()
        }
        for hit in _hits([
            r'(FIGREF\d+)',
            r'(TABREF\d+)',
            r'(EQREF\d+)',
            r'(FOOTREF\d+)',
            r'(SECREF\d+)',
            r'(SECREFU\d+)',
        ])
    ]

    eq_spans = []
    for hit in _hits([r'(INLINEFORM\d+)', r'(DISPLAYFORM\d+)']):
        stored = formulas.get(hit.group())
        if stored is None:
            # token text that was not produced by the substitution above
            continue
        eq_spans.append({
            "start": hit.start(),
            "end": hit.start() + len(hit.group()),
            "text": stored[0],
            "latex": stored[1],
            "mathml": stored[2],
            "ref_id": hit.group()
        })

    # every cite/ref span must slice back to its own token
    for found in cite_spans:
        assert text[found['start']:found['end']] == found['ref_id']
    for found in ref_spans:
        assert text[found['start']:found['end']] == found['ref_id']

    return Paragraph(
        text=text,
        cite_spans=cite_spans,
        ref_spans=ref_spans,
        eq_spans=eq_spans,
        section=section_info
    )
366
+
367
+
368
def decompose_tags_before_title(sp: BeautifulSoup):
    """
    Decompose all tags that precede the title inside the document root
    :param sp: soup to clean in place
    :return: None

    The root is the ``<std>`` or ``<unknown>`` element, chosen by the name of
    the first element after ``<body>``. Nothing is removed unless the root has
    a direct ``<maketitle>`` or ``<title>`` child; removal stops at that child.
    """
    root_name = sp.body.next.name
    if root_name == 'std':
        root = sp.std
    elif root_name == 'unknown':
        # the original iterated sp.std in this branch, which is the wrong root
        root = sp.unknown
    else:
        print(f"Unknown inner tag: {root_name}")
        return

    title_names = ('maketitle', 'title')
    if not any(tag.name in title_names for tag in root.find_all(recursive=False)):
        return

    # iterate over a snapshot: decomposing a child while walking the live
    # child list makes the iteration skip siblings
    for tag in list(root.children):
        if not isinstance(tag, bs4.element.Tag):
            continue
        if tag.name in title_names:
            break
        tag.decompose()
    return
395
+
396
+
397
_NAME_SUFFIXES = {"jr", "jr.", "iii", "iv", "v"}


def _split_author_name(name_parts: List) -> Dict:
    """Build an author entry from the whitespace-separated tokens of one name."""
    parts = list(name_parts)
    suffix = ""
    # peel off a trailing suffix so it is reported as the suffix instead of
    # shifting the last name into the middle names
    if len(parts) > 1 and parts[-1].lower() in _NAME_SUFFIXES:
        suffix = parts.pop()
    return {
        "first": parts[0] if len(parts) > 1 else "",
        "last": parts[-1],
        "middle": parts[1:-1],
        "suffix": suffix,
        "affiliation": {},
        "email": ""
    }


def process_metadata(sp: BeautifulSoup, grobid_client: GrobidClient, log_file: str) -> Tuple[str, List]:
    """
    Process metadata section in soup
    :param sp: soup to read; the ``<maketitle>`` / ``<metadata>`` element that
        was used is decomposed afterwards
    :param grobid_client: passed through to ``process_author``
    :param log_file: passed through to ``process_author``
    :return: ``(title, authors)``; title defaults to "" and authors to []
    """
    title = ""
    authors = []

    if not sp.maketitle and not sp.metadata:
        # no dedicated metadata block: fall back to a bare <title>, if present
        if sp.title:
            title = sp.title.text
        return title, authors

    if sp.maketitle:
        try:
            # process title
            title = sp.maketitle.title.text
            for formula in sp.author.find_all('formula'):
                formula.decompose()
            # process authors
            author_parts = []
            for tag in sp.author:
                if type(tag) is NavigableString:
                    author_parts.append(tag.strip())
                else:
                    author_parts.append(tag.text.strip())
            author_parts = [re.sub(r'\s+', ' ', line).strip() for line in author_parts]
            author_parts = [part for part in author_parts if part]
            authors = process_author(', '.join(author_parts), grobid_client, log_file)
        except AttributeError:
            # best effort: keep whatever was extracted before the missing element
            pass
        sp.maketitle.decompose()
        return title, authors

    # only <metadata> is present
    try:
        # process title and authors from metadata
        title = sp.metadata.title.text
        author_container = sp.authors
        if author_container is not None:
            for author in author_container:
                if not isinstance(author, bs4.element.Tag):
                    # text between the author elements carries no name
                    continue
                # drop nested tags but keep the author's own text; iterate a
                # snapshot because decompose() edits the child list
                for subtag in list(author.children):
                    if isinstance(subtag, bs4.element.Tag):
                        subtag.decompose()
                name_parts = author.text.strip().split()
                if name_parts:
                    authors.append(_split_author_name(name_parts))
    except AttributeError:
        # best effort: keep whatever was extracted before the missing element
        pass
    sp.metadata.decompose()
    return title, authors
462
+
463
+
464
def process_bibliography_from_tex(sp: BeautifulSoup, client, log_file) -> Dict:
    """
    Parse bibliography from latex
    :param sp: soup to read; every ``<bibliography>`` is decomposed at the end
    :param client: passed through to ``process_bibentry``
    :param log_file: passed through to ``process_bibentry``
    :return: map from bib key to the entry returned by ``process_bibentry``,
        extended with ``urls`` and ``num`` (and ``ref_id`` for ``<bibitem>`` entries)
    """
    bibkey_map = dict()
    # replace Bibliography with bibliography if needed
    for bibl in sp.find_all("Bibliography"):
        bibl.name = 'bibliography'
    # construct bib map
    for bibliography in sp.find_all('bibliography'):
        bib_items = bibliography.find_all('bibitem')
        if bib_items:
            # map all bib entries
            for bi_num, bi in enumerate(bib_items):
                try:
                    if not bi.get('id'):
                        continue
                    # get bib entry text and process it
                    bib_par = bi.find_parent('p')
                    if bib_par.text:
                        bib_entry = process_bibentry(bib_par.text, client, log_file)
                    else:
                        # empty <p>: the entry text may sit in the following <p>
                        next_tag = bib_par.find_next('p')
                        if not next_tag.find('bibitem') and next_tag.text:
                            bib_entry = process_bibentry(next_tag.text, client, log_file)
                        else:
                            bib_entry = None
                    # if processed successfully, add to map
                    if bib_entry:
                        bib_entry['urls'] = [xref.get('url') for xref in bib_par.find_all('xref')]
                        # map to ref id
                        ref_id = normalize_latex_id(bi.get('id'))
                        bib_entry['ref_id'] = ref_id
                        bib_entry['num'] = bi_num
                        bibkey_map[ref_id] = bib_entry
                except AttributeError:
                    print('Attribute error in bib item!', bi)
                    continue
                except TypeError:
                    print('Type error in bib item!', bi)
                    continue
        else:
            # no <bibitem> tags: treat each <p> of THIS bibliography as an entry
            # (the original read sp.bibliography, i.e. always the first one)
            for bi_num, p in enumerate(bibliography.find_all('p')):
                try:
                    bib_key, bib_entry = None, None
                    bib_text = p.text
                    bib_name = re.match(r'\[(.*?)\](.*)', bib_text)
                    if bib_name:
                        # "[key] text" form: flatten whitespace first so the
                        # second group can span what used to be several lines
                        bib_text = re.sub(r'\s', ' ', bib_text)
                        bib_name = re.match(r'\[(.*?)\](.*)', bib_text)
                        if bib_name:
                            bib_key = bib_name.group(1)
                            bib_entry = process_bibentry(bib_name.group(2), client, log_file)
                    else:
                        # otherwise the first line is the key and the rest is the entry
                        bib_lines = bib_text.split('\n')
                        bib_key = re.sub(r'\s', ' ', bib_lines[0])
                        bib_text = re.sub(r'\s', ' ', ' '.join(bib_lines[1:]))
                        bib_entry = process_bibentry(bib_text, client, log_file)
                    if bib_key and bib_entry:
                        bib_entry['urls'] = [xref.get('url') for xref in p.find_all('xref')]
                        bib_entry['num'] = bi_num
                        # map to bib id
                        bibkey_map[bib_key] = bib_entry
                except AttributeError:
                    print('Attribute error in bib item!', p)
                    continue
                except TypeError:
                    print('Type error in bib item!', p)
                    continue
    for bibliography in sp.find_all('bibliography'):
        bibliography.decompose()
    return bibkey_map
545
+
546
+
547
def get_section_name(sec):
    """
    Get section name from div tag
    :param sec: section element
    :return: text of ``<head>`` when present; otherwise the leading short
        (under 50 characters) non-``<p>`` pieces joined with spaces
    """
    if sec.head:
        return sec.head.text

    pieces = []
    for child in sec:
        if type(child) is NavigableString:
            piece = child.strip()
        elif child.name != 'p':
            piece = child.text.strip()
        else:
            # first paragraph ends the heading material
            break
        if len(piece) >= 50:
            # too long to be part of a heading
            break
        pieces.append(piece)
    return ' '.join(pieces).strip()
572
+
573
+
574
def _to_section_ref_id(raw_id: str) -> str:
    """Map a source id containing ``cid`` / ``uid`` onto its SECREF / SECREFU token."""
    if 'cid' in raw_id:
        return raw_id.replace('cid', 'SECREF')
    if 'uid' in raw_id:
        return raw_id.replace('uid', 'SECREFU')
    print('Unknown ID type!', raw_id)
    raise NotImplementedError


def get_sections_from_div(el: bs4.element.Tag, sp: BeautifulSoup, parent: Optional[str], faux_max: int) -> Dict:
    """
    Process section headers for one div
    :param el: div element; it and any id-carrying ``<p>``/``<proof>`` child
        get an ``s2orc_id`` attribute
    :param sp: soup, passed through to recursive calls
    :param parent: ref id of the enclosing section, or None
    :param faux_max: number used to mint a ``SECREF<n>`` id for a
        ``rend="nonumber"`` div that has no id
    :return: map from section ref id to ``{num, text, ref_id, parent}``
    :raises NotImplementedError: for an id containing neither ``cid`` nor ``uid``
    """
    sec_map_dict = dict()
    el_ref_id = None

    # process divs with ids
    if el.get('id', None):
        el_ref_id = _to_section_ref_id(el.get('id'))
        el['s2orc_id'] = el_ref_id
        sec_map_dict[el_ref_id] = {
            "num": el.get('id-text', None),
            "text": get_section_name(el),
            "ref_id": el_ref_id,
            "parent": parent
        }
    # process divs without section numbers
    elif el.get('rend') == "nonumber":
        el_ref_id = f'SECREF{faux_max}'
        el['s2orc_id'] = el_ref_id
        sec_map_dict[el_ref_id] = {
            "num": None,
            "text": get_section_name(el),
            "ref_id": el_ref_id,
            "parent": parent
        }

    own_or_parent = el_ref_id if el_ref_id else parent
    prefix = 'SECREF'

    # process sub elements
    for sub_el in el.find_all(recursive=False):
        if sub_el.name.startswith('div'):
            # next free faux number: one past the largest purely numeric SECREF seen
            sec_keys = [
                int(k[len(prefix):]) for k in sec_map_dict
                if k and k.startswith(prefix) and k[len(prefix):].isdigit()
            ]
            faux_max = max(sec_keys + [faux_max]) + 1
            sec_map_dict.update(
                get_sections_from_div(sub_el, sp, own_or_parent, faux_max)
            )
        elif sub_el.name == 'p' or sub_el.name == 'proof':
            if sub_el.get('id', None):
                # prefer the element's own number; only consult <hi> when it
                # exists (the original dereferenced it unconditionally)
                sec_num = sub_el.get('id-text', None)
                if sec_num is None and sub_el.hi:
                    sec_num = sub_el.hi.get('id-text', None)
                sub_el_ref_id = _to_section_ref_id(sub_el.get('id'))
                sub_el['s2orc_id'] = sub_el_ref_id
                # key by the paragraph's own id; keying by el_ref_id (as the
                # original did) overwrote the enclosing div's entry
                sec_map_dict[sub_el_ref_id] = {
                    "num": sec_num,
                    "text": sub_el.head.text if sub_el.head else sub_el.hi.text if sub_el.hi else "",
                    "ref_id": sub_el_ref_id,
                    "parent": own_or_parent
                }
    return sec_map_dict
639
+
640
+
641
def process_sections_from_text(sp: BeautifulSoup) -> Dict:
    """
    Generate section dict and replace with id tokens
    :param sp: soup whose ``<div0>`` elements are walked
    :return: merged section map of all ``<div0>`` elements
    """
    sections = {}
    # faux ids for unnumbered sections start at 1000
    highest = 999

    for top_div in sp.find_all('div0'):
        sections.update(get_sections_from_div(top_div, sp, None, highest + 1))
        # move past the largest numeric SECREF id handed out so far
        numeric = [
            int(key.strip('SECREF'))
            for key in sections
            if key and key.strip('SECREF').isdigit()
        ]
        highest = max(numeric + [highest]) + 1

    return sections
659
+
660
+
661
def process_equations_from_tex(sp: BeautifulSoup) -> Dict:
    """
    Generate equation dict and replace with id tokens
    :param sp: soup to read and modify in place
    :return: map from EQREF id to ``{num, text, mathml, latex, ref_id}`` for
        display formulas that carry an id
    """
    equations = {}

    for formula in sp.find_all('formula'):
        try:
            if formula.get('type', None) != 'display':
                continue

            if formula.get('id', None):
                ref_id = formula.get('id').replace('uid', 'EQREF')
                try:
                    mathml = latex2mathml.converter.convert(formula.texmath.text.strip())
                except Exception:
                    # conversion is best effort; an empty string marks "no MathML"
                    mathml = ""
                equations[ref_id] = {
                    "num": formula.get('id-text', None),
                    "text": formula.math.text.strip(),
                    "mathml": mathml,
                    "latex": formula.texmath.text.strip(),
                    "ref_id": ref_id
                }

            # NOTE(review): assumes every display formula (with or without an
            # id) is re-wrapped; confirm against the original indentation
            # replace with <p> containing equation as inline
            wrapper = sp.new_tag('p')
            inline_copy = copy.copy(formula)
            inline_copy['type'] = 'inline'
            wrapper.insert(0, inline_copy)
            formula.replace_with(wrapper)

        except AttributeError:
            continue

    return equations
697
+
698
+
699
def process_footnotes_from_text(sp: BeautifulSoup) -> Dict:
    """
    Process footnote marks
    :param sp: soup to read; each id-carrying ``<note>`` is replaced by its
        FOOTREF token
    :return: map from FOOTREF id to ``{num, text, ref_id}``
    """
    footnotes = {}

    for note in sp.find_all('note'):
        try:
            if not (note.name and note.get('id')):
                continue
            # normalize footnote id
            ref_id = note.get('id').replace('uid', 'FOOTREF')
            # remove equation tex
            for tex in note.find_all('texmath'):
                tex.decompose()
            # replace all xrefs with link
            for link in note.find_all('xref'):
                link.replace_with(sp.new_string(f" {link.get('url')} "))
            # clean footnote text
            cleaned = None
            if note.text:
                cleaned = re.sub(r'\s', ' ', re.sub(r'\s+', ' ', note.text.strip()))
            # form footnote entry
            footnotes[ref_id] = {
                "num": note.get('id-text', None),
                "text": cleaned,
                "ref_id": ref_id
            }
            note.replace_with(sp.new_string(f" {ref_id} "))
        except AttributeError:
            continue

    return footnotes
735
+
736
+
737
def _figure_file_names(fig):
    """List the file names of one figure element, falling back to its subfigures."""
    def _file_name(node):
        base = node.get('file')
        if not base:
            return None
        ext = node.get('extension')
        return base + '.' + ext if ext else base

    own = _file_name(fig)
    if own is not None:
        return [own]
    names = (_file_name(sub) for sub in fig.find_all('subfigure'))
    return [name for name in names if name is not None]


def get_figure_map_from_tex(sp: BeautifulSoup) -> Dict:
    """
    Generate figure dict only
    :param sp: soup to read (not modified)
    :return: map from FIGREF id to ``{num, text, uris, ref_id}``; ``text`` is
        left as None
    """
    figures = {}

    # get floats first because they are around figures
    for flt in sp.find_all('float'):
        try:
            if flt.name and flt.get('name') == 'figure':
                # files of every figure nested in the float
                uris = []
                for inner in flt.find_all('figure'):
                    uris += _figure_file_names(inner)

                if flt.get('id'):
                    ref_id = flt.get('id').replace('uid', 'FIGREF')
                    # form figmap entry
                    figures[ref_id] = {
                        "num": flt.get('id-text', None),
                        "text": None,  # placeholder
                        "uris": uris,
                        "ref_id": ref_id
                    }
        except AttributeError:
            print('Attribute error with figure float: ', flt.name)
            continue

    for fig in sp.find_all('figure'):
        try:
            if fig.name and fig.get('id'):
                # normalize figure id
                ref_id = fig.get('id').replace('uid', 'FIGREF')
                # form figmap entry
                figures[ref_id] = {
                    "num": fig.get('id-text', None),
                    "text": None,  # placeholder
                    "uris": _figure_file_names(fig),
                    "ref_id": ref_id
                }
        except AttributeError:
            print('Attribute error with figure: ', fig.name)
            continue

    return figures
809
+
810
+
811
def process_figures_from_tex(sp: BeautifulSoup, ref_map: Dict) -> Dict:
    """
    Add figure captions to fig_map and decompose
    :param sp: soup to read; figure floats and figures are decomposed
    :param ref_map: reference map; the ``text`` of each matching FIGREF entry
        is overwritten with the cleaned caption (or None)
    :return: the same ``ref_map`` object
    """
    def _clean(raw):
        return re.sub(r'\s', ' ', re.sub(r'\s+', ' ', raw.strip()))

    # process floats first because they are on the outside
    for flt in sp.find_all('float'):
        try:
            if flt.name and flt.get('name') == 'figure':
                if flt.get('id'):
                    ref_id = flt.get('id').replace('uid', 'FIGREF')
                    # remove equation tex
                    for tex in flt.find_all('texmath'):
                        tex.decompose()
                    # clean caption text
                    caption = None
                    if flt.caption:
                        flt = replace_ref_tokens(sp, flt, ref_map)
                        caption = _clean(flt.caption.text)
                    # form figmap entry
                    ref_map[ref_id]['text'] = caption
                # NOTE(review): assumes every figure float is removed, with or
                # without an id; confirm against the original indentation
                flt.decompose()
        except AttributeError:
            print('Attribute error with figure float: ', flt.name)
            continue

    for fig in sp.find_all('figure'):
        try:
            if fig.name and fig.get('id'):
                # normalize figure id
                ref_id = fig.get('id').replace('uid', 'FIGREF')
                # remove equation tex
                for tex in fig.find_all('texmath'):
                    tex.decompose()
                # clean caption text
                caption = None
                if fig.text:
                    fig = replace_ref_tokens(sp, fig, ref_map)
                    caption = _clean(fig.text)
                # add text to figmap entry
                ref_map[ref_id]["text"] = caption
        except AttributeError:
            print('Attribute error with figure: ', fig.name)
            continue
        fig.decompose()

    return ref_map
864
+
865
+
866
def convert_table_to_html(table_lst: List) -> str:
    """
    Render extracted table rows as an HTML ``<table>`` string
    :param table_lst: list of row dicts; each has a ``cells`` list of dicts
        with a ``text`` entry and may have a ``bottom-border`` flag
    :return: HTML markup, or '' for an empty/missing table

    The first row and every row with a truthy ``bottom-border`` are emitted
    as header cells (``<th>``); all other rows use ``<td>``.
    """
    from html import escape  # local import keeps this block self-contained

    if not table_lst:
        return ''

    chunks = ['<table>']
    for i, row in enumerate(table_lst):
        cell_tag = 'th' if i == 0 or row.get('bottom-border') else 'td'
        chunks.append('<tr>')
        for cell in row['cells']:
            # cell text comes from the parsed document, so "<", ">" and "&"
            # must be escaped to keep the markup well-formed
            chunks.append(f"<{cell_tag}>{escape(str(cell['text']), quote=False)}</{cell_tag}>")
        chunks.append('</tr>')
    chunks.append('</table>')
    return ''.join(chunks)
882
+
883
+
884
def extract_table(table: BeautifulSoup) -> List:
    """
    Extract table values from table entry
    :param table: element whose ``<row>`` / ``<cell>`` descendants are read
    :return: one dict per row with ``top-border``, ``bottom-border`` and
        ``cells``; each cell has alignment, border flags, ``text`` and ``latex``
    """
    def _joined(pieces):
        # join the pieces and collapse whitespace runs
        return re.sub(r'\s+', ' ', ' '.join(pieces))

    rows_out = []
    for row in table.find_all('row'):
        row_cells = []
        for cell in row.find_all('cell'):
            plain_parts = []
            tex_parts = []

            for child in cell:
                if type(child) is NavigableString:
                    plain_piece = tex_piece = str(child)
                elif child.name == 'formula':
                    # formulas contribute their rendered text and their TeX separately
                    plain_piece = child.math.text
                    tex_piece = child.texmath.text
                else:
                    plain_piece = tex_piece = child.text
                plain_parts.append(plain_piece)
                tex_parts.append(tex_piece)

            row_cells.append({
                "alignment": cell.get('halign'),
                "right-border": cell.get('right-border') == 'true',
                "left-border": cell.get('left-border') == 'true',
                "text": re.sub(r'\s', ' ', _joined(plain_parts)).strip(),
                "latex": _joined(tex_parts).strip()
            })
        rows_out.append({
            "top-border": row.get('top-border') == "true",
            "bottom-border": row.get('bottom-border') == "true",
            "cells": row_cells
        })
    return rows_out
930
+
931
+
932
def _table_map_entry(node, ref_id, keep_table_contents):
    """Build the table-map entry for one table element."""
    content = extract_table(node) if keep_table_contents else None
    markup = convert_table_to_html(content) if keep_table_contents else None
    return {
        "num": node.get('id-text', None),
        "text": None,  # placeholder
        "content": content,
        "html": markup,
        "ref_id": ref_id
    }


def get_table_map_from_text(sp: BeautifulSoup, keep_table_contents=True) -> Dict:
    """
    Generate table dict only
    :param sp: soup to read; rows of registered tables are decomposed
    :param keep_table_contents: when false, ``content`` and ``html`` are None
    :return: map from TABREF id to ``{num, text, content, html, ref_id}``;
        ``text`` is left as None
    """
    tables = {}

    for flt in sp.find_all('float'):
        try:
            if flt.name and flt.get('name') == 'table':
                if flt.get('id'):
                    # normalize table id
                    ref_id = flt.get('id').replace('uid', 'TABREF')
                    tables[ref_id] = _table_map_entry(flt, ref_id, keep_table_contents)
                    # NOTE(review): row removal is assumed to apply only to
                    # floats that were registered; confirm against the
                    # original indentation
                    for row in flt.find_all('row'):
                        row.decompose()
        except AttributeError:
            print('Attribute error with table float: ', flt.name)
            continue

    for tab in sp.find_all('table'):
        try:
            # skip inline tables
            if tab.get('rend') == 'inline':
                continue
            # process them
            if tab.name and tab.get('id'):
                # normalize table id
                ref_id = tab.get('id').replace('uid', 'TABREF')
                tables[ref_id] = _table_map_entry(tab, ref_id, keep_table_contents)
                # NOTE(review): same nesting assumption as for floats above
                for row in tab.find_all('row'):
                    row.decompose()
        except AttributeError:
            print('Attribute error with table: ', tab.name)
            continue

    return tables
991
+
992
+
993
def _table_caption_text(sp, node, ref_map):
    """Return the cleaned caption text of a table element (caption, head, <p>s or whole text)."""
    def _text_without_tex(part):
        resolved = replace_ref_tokens(sp, part, ref_map)
        for tex in resolved.find_all('texmath'):
            tex.decompose()
        return resolved.text.strip()

    if node.caption:
        caption = _text_without_tex(node.caption)
    elif node.head:
        caption = _text_without_tex(node.head)
    elif node.p:
        caption = ' '.join([_text_without_tex(para) for para in node.find_all('p')])
    else:
        # last resort: the element's own text (texmath is left in place here)
        caption = replace_ref_tokens(sp, node, ref_map).text.strip()

    if caption:
        caption = re.sub(r'\s', ' ', re.sub(r'\s+', ' ', caption))
    return caption


def process_tables_from_tex(sp: BeautifulSoup, ref_map: Dict) -> Dict:
    """
    Generate table dict and replace with id tokens
    :param sp: soup to read; table floats and non-inline tables are decomposed
    :param ref_map: reference map; the ``text`` of each matching TABREF entry
        is overwritten with the cleaned caption
    :return: the same ``ref_map`` object
    """
    # process floats first because they are on the outside
    for flt in sp.find_all('float'):
        try:
            if flt.name and flt.get('name') == 'table':
                if flt.get('id'):
                    # normalize table id
                    ref_id = flt.get('id').replace('uid', 'TABREF')
                    # form tabmap entry
                    ref_map[ref_id]['text'] = _table_caption_text(sp, flt, ref_map)
                # NOTE(review): assumes every table float is removed, with or
                # without an id; confirm against the original indentation
                flt.decompose()
        except AttributeError:
            print('Attribute error with table float: ', flt.name)
            continue

    for tab in sp.find_all('table'):
        try:
            # skip inline tables
            if tab.get('rend') == 'inline':
                continue
            # process them
            if tab.name and tab.get('id'):
                # normalize table id
                ref_id = tab.get('id').replace('uid', 'TABREF')
                # form tabmap entry
                ref_map[ref_id]['text'] = _table_caption_text(sp, tab, ref_map)
        except AttributeError:
            print('Attribute error with table: ', tab.name)
            continue
        tab.decompose()

    return ref_map
1081
+
1082
+
1083
def combine_ref_maps(eq_map: Dict, fig_map: Dict, tab_map: Dict, foot_map: Dict, sec_map: Dict):
    """
    Combine all items with ref ids into one map
    :param eq_map: entries tagged ``equation``
    :param fig_map: entries tagged ``figure``
    :param tab_map: entries tagged ``table``
    :param foot_map: entries tagged ``footnote``
    :param sec_map: entries tagged ``section``
    :return: single dict holding the (same) entry objects, each with a
        ``type`` key added; on duplicate keys the later map wins
    """
    tagged_sources = (
        ('equation', eq_map),
        ('figure', fig_map),
        ('table', tab_map),
        ('footnote', foot_map),
        ('section', sec_map),
    )
    combined = {}
    for kind, source in tagged_sources:
        for ref_id, entry in source.items():
            # entries are tagged in place, so the input maps see the change too
            entry['type'] = kind
            combined[ref_id] = entry
    return combined
1109
+
1110
+
1111
def collapse_formatting_tags(sp: BeautifulSoup):
    """
    Collapse formatting tags like <hi>
    :param sp: soup to modify in place; every ``<hi>`` is replaced by its
        stripped text padded with one space on each side
    :return: None
    """
    for hi_tag in sp.find_all('hi'):
        inner = sp.new_string(hi_tag.text.strip())
        hi_tag.replace_with(' {} '.format(inner))
1119
+
1120
+
1121
def process_abstract_from_tex(sp: BeautifulSoup, bib_map: Dict, ref_map: Dict) -> List[Dict]:
    """
    Parse abstract from soup
    :param sp: soup to read; the paragraphs used are decomposed
    :param bib_map: passed through to ``process_paragraph``
    :param ref_map: passed through to ``process_paragraph``
    :return: ``__dict__`` of each abstract paragraph, in document order
    """
    section = [(None, "Abstract")]
    paragraphs = []

    if sp.abstract:
        for p in sp.abstract.find_all('p'):
            paragraphs.append(process_paragraph(sp, p, section, bib_map, ref_map))
        sp.abstract.decompose()
        return [para.__dict__ for para in paragraphs]

    # no <abstract>: use the root's direct <p> children that were not tagged
    # with an s2orc_id
    root = sp.std if sp.std else sp.unknown if sp.unknown else None
    if root is not None:
        candidates = [
            child for child in root
            if child.name == 'p' and not child.get('s2orc_id', None)
        ]
        for p in candidates:
            paragraphs.append(process_paragraph(sp, p, section, bib_map, ref_map))
            p.decompose()
    return [para.__dict__ for para in paragraphs]
1150
+
1151
+
1152
def build_section_list(sec_id: str, ref_map: Dict) -> List[Tuple]:
    """
    Build list of sections from reference map by following ``parent`` links
    :param sec_id: ref id of the innermost section
    :param ref_map: map whose entries provide ``num``, ``text`` and ``parent``
    :return: ``(num, text)`` tuples ordered outermost first; [] when
        ``sec_id`` is empty or unknown

    The walk stops at a missing/unknown parent and at any id it has already
    visited, so a self-parent or a longer parent cycle cannot recurse forever.
    """
    chain = []
    visited = set()
    current = sec_id
    while current and current in ref_map and current not in visited:
        visited.add(current)
        entry = ref_map[current]
        chain.append((entry['num'], entry['text']))
        current = entry['parent']
    # collected innermost-first; callers expect outermost-first
    chain.reverse()
    return chain
1169
+
1170
+
1171
def get_seclist_for_el(el: bs4.element.Tag, ref_map: Dict, default_seclist: List) -> List[Tuple]:
    """
    Build sec_list for tag
    :param el: element to look up
    :param ref_map: reference map handed to ``build_section_list``
    :param default_seclist: returned when ``el`` is a string node or carries
        no ``s2orc_id``
    :return: section list for ``el``
    """
    # isinstance (not an exact type match) so NavigableString subclasses,
    # which have no .get(), are handled too
    if isinstance(el, NavigableString):
        return default_seclist
    sec_id = el.get('s2orc_id', None)
    if sec_id:
        return build_section_list(sec_id, ref_map)
    return default_seclist
1186
+
1187
+
1188
def process_div(tag: bs4.element.Tag, secs: List, sp: BeautifulSoup, bib_map: Dict, ref_map: Dict) -> List[Dict]:
    """
    Process div recursively
    :param tag: element to convert
    :param secs: section list attached to paragraphs produced from ``tag``
    :param sp: soup used to create replacement nodes
    :param bib_map: passed through to paragraph processing
    :param ref_map: passed through to paragraph processing
    :return: list of paragraphs found in ``tag``; [] for string nodes,
        skipped tags and unknown tags
    """
    # string nodes (and anything else that is not a tag) carry no structure
    if not isinstance(tag, bs4.element.Tag):
        return []
    # skip these tags
    if tag.name in SKIP_TAGS:
        return []

    body_text = []
    # process normal tags
    if tag.name in TEXT_TAGS:
        if tag.text:
            body_text.append(process_paragraph(sp, tag, secs, bib_map, ref_map))
    # process lists
    elif tag.name == 'list':
        if tag.text:
            body_text += process_list_el(sp, tag, secs, bib_map, ref_map)
    # process formula
    elif tag.name == 'formula':
        wrapper = sp.new_tag('p')
        formula_copy = copy.copy(tag)
        formula_copy['type'] = 'inline'
        wrapper.insert(0, formula_copy)
        tag.replace_with(wrapper)
        # process the <p> wrapper, not the detached original: only then is the
        # formula a descendant that process_paragraph can find
        if wrapper.text:
            body_text.append(process_paragraph(sp, wrapper, secs, bib_map, ref_map))
    # process divs
    elif tag.name.startswith('div'):
        for el in tag:
            # process tags
            if isinstance(el, bs4.element.Tag):
                el_sec_list = get_seclist_for_el(el, ref_map, secs)
                body_text += process_div(el, el_sec_list, sp, bib_map, ref_map)
    # unknown tag type, skip for now
    else:
        print(f'Unknown tag type: {tag.name}')
        return []

    return body_text
1237
+
1238
+
1239
def process_body_text_from_tex(sp: BeautifulSoup, bib_map: Dict, ref_map: Dict) -> List[Dict]:
    """
    Parse the body text of the document.

    Walks the children of ``sp.body`` and their child tags, processes each
    grandchild tag with ``process_div`` and finally decomposes ``sp.body``.

    :param sp: soup whose ``body`` is parsed (and destroyed afterwards)
    :param bib_map: bibliography map forwarded to ``process_div``
    :param ref_map: reference map used for section lookup and forwarded on
    :return: one ``__dict__`` per parsed paragraph, in document order
    """
    body_text = []
    for tag in sp.body:
        # only real tags have children/attributes worth looking at; this also
        # skips NavigableString subclasses such as comments
        if not isinstance(tag, bs4.element.Tag):
            continue
        sec_list = get_seclist_for_el(tag, ref_map, [])
        for cld in tag:
            # the original guard tested `tag` here, which could never be a
            # string at this point; the child is what has to be checked
            if not isinstance(cld, bs4.element.Tag):
                continue
            # the section list carries over to following siblings until a
            # child with its own s2orc_id replaces it
            sec_list = get_seclist_for_el(cld, ref_map, sec_list)
            body_text += process_div(cld, sec_list, sp, bib_map, ref_map)

    # decompose everything
    sp.body.decompose()

    return [para.__dict__ for para in body_text]
1267
+
1268
+
1269
def convert_xml_to_s2orc(
        sp: BeautifulSoup, file_id: str, year_str: str, log_file: str, grobid_config: Optional[Dict]=None
) -> Paper:
    """
    Convert a parsed XML document into a Paper in S2ORC format.

    The steps below run in a fixed order on the same soup. At least the body
    text step decomposes ``sp.body``; the other helpers are described as
    "process and replace", so presumably the order matters -- keep it.

    :param sp: soup of the XML document
    :param file_id: identifier used as ``paper_id``, as ``arxiv_id`` in the
        metadata and as the key in log lines
    :param year_str: publication year stored verbatim in the metadata
    :param log_file: path of the log file that warnings are appended to
    :param grobid_config: optional configuration passed to ``GrobidClient``
    :return: the assembled Paper
    """
    # create grobid client
    client = GrobidClient(grobid_config)

    # TODO: not sure why but have to run twice
    decompose_tags_before_title(sp)
    decompose_tags_before_title(sp)

    # process maketitle info
    title, authors = process_metadata(sp, client, log_file)

    # processing of bibliography entries
    # TODO: look into why authors aren't processing
    bibkey_map = process_bibliography_from_tex(sp, client, log_file)

    # no bibliography entries: only a warning is logged, conversion continues
    if not bibkey_map:
        with open(log_file, 'a+') as bib_f:
            bib_f.write(f'{file_id},warn_no_bibs\n')

    # process section headers
    section_map = process_sections_from_text(sp)

    # process and replace non-inline equations
    equation_map = process_equations_from_tex(sp)

    # process footnote markers
    footnote_map = process_footnotes_from_text(sp)

    # get figure map
    figure_map = get_figure_map_from_tex(sp)

    # get table_map
    table_map = get_table_map_from_text(sp)

    # combine references in one dict
    refkey_map = combine_ref_maps(equation_map, figure_map, table_map, footnote_map, section_map)

    # process and replace figures
    refkey_map = process_figures_from_tex(sp, refkey_map)

    # process and replace tables
    refkey_map = process_tables_from_tex(sp, refkey_map)

    # collapse all hi tags
    collapse_formatting_tags(sp)

    # process abstract if possible
    abstract = process_abstract_from_tex(sp, bibkey_map, refkey_map)

    # process body text
    body_text = process_body_text_from_tex(sp, bibkey_map, refkey_map)

    # no body text parsed: only a warning is logged, a Paper is still returned
    if not body_text:
        with open(log_file, 'a+') as body_f:
            body_f.write(f'{file_id},warn_no_body\n')

    metadata = {
        "title": title,
        "authors": authors,
        "year": year_str,
        "venue": "",
        "identifiers": {
            "arxiv_id": file_id
        }
    }

    return Paper(
        paper_id=file_id,
        pdf_hash="",
        metadata=metadata,
        abstract=abstract,
        body_text=body_text,
        back_matter=[],
        bib_entries=bibkey_map,
        ref_entries=refkey_map
    )
1358
+
1359
+
1360
def convert_latex_xml_to_s2orc_json(xml_fpath: str, log_dir: str, grobid_config: Optional[Dict]=None) -> Paper:
    """
    Read an XML file and convert it into a Paper.

    :param xml_fpath: path of the XML file; its base name without extension
        becomes the file id
    :param log_dir: directory in which ``failed.log`` is written
    :param grobid_config: optional configuration forwarded to the converter
    :return: the converted Paper
    :raises UnicodeDecodeError: re-raised (after logging) when the file cannot
        be decoded
    """
    assert os.path.exists(xml_fpath)

    # get file id: base name of the path without its extension
    file_id = os.path.basename(os.path.splitext(xml_fpath)[0])

    # try to get year from file name: the first two characters before the
    # first dot are read as a two-digit year (below 40 -> 20xx, else 19xx)
    year = file_id.split('.')[0][:2]
    if year.isdigit():
        year = int(year)
        year += 2000 if year < 40 else 1900
        year = str(year)
    else:
        year = ""

    # log file
    log_file = os.path.join(log_dir, 'failed.log')

    with open(xml_fpath, 'r') as f:
        try:
            xml = f.read()
            soup = BeautifulSoup(xml, "lxml")
            paper = convert_xml_to_s2orc(soup, file_id, year, log_file, grobid_config=grobid_config)
            return paper
        except UnicodeDecodeError:
            with open(log_file, 'a+') as log_f:
                log_f.write(f'{file_id},err_unicode_decode\n')
            # bare raise keeps the original exception; `raise UnicodeDecodeError`
            # would fail with TypeError because the class needs five arguments
            raise
s2orc-doc2json/doc2json/utils/__init__.py ADDED
File without changes
s2orc-doc2json/doc2json/utils/citation_util.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # utility functions for handling failure situations with grobid-detected citation spans
2
+
3
+ import re
4
+ from typing import Dict, List, Tuple
5
+
6
+
7
# Matches a bracketed group of citation numbers such as `[1]`, `[1, 2]` or
# `[3-5; 7]`: each number is 1-3 digits without a leading zero, numbers are
# separated by commas, semicolons, hyphens and/or whitespace, and a trailing
# semicolon is allowed before the closing bracket.
BRACKET_REGEX = re.compile(r'\[[1-9]\d{0,2}([,;\-\s]+[1-9]\d{0,2})*;?\]')
# NOTE(review): not used in this module; presumably a minimum count of bracket
# matches for treating a text as bracket-style -- confirm against the caller
BRACKET_STYLE_THRESHOLD = 5

# Matches a single bracketed number such as `[12]` and captures the number.
SINGLE_BRACKET_REGEX = re.compile(r'\[([1-9]\d{0,2})\]')
# Characters accepted by is_expansion_string as the connector between two
# references (hyphen and en dash).
EXPANSION_CHARS = {'-', '–'}
12
+
13
+
14
def span_already_added(sub_start: int, sub_end: int, span_indices: List[Tuple[int, int]]) -> bool:
    """
    Tell whether ``[sub_start, sub_end]`` lies inside any known span.

    :param sub_start: start index of the candidate span
    :param sub_end: end index of the candidate span
    :param span_indices: (start, end) pairs of the spans collected so far
    :return: True if at least one existing span fully contains the candidate
        (equal boundaries count as contained), False otherwise
    """
    return any(
        span_start <= sub_start and sub_end <= span_end
        for span_start, span_end in span_indices
    )
26
+
27
+
28
def is_expansion_string(between_string: str) -> bool:
    """
    Decide whether the text between two refs marks a range ("expansion").

    :param between_string: text found between two references
    :return: True when the text is at most two characters long, contains at
        least one character from ``EXPANSION_CHARS`` and otherwise only spaces
    """
    if len(between_string) > 2:
        return False
    allowed_chars = EXPANSION_CHARS | {' '}
    has_expansion_char = any(ch in EXPANSION_CHARS for ch in between_string)
    only_allowed_chars = all(ch in allowed_chars for ch in between_string)
    return has_expansion_char and only_allowed_chars
39
+
40
+
41
+ # TODO: still cases like `09bcee03baceb509d4fcf736fa1322cb8adf507f` w/ dups like ['L Jung', 'R Hessler', 'Louis Jung', 'Roland Hessler']
42
+ # example paper that has empties & duplicates: `09bce26cc7e825e15a4469e3e78b7a54898bb97f`
43
+ def _clean_empty_and_duplicate_authors_from_grobid_parse(authors: List[Dict]) -> List[Dict]:
44
+ """
45
+ Within affiliation, `location` is a dict with fields <settlement>, <region>, <country>, <postCode>, etc.
46
+ Too much hassle, so just take the first one that's not empty.
47
+ """
48
+ # stripping empties
49
+ clean_authors_list = []
50
+ for author in authors:
51
+ clean_first = author['first'].strip()
52
+ clean_last = author['last'].strip()
53
+ clean_middle = [m.strip() for m in author['middle']]
54
+ clean_suffix = author['suffix'].strip()
55
+ if clean_first or clean_last or clean_middle:
56
+ author['first'] = clean_first
57
+ author['last'] = clean_last
58
+ author['middle'] = clean_middle
59
+ author['suffix'] = clean_suffix
60
+ clean_authors_list.append(author)
61
+ # combining duplicates (preserve first occurrence of author name as position)
62
+ key_to_author_blobs = {}
63
+ ordered_keys_by_author_pos = []
64
+ for author in clean_authors_list:
65
+ key = (author['first'], author['last'], ' '.join(author['middle']), author['suffix'])
66
+ if key not in key_to_author_blobs:
67
+ key_to_author_blobs[key] = author
68
+ ordered_keys_by_author_pos.append(key)
69
+ else:
70
+ if author['email']:
71
+ key_to_author_blobs[key]['email'] = author['email']
72
+ if author['affiliation'] and (author['affiliation']['institution'] or author['affiliation']['laboratory'] or author['affiliation']['location']):
73
+ key_to_author_blobs[key]['affiliation'] = author['affiliation']
74
+ dedup_authors_list = [key_to_author_blobs[key] for key in ordered_keys_by_author_pos]
75
+ return dedup_authors_list
s2orc-doc2json/doc2json/utils/grobid_util.py ADDED
@@ -0,0 +1,388 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Dict, Optional
2
+ import bs4
3
+ from bs4 import BeautifulSoup
4
+ import re
5
+ from collections import defaultdict
6
+
7
+
8
# Mixed-case tag names that clean_tags renames to their all-lowercase form,
# so that the lowercase lookups used in this module (e.g. `persname`,
# `biblscope`) find them.
SUBSTITUTE_TAGS = {
    'persName',
    'orgName',
    'publicationStmt',
    'titleStmt',
    'biblScope'
}
15
+
16
+
17
def clean_tags(el: bs4.element.Tag):
    """
    Rename every tag listed in ``SUBSTITUTE_TAGS`` to its lowercase name.

    The element is modified in place.

    :param el: element whose descendants are renamed
    :return: None
    """
    for tag_name in SUBSTITUTE_TAGS:
        lowered = tag_name.lower()
        for match in el.find_all(tag_name):
            match.name = lowered
26
+
27
+
28
def soup_from_path(file_path: str):
    """
    Read an XML file and parse it.

    :param file_path: path of the XML file
    :return: BeautifulSoup object built with the "xml" parser
    """
    # read as bytes and let the parser deal with the encoding; the context
    # manager makes sure the file handle is closed again
    with open(file_path, "rb") as f:
        return BeautifulSoup(f.read(), "xml")
35
+
36
+
37
def get_title_from_grobid_xml(raw_xml: BeautifulSoup) -> str:
    """
    Return the title of the entry.

    :param raw_xml: parsed grobid XML
    :return: text of the first ``<title level="a">``; if there is none, the
        text of the first ``<title>``; "" when no title exists
    """
    level_a_titles = (
        entry for entry in raw_xml.find_all("title")
        if entry.has_attr("level") and entry["level"] == "a"
    )
    first_match = next(level_a_titles, None)
    if first_match is not None:
        return first_match.text
    try:
        return raw_xml.title.text
    except AttributeError:
        # no <title> at all: raw_xml.title is None
        return ""
50
+
51
+
52
def get_author_names_from_grobid_xml(raw_xml: BeautifulSoup) -> List[Dict[str, str]]:
    """
    Return one name dictionary per ``<author>`` that has a ``<persname>``.

    e.g.
        {
            "first": first,
            "middle": middle,
            "last": last,
            "suffix": suffix
        }

    ``middle`` is a list of strings, the other values are strings.

    :param raw_xml: parsed grobid XML (tag names already lowercased)
    :return: list of name dictionaries in document order
    """
    names = []

    for author in raw_xml.find_all("author"):
        persname = author.persname
        if not persname:
            continue

        first = ""
        middle = []
        last = ""
        suffix = ""

        # forenames include first and middle names; .get avoids a KeyError
        # for a forename without a type attribute (such forenames are ignored)
        for forename in persname.find_all("forename"):
            forename_type = forename.get("type")
            if forename_type == "first":
                if not first:
                    first = forename.text
                else:
                    # any further "first" forename counts as a middle name
                    middle.append(forename.text)
            elif forename_type == "middle":
                middle.append(forename.text)

        # surnames include last names; with several surnames the last one is
        # the last name and the others are treated as middle names
        surnames = persname.find_all("surname")
        if surnames:
            middle.extend(surname.text for surname in surnames[:-1])
            last = surnames[-1].text

        # name suffixes (check the list of suffix tags, not the still-empty
        # result string)
        suffixes = persname.find_all("suffix")
        if suffixes:
            suffix = " ".join(suffix_tag.text for suffix_tag in suffixes)

        names.append({
            "first": first,
            "middle": middle,
            "last": last,
            "suffix": suffix
        })
    return names
113
+
114
+
115
def get_affiliation_from_grobid_xml(raw_xml: BeautifulSoup) -> Dict:
    """
    Extract laboratory, institution and location from an ``<affiliation>``.

    :param raw_xml: element containing the affiliation (may be None)
    :return: dict with ``laboratory``, ``institution`` and ``location`` when a
        laboratory or institution name was found, otherwise an empty dict
    """
    if not (raw_xml and raw_xml.affiliation):
        return {}

    laboratory = ""
    institution = ""
    location = dict()

    for child in raw_xml.affiliation:
        if child.name == "orgname":
            org_type = child["type"] if child.has_attr("type") else None
            if org_type == "laboratory":
                laboratory = child.text
            elif org_type == "institution":
                institution = child.text
        elif child.name == "address":
            # every named, non-empty address part becomes a location field
            for part in child:
                if part.name and part.text:
                    location[part.name] = part.text

    if not (laboratory or institution):
        return {}
    return {
        "laboratory": laboratory,
        "institution": institution,
        "location": location
    }
146
+
147
+
148
def get_author_data_from_grobid_xml(raw_xml: BeautifulSoup) -> List[Dict]:
    """
    Return one dictionary per ``<author>`` with name, affiliation and email.

    e.g.
        {
            "first": first,
            "middle": middle,
            "last": last,
            "suffix": suffix,
            "affiliation": {
                "laboratory": "",
                "institution": "",
                "location": "",
            },
            "email": ""
        }

    ``middle`` is a list of strings. ``affiliation`` is whatever
    ``get_affiliation_from_grobid_xml`` returns for the author. Authors
    without a ``<persname>`` are still included, with empty name fields.

    :param raw_xml: parsed grobid XML (tag names already lowercased)
    :return: list of author dictionaries in document order
    """
    authors = []

    for author in raw_xml.find_all("author"):

        first = ""
        middle = []
        last = ""
        suffix = ""

        persname = author.persname
        if persname:
            # forenames include first and middle names; forenames without a
            # type attribute are ignored
            for forename in persname.find_all("forename"):
                forename_type = forename.get("type")
                if forename_type == "first":
                    if not first:
                        first = forename.text
                    else:
                        # any further "first" forename counts as a middle name
                        middle.append(forename.text)
                elif forename_type == "middle":
                    middle.append(forename.text)

            # surnames include last names; with several surnames the last one
            # is the last name and the others are treated as middle names
            surnames = persname.find_all("surname")
            if surnames:
                middle.extend(surname.text for surname in surnames[:-1])
                last = surnames[-1].text

            # name suffixes (check the list of suffix tags, not the
            # still-empty result string)
            suffixes = persname.find_all("suffix")
            if suffixes:
                suffix = " ".join(suffix_tag.text for suffix_tag in suffixes)

        affiliation = get_affiliation_from_grobid_xml(author)

        email = ""
        if author.email:
            email = author.email.text

        authors.append({
            "first": first,
            "middle": middle,
            "last": last,
            "suffix": suffix,
            "affiliation": affiliation,
            "email": email
        })

    return authors
224
+
225
+
226
def get_year_from_grobid_xml(raw_xml: BeautifulSoup) -> Optional[int]:
    """
    Return the publication year if one can be read from the first ``<date>``.

    :param raw_xml: parsed grobid XML
    :return: the year as int when the ``when`` attribute starts with a
        four-digit year beginning with 19 or 20, otherwise None
    """
    date_tag = raw_xml.date
    if not (date_tag and date_tag.has_attr("when")):
        return None

    # match year at the start of the date text (which is in some unspecified
    # date format)
    year_match = re.match(r"((19|20)\d{2})", date_tag["when"])
    if not year_match:
        return None

    year = year_match.group(0)
    if year and year.isnumeric() and len(year) == 4:
        return int(year)
    return None
239
+
240
+
241
def get_venue_from_grobid_xml(raw_xml: BeautifulSoup, title_text: str) -> str:
    """
    Return the venue/journal/publisher of a bib entry.

    Grobid ref documentation: https://grobid.readthedocs.io/en/latest/training/Bibliographical-references/
    level="j": journal title
    level="m": "non journal bibliographical item holding the cited article"
    level="s": series title

    :param raw_xml: parsed bib entry
    :param title_text: title of the entry itself; titles equal to it are ignored
    :return: text of the best candidate title (journal before "m" before
        series, document order within a level), or "" if there is none
    """
    keep_types = ["j", "m", "s"]
    # collect all titles of the above types that differ from the entry title
    candidates = [
        (entry["level"], entry.text)
        for entry in raw_xml.find_all("title")
        if entry.has_attr("level")
        and entry["level"] in keep_types
        and entry.text != title_text
    ]
    if not candidates:
        return ""
    # min() returns the first candidate with the lowest level rank
    _, venue = min(candidates, key=lambda cand: keep_types.index(cand[0]))
    return venue
263
+
264
+
265
def get_volume_from_grobid_xml(raw_xml: BeautifulSoup) -> str:
    """
    Return the volume number of a grobid bib entry.

    Grobid <biblscope unit="volume">

    :param raw_xml: parsed bib entry
    :return: text of the first matching ``<biblscope>``, or "" if none
    """
    volume_texts = (
        scope.text
        for scope in raw_xml.find_all("biblscope")
        if scope.has_attr("unit") and scope["unit"] == "volume"
    )
    return next(volume_texts, "")
275
+
276
+
277
def get_issue_from_grobid_xml(raw_xml: BeautifulSoup) -> str:
    """
    Return the issue number of a grobid bib entry.

    Grobid <biblscope unit="issue">

    :param raw_xml: parsed bib entry
    :return: text of the first matching ``<biblscope>``, or "" if none
    """
    issue_texts = (
        scope.text
        for scope in raw_xml.find_all("biblscope")
        if scope.has_attr("unit") and scope["unit"] == "issue"
    )
    return next(issue_texts, "")
287
+
288
+
289
def get_pages_from_grobid_xml(raw_xml: BeautifulSoup) -> str:
    """
    Return the page numbers of a grobid bib entry.

    Grobid <biblscope unit="page">

    :param raw_xml: parsed bib entry
    :return: ``from--to`` for the first page scope that has a ``from``
        attribute, just ``from`` when it has no ``to``, or "" if none
    """
    for scope in raw_xml.find_all("biblscope"):
        is_page_range = (
            scope.has_attr("unit")
            and scope["unit"] == "page"
            and scope.has_attr("from")
        )
        if not is_page_range:
            continue
        from_page = scope["from"]
        if not scope.has_attr("to"):
            return from_page
        to_page = scope["to"]
        return f'{from_page}--{to_page}'
    return ""
304
+
305
+
306
def get_other_ids_from_grobid_xml(raw_xml: BeautifulSoup) -> Dict[str, List]:
    """
    Collect the other identifiers of a grobid bib entry (arxiv, pubmed, doi).

    :param raw_xml: parsed bib entry
    :return: defaultdict mapping each ``<idno>`` type to the list of its
        non-empty texts, in document order
    """
    ids_by_type = defaultdict(list)
    typed_entries = (
        entry for entry in raw_xml.find_all("idno")
        if entry.has_attr("type") and entry.text
    )
    for entry in typed_entries:
        ids_by_type[entry["type"]].append(entry.text)
    return ids_by_type
319
+
320
+
321
def get_raw_bib_text_from_grobid_xml(raw_xml: BeautifulSoup) -> str:
    """
    Return the raw bibliography string of a bib entry.

    :param raw_xml: parsed bib entry
    :return: text of the first ``<note type="raw_reference">``, or "" if none
    """
    raw_references = (
        note.text
        for note in raw_xml.find_all("note")
        if note.has_attr("type") and note["type"] == "raw_reference"
    )
    return next(raw_references, "")
331
+
332
+
333
def get_publication_datetime_from_grobid_xml(raw_xml: BeautifulSoup) -> str:
    """
    Find and return the publication datetime if it exists.

    :param raw_xml: parsed grobid XML
    :return: ``when`` attribute of the first ``<date type="published">``
        directly inside the publication statement, or "" if there is none
    """
    # the statement tag is `publicationstmt` once clean_tags has run and
    # `publicationStmt` before; accept both so that the existence check and
    # the iteration always look at the same tag
    publication_stmt = raw_xml.find("publicationstmt")
    if publication_stmt is None:
        publication_stmt = raw_xml.find("publicationStmt")
    if publication_stmt is None:
        return ""

    for child in publication_stmt:
        if child.name == "date" \
                and child.has_attr("type") \
                and child["type"] == "published" \
                and child.has_attr("when"):
            return child["when"]
    return ""
347
+
348
+
349
def parse_bib_entry(bib_entry: BeautifulSoup) -> Dict:
    """
    Parse one bib entry into a flat dictionary.

    The entry's tag names are lowercased in place first (``clean_tags``).

    :param bib_entry: grobid XML of a single bibliography entry
    :return: dict with ref_id, title, authors, year, venue, volume, issue,
        pages, other_ids, raw_text and an (always empty) urls list
    """
    clean_tags(bib_entry)
    title = get_title_from_grobid_xml(bib_entry)

    parsed = {
        'ref_id': bib_entry.attrs.get("xml:id", None),
        'title': title,
    }
    parsed['authors'] = get_author_names_from_grobid_xml(bib_entry)
    parsed['year'] = get_year_from_grobid_xml(bib_entry)
    # the entry's own title is excluded from the venue candidates
    parsed['venue'] = get_venue_from_grobid_xml(bib_entry, title)
    parsed['volume'] = get_volume_from_grobid_xml(bib_entry)
    parsed['issue'] = get_issue_from_grobid_xml(bib_entry)
    parsed['pages'] = get_pages_from_grobid_xml(bib_entry)
    parsed['other_ids'] = get_other_ids_from_grobid_xml(bib_entry)
    parsed['raw_text'] = get_raw_bib_text_from_grobid_xml(bib_entry)
    parsed['urls'] = []
    return parsed
370
+
371
+
372
def is_reference_tag(tag: bs4.element.Tag) -> bool:
    """
    Tell whether a tag is a ``<ref type="bibr">``.

    :param tag: tag to test
    :return: True for a ``ref`` tag whose ``type`` attribute is ``bibr``
    """
    if tag.name != "ref":
        return False
    return tag.attrs.get("type", "") == "bibr"
374
+
375
+
376
def extract_paper_metadata_from_grobid_xml(tag: bs4.element.Tag) -> Dict:
    """
    Extract paper metadata (title, authors with affiliation, year) from grobid xml.

    The tag names of ``tag`` are lowercased in place first (``clean_tags``).

    :param tag: grobid XML holding the paper header
    :return: dict with ``title``, ``authors`` and ``year``; ``year`` holds the
        publication datetime string returned by
        ``get_publication_datetime_from_grobid_xml``
    """
    clean_tags(tag)
    title_text = tag.titlestmt.title.text
    author_data = get_author_data_from_grobid_xml(tag)
    published = get_publication_datetime_from_grobid_xml(tag)
    return {
        "title": title_text,
        "authors": author_data,
        "year": published
    }
s2orc-doc2json/doc2json/utils/latex_util.py ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Many of the REGEX expressions and pipeline in this set of utilities are borrowed or extended from
3
+ the unarXive project: https://github.com/IllDepence/unarXive
4
+
5
+ Modifications have been made to better identify the primary latex file and expand all other latex
6
+ files into the main file. Latexpand and tralics options have also been changed.
7
+ """
8
+ import chardet
9
+ import magic
10
+ import os
11
+ import re
12
+ import glob
13
+ import subprocess
14
+ import tempfile
15
+
16
MAIN_TEX_PATT = re.compile(r'(\\begin\s*\{\s*document\s*\})', re.I)
# ^ with capturing parentheses so that the pattern can be used for splitting
PDF_EXT_PATT = re.compile(r'^\.pdf$', re.I)
GZ_EXT_PATT = re.compile(r'^\.gz$', re.I)
TEX_EXT_PATT = re.compile(r'^\.tex$', re.I)
NON_TEXT_PATT = re.compile(r'^\.(pdf|eps|jpg|png|gif)$', re.I)
BBL_SIGN = '\\bibitem'
# natbib fix: \citet, \citep, ... (with optional star and bracket options)
# are rewritten to a plain \cite{...}; group 3 holds the cite keys
PRE_FIX_NATBIB = True
# both fragments are raw strings so that the regex escapes are not read as
# (invalid) Python string escapes
NATBIB_PATT = re.compile((r'\\cite(t|p|alt|alp|author|year|yearpar)\s*?\*?\s*?'
                          r'(\[[^\]]*?\]\s*?)*?\s*?\*?\s*?\{([^\}]+?)\}'),
                         re.I)
# bibitem option fix: drops the [...] option that follows \bibitem
PRE_FIX_BIBOPT = True
BIBOPT_PATT = re.compile(r'\\bibitem\s*?\[[^]]*?\]', re.I|re.M)

# ↑ above two solve most tralics problems; except for mnras style bibitems
#   (https://ctan.org/pkg/mnras)

# aggressive math pre-removal
PRE_FILTER_MATH = False
FILTER_PATTS = []
for env in ['equation', 'displaymath', 'array', 'eqnarray', 'align', 'gather',
            'multline', 'flalign', 'alignat']:
    # the optional star is captured and required again in \end{...}, so a
    # starred environment is closed by its own starred end tag instead of
    # running on to a later unstarred one
    s = r'\\begin\{{{0}([*]?)\}}.+?\\end\{{{0}\1\}}'.format(env)
    patt = re.compile(s, re.I | re.M | re.S)
    FILTER_PATTS.append(patt)
FILTER_PATTS.append(re.compile(r'\$\$.+?\$\$', re.S))
FILTER_PATTS.append(re.compile(r'\$.+?\$', re.S))
FILTER_PATTS.append(re.compile(r'\\\(.+?\\\)', re.S))
FILTER_PATTS.append(re.compile(r'\\\[.+?\\\]', re.S))
47
+
48
+
49
def read_file(path):
    """
    Read a text file, guessing the encoding if the default one fails.

    Tries, in this order: the platform default encoding, the encoding
    reported by libmagic, and the encoding guessed by chardet (decoding with
    ``errors='replace'``).

    :param path: path of the file to read
    :return: file content as str, or '' if no usable encoding was found
    """
    try:
        with open(path) as f:
            return f.read()
    except UnicodeDecodeError:
        pass

    # fall back to the raw bytes and detect the encoding
    with open(path, 'rb') as f:
        blob = f.read()

    encoding = magic.Magic(mime_encoding=True).from_buffer(blob)
    try:
        return blob.decode(encoding)
    except (UnicodeDecodeError, LookupError):
        # wrong guess, or an encoding name Python does not know
        pass

    encoding = chardet.detect(blob)['encoding']
    if not encoding:
        return ''
    try:
        return blob.decode(encoding, errors='replace')
    except (LookupError, ValueError):
        return ''
69
+
70
+
71
def remove_math(latex_str):
    """
    Remove math from the document body of a LaTeX string.

    Everything up to and including ``\\begin{document}`` is left untouched;
    in the remainder every match of the ``FILTER_PATTS`` patterns is deleted.

    :param latex_str: LaTeX source
    :return: the source with math removed from the body; returned unchanged
        when it contains no ``\\begin{document}``
    """
    # split gives [preamble, '\begin{document}', body] because the pattern
    # has a capturing group
    parts = re.split(MAIN_TEX_PATT, latex_str, maxsplit=1)
    if len(parts) < 3:
        # no \begin{document}: there is no body to filter
        return latex_str
    for patt in FILTER_PATTS:
        parts[2] = re.sub(patt, '', parts[2])
    return ''.join(parts)
76
+
77
+
78
def normalize(path, out_dir, write_logs=True):
    """
    Normalize an arXiv file
    Adapted from https://github.com/IllDepence/unarXive
    with modifications

    Identifies the primary *.tex file, the bibliography file,
    and expands other tex files and the bibliography into the
    main tex file

    :param path: directory holding the extracted LaTeX sources
    :param out_dir: directory that receives ``<name>.tex`` (``name`` being
        the last component of ``path``) and the log files
    :param write_logs: whether messages are appended to ``log.txt``
    :return: None; nothing is written when no main tex file is found
    """
    def log(msg):
        if write_logs:
            with open(os.path.join(out_dir, 'log.txt'), 'a') as f:
                f.write('{}\n'.format(msg))

    # break path
    _, fn = os.path.split(path.strip('/'))

    # identify main tex file
    main_tex_path = None
    ignored_names = []

    # check .tex files first; if several contain \begin{document} the last
    # one listed wins
    for tfn in os.listdir(path):

        if not TEX_EXT_PATT.match(os.path.splitext(tfn)[1]):
            ignored_names.append(tfn)
            continue

        try:
            cntnt = read_file(os.path.join(path, tfn))
        except Exception:
            # unreadable entry: skip it
            continue

        if re.search(MAIN_TEX_PATT, cntnt) is not None:
            main_tex_path = tfn

    # try other files
    if main_tex_path is None:
        for tfn in ignored_names:
            if NON_TEXT_PATT.match(os.path.splitext(tfn)[1]):
                continue
            try:
                cntnt = read_file(os.path.join(path, tfn))
                if re.search(MAIN_TEX_PATT, cntnt) is not None:
                    main_tex_path = tfn
            except Exception:
                continue

    # give up: without a main file there is nothing to expand
    if main_tex_path is None:
        log(('couldn\'t find main tex file in dump archive {}'
             '').format(fn))
        return

    # flatten to single tex file and save
    with tempfile.TemporaryDirectory() as tmp_dir_path:
        temp_tex_fn = os.path.join(tmp_dir_path, f'{fn}.tex')

        # find bbl file; the first one found is expanded into the main file
        bbl_files = glob.glob(os.path.join(path, '*.bbl'))

        if bbl_files:
            latexpand_args = ['latexpand',
                              '--expand-bbl',
                              os.path.split(bbl_files[0])[1],
                              main_tex_path,
                              '--output',
                              temp_tex_fn]
        else:
            latexpand_args = ['latexpand',
                              main_tex_path,
                              '--output',
                              temp_tex_fn]

        # run latexpand inside the source directory so that the relative
        # file names above resolve
        with open(os.path.join(out_dir, 'log_latexpand.txt'), 'a+') as err:
            subprocess.run(latexpand_args, stderr=err, cwd=path)

        # re-read and write to ensure utf-8 b/c latexpand doesn't
        # behave
        new_tex_fn = os.path.join(out_dir, f'{fn}.tex')
        cntnt = read_file(temp_tex_fn)
        if PRE_FIX_NATBIB:
            cntnt = NATBIB_PATT.sub(r'\\cite{\3}', cntnt)
        if PRE_FIX_BIBOPT:
            cntnt = BIBOPT_PATT.sub(r'\\bibitem', cntnt)
        if PRE_FILTER_MATH:
            cntnt = remove_math(cntnt)
        with open(new_tex_fn, mode='w', encoding='utf-8') as f:
            f.write(cntnt)
169
+
170
+
171
def latex_to_xml(tex_file: str, out_dir: str, out_file: str, err_file: str, log_file: str):
    """
    Convert an expanded latex file to XML using tralics.

    :param tex_file: LaTeX file to convert
    :param out_dir: output directory passed to tralics
    :param out_file: path at which the XML output is expected
    :param err_file: file that tralics' stderr is appended to
    :param log_file: file that ``tex_file`` is appended to when tralics times
        out and/or no output file exists afterwards
    :return: ``out_file`` if it exists after the run, otherwise None
    """
    tralics_args = [
        'tralics',
        '-silent',
        '-noxmlerror',
        '-utf8',
        '-oe8',
        '-entnames=false',
        '-nomathml',
        f'-output_dir={out_dir}',
        tex_file,
    ]

    with open(err_file, 'a+') as err_f, open(log_file, 'a+') as skip_f:
        # run tralics with its stdout discarded
        try:
            subprocess.run(tralics_args, stdout=subprocess.DEVNULL, stderr=err_f, timeout=5)
        except subprocess.TimeoutExpired:
            skip_f.write(f'{tex_file}\n')

        # if no output, skip
        if not os.path.exists(out_file):
            skip_f.write(f'{tex_file}\n')

    return out_file if os.path.exists(out_file) else None
s2orc-doc2json/doc2json/utils/refspan_util.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Tuple
2
+
3
+
4
def replace_refspans(
        spans_to_replace: List[Tuple[int, int, str, str]],
        full_string: str,
        pre_padding: str = "",
        post_padding: str = "",
        btwn_padding: str = ", "
) -> str:
    """
    Replace each given span of the full string with its new text.

    The list is sorted and adjusted in place. A span that starts before the
    end of an earlier span (overlap) is dropped. A span that starts exactly
    where the previous one ends gets ``btwn_padding`` put in front of its
    replacement.

    :param spans_to_replace: list of tuples of form (start_ind, end_ind, span_text, new_substring)
    :param full_string: text the spans refer to
    :param pre_padding: text placed before the replacement of later spans
    :param post_padding: text placed after the replacement of later spans
    :param btwn_padding: separator placed between abutting replacements
    :return: the string with all surviving spans replaced
    """
    # every span text must match the text found at its indices
    assert all(full_string[s:e] == text for s, e, text, _ in spans_to_replace)

    # no two spans may share a start index
    assert len({item[0] for item in spans_to_replace}) == len(spans_to_replace)

    def by_start(item):
        return item[0]

    spans_to_replace.sort(key=by_start)
    total = len(spans_to_replace)

    # NOTE(review): with non-empty pre/post padding the first span is never
    # padded although the shift accounts for its padding -- confirm intent
    for i in range(total):
        # re-read the entry: earlier iterations may have shifted or emptied it
        _, end, span, new_string = spans_to_replace[i]

        # skip empties (spans dropped because of overlap)
        if end <= 0:
            continue

        # length change caused by replacing this span
        shift_amount = len(new_string) - len(span) + len(pre_padding) + len(post_padding)

        # move every later span accordingly
        for ind in range(i + 1, total):
            later_start, later_end, later_span, later_string = spans_to_replace[ind]
            if later_end <= 0:
                continue
            if later_start < end:
                # overlaps the current span: remove from replacement
                adjusted = (0, 0, later_span, "")
            else:
                # abutting spans are joined with the between-padding
                joiner = btwn_padding if later_start == end else ""
                adjusted = (
                    later_start + shift_amount,
                    later_end + shift_amount,
                    later_span,
                    joiner + pre_padding + later_string + post_padding,
                )
            spans_to_replace[ind] = adjusted

    surviving = sorted((item for item in spans_to_replace if item[1] > 0), key=by_start)

    # apply the replacements in series, front to back
    for start, end, span, new_string in surviving:
        assert full_string[start:end] == span
        full_string = full_string[:start] + new_string + full_string[end:]

    return full_string
74
+
75
+
76
def sub_spans_and_update_indices(
        spans_to_replace: List[Tuple[int, int, str, str]],
        full_string: str
) -> Tuple[str, List]:
    """
    Replace all spans and recompute their indices in the new text.

    :param spans_to_replace: tuples of (start_ind, end_ind, token, surface);
        the list is sorted (and adjusted by ``replace_refspans``) in place
    :param full_string: text the spans refer to
    :return: (new text, list of (new_start, new_end, token, surface)) with one
        entry per input span, ordered by start index
    """
    # TODO: check no spans overlapping
    # TODO: check all spans well-formed

    # every token must match the text found at its indices
    assert all(full_string[s:e] == token for s, e, token, _ in spans_to_replace)

    # no two spans may share a start index
    assert len({item[0] for item in spans_to_replace}) == len(spans_to_replace)

    spans_to_replace.sort(key=lambda item: item[0])

    # walk the spans front to back, carrying the total length change of all
    # earlier replacements; this must happen before replace_refspans runs
    # because that call rewrites the entries of spans_to_replace
    shifted_spans = []
    running_offset = 0
    for start, end, token, surface in spans_to_replace:
        length_change = start + len(surface) - end
        shifted_spans.append(
            (start + running_offset, end + length_change + running_offset, token, surface)
        )
        running_offset += length_change

    # generate new text
    new_text = replace_refspans(spans_to_replace, full_string, btwn_padding="")

    return new_text, shifted_spans
114
+
115
+