diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..47fe0bb8d8c86d93ed651b512f702f6debd4c608 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +.ipynb_checkpoints/ +*.gz +*.pdf diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..5eccaf2a7f6c115cb11e93f1e9d902c69216722c --- /dev/null +++ b/Dockerfile @@ -0,0 +1,55 @@ +FROM ubuntu:22.04 + +# Set Environment Variable +ENV HOME="/root" +ENV JAVA_TOOL_OPTIONS="-Dhttps.protocols=TLSv1.2" +ENV PDF2JSON_HOME="/app/src/s2orc-doc2json" + +# Install system-wide build deps: compilers for Python packages, unzip for the GROBID archive, and a JDK for the GROBID Gradle build; drop the apt lists in the same layer +RUN apt-get -yqq update && \ + apt-get -yqq install software-properties-common curl wget zip unzip screen git gcc build-essential openjdk-8-jdk && rm -rf /var/lib/apt/lists/* + +# Install Miniconda +RUN curl -LO https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ + bash Miniconda3-latest-Linux-x86_64.sh -p /miniconda -b && \ + rm Miniconda3-latest-Linux-x86_64.sh +ENV PATH=/miniconda/bin:${PATH} + +# Create a Python 3.10 environment (-y: never wait for a confirmation prompt); the SHELL below runs every later RUN inside it +RUN conda create -y -n my_env python=3.10 + +SHELL ["conda", "run", "-n", "my_env", "/bin/bash", "-c"] + +WORKDIR /app/src +COPY ./requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +WORKDIR $PDF2JSON_HOME +COPY ./s2orc-doc2json/ . 
+RUN python setup.py develop + +WORKDIR $HOME +RUN wget https://github.com/kermitt2/grobid/archive/0.6.1.zip && \ + unzip 0.6.1.zip && \ + rm 0.6.1.zip + +WORKDIR $HOME/grobid-0.6.1 +RUN ./gradlew clean install && \ + cp $PDF2JSON_HOME/doc2json/grobid2json/grobid/config.yaml $HOME/grobid-0.6.1/grobid-service/config/config.yaml && \ + cp $PDF2JSON_HOME/doc2json/grobid2json/grobid/grobid.properties $HOME/grobid-0.6.1/grobid-home/config/grobid.properties + +WORKDIR /app/models/ +# Download the MemSum word-embedding and arXiv summarization checkpoints at build time +RUN python -c "from huggingface_hub import snapshot_download; model_folder = '/app/models/'; snapshot_download('nianlong/memsum-word-embedding', local_dir = model_folder + 'word_embedding'); snapshot_download('nianlong/memsum-arxiv-summarization', local_dir = model_folder + 'memsum_arxiv' )" + +WORKDIR /app/src +COPY ./Dockerfile . + +WORKDIR /app/src/services +RUN git clone --depth 1 https://github.com/nianlonggu/MemSum + +COPY ./services/ . + +# start app +# CMD is in exec form, so it is not wrapped by the SHELL above: the script starts under plain bash, outside the my_env environment +CMD [ "bash", "./start_service.sh" ] \ No newline at end of file diff --git a/docker-compose.yaml b/docker-compose.yaml new file mode 100644 index 0000000000000000000000000000000000000000..15018e2a3a36406d8e2b6d08bb2395c94ff13097 --- /dev/null +++ b/docker-compose.yaml @@ -0,0 +1,7 @@ +version: '3' + +services: + summarization_service: + build: . 
+ ports: + - 7860:7860 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..8ddc1de3f55b77a2ccaa61a0d53fe74578b56923 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,21 @@ +tqdm +beautifulsoup4==4.7.1 +boto3==1.9.147 +requests==2.21.0 +flask==2.3.2 +flask_cors==4.0.0 +python-magic==0.4.18 +latex2mathml==2.16.2 +gunicorn==20.1.0 +lxml==4.9.0 +unidecode +nltk==3.7 +jsonschema==4.17.3 +six==1.16.0 +numpy==1.21.6 +ujson==5.2.0 +more-itertools==9.1.0 +dateparser==1.1.8 +streamlit +transformers==4.30.0 +torch==2.2.2 \ No newline at end of file diff --git a/s2orc-doc2json/LICENSE b/s2orc-doc2json/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..261eeb9e9f8b2b4b0d119366dda99c6fd7d35c64 --- /dev/null +++ b/s2orc-doc2json/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. 
+ + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/s2orc-doc2json/README.md b/s2orc-doc2json/README.md new file mode 100644 index 0000000000000000000000000000000000000000..47cfd262e1d20cb0e7b9084f441875e774df3ebb --- /dev/null +++ b/s2orc-doc2json/README.md @@ -0,0 +1,138 @@ +# Convert scientific papers to S2ORC JSON + +This project is a part of [S2ORC](https://github.com/allenai/s2orc). For S2ORC, we convert PDFs to JSON using Grobid and a custom TEI.XML to JSON parser. That TEI.XML to JSON parser (`grobid2json`) is made available here. We additionally process LaTeX dumps from arXiv. That parser (`tex2json`) is also made available here. + +The S2ORC github page includes a JSON schema, but it may be easier to understand that schema based on the python classes in `doc2json/s2orc.py`. + +This custom JSON schema is also used for the [CORD-19](https://github.com/allenai/cord19) project, so those who have interacted with CORD-19 may find this format familiar. + +Possible future components (no promises): +- Linking bibliography entries (bibliography consolidation) to papers in S2ORC + +## Setup your environment + +NOTE: Conda is shown but any other python env manager should be fine + +Go [here](https://docs.conda.io/en/latest/miniconda.html) to install the latest version of miniconda. 
+ +Then, create an environment: + +```console +conda create -n doc2json python=3.8 pytest +conda activate doc2json +pip install -r requirements.txt +python setup.py develop +``` + +## PDF Processing + +The current `grobid2json` tool uses Grobid to first process each PDF into XML, then extracts paper components from the XML. + +### Install Grobid + +You will need to have Java installed on your machine. Then, you can install your own version of Grobid and get it running, or you can run the following script: + +```console +bash scripts/setup_grobid.sh +``` + +This will set up Grobid, currently hard-coded as version 0.6.1. Then run: + +```console +bash scripts/run_grobid.sh +``` + +to start the Grobid server. Don't worry if it gets stuck at 87%; this is normal and means Grobid is ready to process PDFs. + +The expected port for the Grobid service is 8070, but you can change this as well. Make sure to edit the port in both the Grobid config file and `grobid/grobid_client.py`. + +### Process a PDF + +There are a couple of test PDFs in `tests/pdf/` if you'd like to try one of them. + +For example, you can try: + +```console +python doc2json/grobid2json/process_pdf.py -i tests/pdf/N18-3011.pdf -t temp_dir/ -o output_dir/ +``` + +This will generate a JSON file in the specified `output_dir`. If unspecified, the file will be in the `output/` directory from your path. + +## LaTeX Processing + +If you want to process LaTeX, you also need to install the following libraries: + +- [latexpand](https://ctan.org/pkg/latexpand?lang=en) (`apt install texlive-extra-utils`) +- [tralics](http://www-sop.inria.fr/marelle/tralics/) (`apt install tralics`) + +To process LaTeX, all files must be in a zip file, similar to the `*.gz` files you can download from arXiv. + +A few examples are available under `tests/latex/`. 
For example, you can try: + +```console +python doc2json/tex2json/process_tex.py -i tests/latex/1911.02782.gz -t temp_dir/ -o output_dir/ +``` + +Again, this will produce a JSON file in the specified `output_dir`. + +## PMC JATS XML Processing + +To process JATS XML, try: + +```console +python doc2json/jats2json/process_jats.py -i tests/jats/PMC5828200.nxml -o output_dir/ +``` + +This will create a JSON file with the same paper id in the specified output directory. + +## Loading a S2ORC JSON file + +The format of S2ORC releases has drifted over time. Use the `load_s2orc` function in `doc2json/s2orc.py` to try and load historic and current S2ORC JSON. + +## Run a Flask app and process documents through a web service + +To process PDFs, you will first need to start Grobid (defaults to port 8070). If you are processing LaTeX, no need for this step. + +```console +bash scripts/run_grobid.sh +``` + +Then, start the Flask app (defaults to port 8080). + +```console +python doc2json/flask/app.py +``` + +Go to [localhost:8080](http://localhost:8080) to upload and process papers. + +Or alternatively, you can do things like: + +```console +curl localhost:8080/ -F file=@tests/pdf/N18-3011.pdf +``` + +## Citation + +If you use this utility in your research, please cite: + +``` +@inproceedings{lo-wang-2020-s2orc, + title = "{S}2{ORC}: The Semantic Scholar Open Research Corpus", + author = "Lo, Kyle and Wang, Lucy Lu and Neumann, Mark and Kinney, Rodney and Weld, Daniel", + booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics", + month = jul, + year = "2020", + address = "Online", + publisher = "Association for Computational Linguistics", + url = "https://www.aclweb.org/anthology/2020.acl-main.447", + doi = "10.18653/v1/2020.acl-main.447", + pages = "4969--4983" +} +``` + +## Contact + +Contributions are welcome. Note the embarrassingly poor test coverage. Also, please note this pipeline is not perfect. 
It will miss text or make errors on most PDFs. The current PDF to JSON step uses Grobid; we may replace this with a different model in the future. + +Issues: contact `lucyw@allenai.org` or `kylel@allenai.org` + diff --git a/s2orc-doc2json/doc2json.egg-info/PKG-INFO b/s2orc-doc2json/doc2json.egg-info/PKG-INFO new file mode 100644 index 0000000000000000000000000000000000000000..7a255bab34af7fa2a0eb301a1d2e93615ea659c8 --- /dev/null +++ b/s2orc-doc2json/doc2json.egg-info/PKG-INFO @@ -0,0 +1,4 @@ +Metadata-Version: 2.1 +Name: doc2json +Version: 0.1 +License-File: LICENSE diff --git a/s2orc-doc2json/doc2json.egg-info/SOURCES.txt b/s2orc-doc2json/doc2json.egg-info/SOURCES.txt new file mode 100644 index 0000000000000000000000000000000000000000..3cb45132651b3a6266ad0396403f61dcc116a20b --- /dev/null +++ b/s2orc-doc2json/doc2json.egg-info/SOURCES.txt @@ -0,0 +1,42 @@ +LICENSE +README.md +setup.py +doc2json/__init__.py +doc2json/config.py +doc2json/s2orc.py +doc2json.egg-info/PKG-INFO +doc2json.egg-info/SOURCES.txt +doc2json.egg-info/dependency_links.txt +doc2json.egg-info/not-zip-safe +doc2json.egg-info/top_level.txt +doc2json/grobid2json/__init__.py +doc2json/grobid2json/pdf_to_tei.py +doc2json/grobid2json/process_pdf.py +doc2json/grobid2json/tei_to_json.py +doc2json/grobid2json/grobid/__init__.py +doc2json/grobid2json/grobid/client.py +doc2json/grobid2json/grobid/grobid_client.py +doc2json/jats2json/__init__.py +doc2json/jats2json/jats_to_json.py +doc2json/jats2json/process_jats.py +doc2json/jats2json/pmc_utils/__init__.py +doc2json/jats2json/pmc_utils/all_tag_utils.py +doc2json/jats2json/pmc_utils/back_tag_utils.py +doc2json/jats2json/pmc_utils/extract_utils.py +doc2json/jats2json/pmc_utils/front_tag_utils.py +doc2json/jats2json/pmc_utils/tests.py +doc2json/spp2json/__init__.py +doc2json/spp2json/process_pdf.py +doc2json/spp2json/spp/__init__.py +doc2json/spp2json/spp/spp_client.py +doc2json/spp2json/spp/spp_json_to_s2orc_json.py +doc2json/tex2json/__init__.py 
+doc2json/tex2json/process_tex.py +doc2json/tex2json/tex_to_xml.py +doc2json/tex2json/xml_to_json.py +doc2json/utils/__init__.py +doc2json/utils/citation_util.py +doc2json/utils/grobid_util.py +doc2json/utils/latex_util.py +doc2json/utils/refspan_util.py +doc2json/utils/soup_utils.py \ No newline at end of file diff --git a/s2orc-doc2json/doc2json.egg-info/dependency_links.txt b/s2orc-doc2json/doc2json.egg-info/dependency_links.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/s2orc-doc2json/doc2json.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/s2orc-doc2json/doc2json.egg-info/not-zip-safe b/s2orc-doc2json/doc2json.egg-info/not-zip-safe new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/s2orc-doc2json/doc2json.egg-info/not-zip-safe @@ -0,0 +1 @@ + diff --git a/s2orc-doc2json/doc2json.egg-info/top_level.txt b/s2orc-doc2json/doc2json.egg-info/top_level.txt new file mode 100644 index 0000000000000000000000000000000000000000..39eed2abbc790f9591ea0071068dfea8190b1238 --- /dev/null +++ b/s2orc-doc2json/doc2json.egg-info/top_level.txt @@ -0,0 +1 @@ +doc2json diff --git a/s2orc-doc2json/doc2json/__init__.py b/s2orc-doc2json/doc2json/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/s2orc-doc2json/doc2json/config.py b/s2orc-doc2json/doc2json/config.py new file mode 100644 index 0000000000000000000000000000000000000000..2cea9a4e01cb035610dbc246b96c8280e04b5dd3 --- /dev/null +++ b/s2orc-doc2json/doc2json/config.py @@ -0,0 +1,2 @@ +S2ORC_NAME_STRING = 'S2ORC' +S2ORC_VERSION_STRING = '1.0.0' \ No newline at end of file diff --git a/s2orc-doc2json/doc2json/flask/app.py b/s2orc-doc2json/doc2json/flask/app.py new file mode 100644 index 0000000000000000000000000000000000000000..74ae0dae4e855efb3e2c6677d09c98fe8fca1f56 --- 
/dev/null +++ b/s2orc-doc2json/doc2json/flask/app.py @@ -0,0 +1,57 @@ +""" +Flask app for S2ORC pdf2json utility +""" +import hashlib +from flask import Flask, request, jsonify, flash, url_for, redirect, render_template, send_file +from doc2json.grobid2json.process_pdf import process_pdf_stream +from doc2json.tex2json.process_tex import process_tex_stream +from doc2json.jats2json.process_jats import process_jats_stream + +app = Flask(__name__) + +ALLOWED_EXTENSIONS = {'pdf', 'gz', 'nxml'} + + +@app.route('/') +def home(): + return render_template("home.html") + +@app.route('/', methods=['POST']) +def upload_file(): + uploaded_file = request.files['file'] + if uploaded_file.filename != '': + filename = uploaded_file.filename + # read pdf file + if filename.endswith('pdf'): + pdf_stream = uploaded_file.stream + pdf_content = pdf_stream.read() + # compute hash + pdf_sha = hashlib.sha1(pdf_content).hexdigest() + # get results + results = process_pdf_stream(filename, pdf_sha, pdf_content) + return jsonify(results) + # read latex file + elif filename.endswith('gz'): + zip_stream = uploaded_file.stream + zip_content = zip_stream.read() + # get results + results = process_tex_stream(filename, zip_content) + return jsonify(results) + # read nxml file (jats) + elif filename.endswith('nxml'): + xml_stream = uploaded_file.stream + xml_content = xml_stream.read() + # get results + results = process_jats_stream(filename, xml_content) + return jsonify(results) + # unknown + else: + return { + "Error": "Unknown file type!" 
+ } + + return redirect(url_for('index')) + + +if __name__ == '__main__': + app.run(port=8080, host='0.0.0.0') diff --git a/s2orc-doc2json/doc2json/flask/static/style.css b/s2orc-doc2json/doc2json/flask/static/style.css new file mode 100644 index 0000000000000000000000000000000000000000..6dcf4a4ec6ee8d3f7cbb04432c49dfc28b00fe33 --- /dev/null +++ b/s2orc-doc2json/doc2json/flask/static/style.css @@ -0,0 +1,40 @@ +html { + box-sizing: border-box; + } + + * { + box-sizing: inherit; + font-family: Calibri, Arial, sans-serif !important; + } + + h1 { + font-size: 32px; + } + + h2, h3 { + font-size: 24px; + } + + body { + margin: 20px; + font-size: 125%; + line-height: 1.4; + max-width: 800px; + margin: 0 auto; + } + + footer { + margin-top: 50px; + border-top: 1px solid silver; + font-size: 0.8em; + } + + footer ol { + padding-left: 20px; + } + + .p { + text-align: center; + font-size: .75em; + padding-top: 150px; + } \ No newline at end of file diff --git a/s2orc-doc2json/doc2json/flask/templates/home.html b/s2orc-doc2json/doc2json/flask/templates/home.html new file mode 100644 index 0000000000000000000000000000000000000000..9ee764038853bf89fe52500c12712184f4d8d63e --- /dev/null +++ b/s2orc-doc2json/doc2json/flask/templates/home.html @@ -0,0 +1,18 @@ + + + + + S2ORC doc2json + + + +

S2ORC doc2json utility

+

Upload a scientific PDF, LaTeX zip file, or JATS XML file and get back a JSON:

+

(Accepted file extensions: *.pdf, *.gz, *.nxml)

+
+

+

+
+

Please wait, processing takes time...

+ + \ No newline at end of file diff --git a/s2orc-doc2json/doc2json/grobid2json/__init__.py b/s2orc-doc2json/doc2json/grobid2json/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/s2orc-doc2json/doc2json/grobid2json/grobid/Readme.md b/s2orc-doc2json/doc2json/grobid2json/grobid/Readme.md new file mode 100644 index 0000000000000000000000000000000000000000..73b77008fdc894724249123c49f80cf783496b2c --- /dev/null +++ b/s2orc-doc2json/doc2json/grobid2json/grobid/Readme.md @@ -0,0 +1,92 @@ +# Simple python client for GROBID REST services + +**NOTE: This README is adapted from GROBID** + +This Python client can be used to process in an efficient concurrent manner a set of PDF in a given directory by the [GROBID](https://github.com/kermitt2/grobid) service. Results are written in a given output directory and include the resulting XML TEI representation of the PDF. + +## Build and run + +You need first to install and start the *grobid* service, latest stable version, see the [documentation](http://grobid.readthedocs.io/). It is assumed that the server will run on the address `http://localhost:8070`. You can change the server address by editing the file `config.json`. + +## Requirements + +This client has been developed and tested with Python 3.5. 
+ +## Install + +Get the github repo: + +> git clone https://github.com/kermitt2/grobid-client-python + +> cd grobid-client-python + +It is advised to setup first a virtual environment to avoid falling into one of these gloomy python dependency marshlands: + +> virtualenv --system-site-packages -p python3 env + +> source env/bin/activate + +## Usage and options + +``` +usage: grobid-client.py [-h] [--input INPUT] [--config CONFIG] + [--output OUTPUT] [--n N] + service + +Client for GROBID services + +positional arguments: + service one of [processFulltextDocument, + processHeaderDocument, processReferences] + +optional arguments: + -h, --help show this help message and exit + --input INPUT path to the directory containing PDF to process + --output OUTPUT path to the directory where to put the results + --config CONFIG path to the config file, default is ./config.json + --n N concurrency for service usage + --generateIDs generate random xml:id to textual XML elements of the + result files + --consolidate_header call GROBID with consolidation of the metadata + extracted from the header + --consolidate_citations + call GROBID with consolidation of the extracted + bibliographical references +``` + +Examples: + +> python3 grobid-client.py --input ~/tmp/in2 --output ~/tmp/out processFulltextDocument + +This command will process all the PDF files present in the input directory (files with extension `.pdf` only) with the `processFulltextDocument` service of GROBID, and write the resulting XML TEI files under the output directory, reusing the file name with a different file extension (`.tei.xml`), using the default `10` concurrent workers. 
+ +> python3 grobid-client.py --input ~/tmp/in2 --output ~/tmp/out --n 20 processHeaderDocument + +This command will process all the PDF files present in the input directory (files with extension `.pdf` only) with the `processHeaderDocument` service of GROBID, and write the resulting XML TEI files under the output directory, reusing the file name with a different file extension (`.tei.xml`), using `20` concurrent workers. + +## Benchmarking + +Full text processing of __136 PDF__ (total 3443 pages, in average 25 pages per PDF) on Intel Core i7-4790K CPU 4.00GHz, 4 cores (8 threads), 16GB memory, `n` being the concurrency parameter: + +| n | runtime (s)| s/PDF | PDF/s | +|----|------------|-------|-------| +| 1 | 209.0 | 1.54 | 0.65 | +| 2 | 112.0 | 0.82 | 1.21 | +| 3 | 80.4 | 0.59 | 1.69 | +| 5 | 62.9 | 0.46 | 2.16 | +| 8 | 55.7 | 0.41 | 2.44 | +| 10 | 55.3 | 0.40 | 2.45 | + +![Runtime Plot](resources/20180928112135.png) + +As complementary info, GROBID processing of header of the 136 PDF and with `n=10` takes 3.74 s (15 times faster than the complete full text processing because only the two first pages of the PDF are considered), 36 PDF/s. In similar conditions, extraction and structuring of bibliographical references takes 26.9 s (5.1 PDF/s). + +## Todo + +Benchmarking with more files (e.g. million ISTEX PDF). Also implement existing GROBID services for text input (date, name, affiliation/address, raw bibliographical references, etc.). Better support for parameters (including elements where to put coordinates). + +## License and contact + +Distributed under [Apache 2.0 license](http://www.apache.org/licenses/LICENSE-2.0). 
+ +Main author and contact: Patrice Lopez () diff --git a/s2orc-doc2json/doc2json/grobid2json/grobid/__init__.py b/s2orc-doc2json/doc2json/grobid2json/grobid/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/s2orc-doc2json/doc2json/grobid2json/grobid/client.py b/s2orc-doc2json/doc2json/grobid2json/grobid/client.py new file mode 100644 index 0000000000000000000000000000000000000000..92a0d06e8fcae219582a0b2d703ee07726de5084 --- /dev/null +++ b/s2orc-doc2json/doc2json/grobid2json/grobid/client.py @@ -0,0 +1,225 @@ +""" Generic API Client """ +from copy import deepcopy +import json +import requests + +try: + from urlparse import urljoin # Python 2 module name +except ImportError: + from urllib.parse import urljoin # Python 3 location + + +class ApiClient(object): + """ Client to interact with a generic Rest API. + + Subclasses should implement functionality accordingly with the provided + service methods, i.e. ``get``, ``post``, ``put`` and ``delete``. + """ + + accept_type = 'application/xml' + api_base = None + + def __init__( + self, + base_url, + username=None, + api_key=None, + status_endpoint=None, + timeout=60 + ): + """ Initialise client. + + Args: + base_url (str): The base URL to the service being used. + username (str): The username to authenticate with. + api_key (str): The API key to authenticate with. + timeout (int): Maximum time before timing out. + """ + self.base_url = base_url + self.username = username + self.api_key = api_key + self.status_endpoint = urljoin(self.base_url, status_endpoint) # status_endpoint is resolved against base_url + self.timeout = timeout # NOTE(review): only stored here; confirm where it is passed on to requests + + @staticmethod + def encode(request, data): + """ Add request content data to request body, set Content-type header. + + Should be overridden by subclasses if not using JSON encoding. + + Args: + request (HTTPRequest): The request object. + data (dict, None): Data to be encoded. + + Returns: + HTTPRequest: The request object. 
+ """ + if data is None: + return request + + request.add_header('Content-Type', 'application/json') + request.data = json.dumps(data) + + return request + + @staticmethod + def decode(response): + """ Decode the returned data in the response. + + Should be overridden by subclasses if something else than JSON is + expected. + + Args: + response (HTTPResponse): The response object. + + Returns: + dict or None. + """ + try: + return response.json() + except ValueError as e: + return e.message + + def get_credentials(self): + """ Returns parameters to be added to authenticate the request. + + This lives on its own to make it easier to re-implement it if needed. + + Returns: + dict: A dictionary containing the credentials. + """ + return {"username": self.username, "api_key": self.api_key} + + def call_api( + self, + method, + url, + headers=None, + params=None, + data=None, + files=None, + timeout=None, + ): + """ Call API. + + This returns object containing data, with error details if applicable. + + Args: + method (str): The HTTP method to use. + url (str): Resource location relative to the base URL. + headers (dict or None): Extra request headers to set. + params (dict or None): Query-string parameters. + data (dict or None): Request body contents for POST or PUT requests. + files (dict or None: Files to be passed to the request. + timeout (int): Maximum time before timing out. + + Returns: + ResultParser or ErrorParser. + """ + headers = deepcopy(headers) or {} + headers['Accept'] = self.accept_type + params = deepcopy(params) or {} + data = data or {} + files = files or {} + #if self.username is not None and self.api_key is not None: + # params.update(self.get_credentials()) + r = requests.request( + method, + url, + headers=headers, + params=params, + files=files, + data=data, + timeout=timeout, + ) + + return r, r.status_code + + def get(self, url, params=None, **kwargs): + """ Call the API with a GET request. 
+ + Args: + url (str): Resource location relative to the base URL. + params (dict or None): Query-string parameters. + + Returns: + ResultParser or ErrorParser. + """ + return self.call_api( + "GET", + url, + params=params, + **kwargs + ) + + def delete(self, url, params=None, **kwargs): + """ Call the API with a DELETE request. + + Args: + url (str): Resource location relative to the base URL. + params (dict or None): Query-string parameters. + + Returns: + ResultParser or ErrorParser. + """ + return self.call_api( + "DELETE", + url, + params=params, + **kwargs + ) + + def put(self, url, params=None, data=None, files=None, **kwargs): + """ Call the API with a PUT request. + + Args: + url (str): Resource location relative to the base URL. + params (dict or None): Query-string parameters. + data (dict or None): Request body contents. + files (dict or None: Files to be passed to the request. + + Returns: + An instance of ResultParser or ErrorParser. + """ + return self.call_api( + "PUT", + url, + params=params, + data=data, + files=files, + **kwargs + ) + + def post(self, url, params=None, data=None, files=None, **kwargs): + """ Call the API with a POST request. + + Args: + url (str): Resource location relative to the base URL. + params (dict or None): Query-string parameters. + data (dict or None): Request body contents. + files (dict or None: Files to be passed to the request. + + Returns: + An instance of ResultParser or ErrorParser. + """ + return self.call_api( + method="POST", + url=url, + params=params, + data=data, + files=files, + **kwargs + ) + + def service_status(self, **kwargs): + """ Call the API to get the status of the service. + + Returns: + An instance of ResultParser or ErrorParser. 
+ """ + return self.call_api( + 'GET', + self.status_endpoint, + params={'format': 'json'}, + **kwargs + ) diff --git a/s2orc-doc2json/doc2json/grobid2json/grobid/config.yaml b/s2orc-doc2json/doc2json/grobid2json/grobid/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ac42c312fa03ea6dfa2a9f3a9def1469b03f7841 --- /dev/null +++ b/s2orc-doc2json/doc2json/grobid2json/grobid/config.yaml @@ -0,0 +1,36 @@ +grobid: + # NOTE: change these values to absolute paths when running on production + grobidHome: "grobid-home" + + # how to load the models, + # false -> models are loaded when needed (default), avoiding puting in memory useless models + # true -> all the models are loaded into memory at the server statup, slow the start of the services and models not + # used will take some memory + modelPreload: true + +server: + type: custom + applicationConnectors: + - type: http + port: 8070 + adminConnectors: + - type: http + port: 8071 + registerDefaultExceptionMappers: false + + +logging: + level: WARN + loggers: + org.apache.pdfbox.pdmodel.font.PDSimpleFont: "OFF" + appenders: + - type: console + threshold: ALL + timeZone: UTC +# - type: file +# currentLogFilename: logs/grobid-service.log +# threshold: ALL +# archive: true +# archivedLogFilenamePattern: logs/grobid-service-%d.log +# archivedFileCount: 5 +# timeZone: UTC diff --git a/s2orc-doc2json/doc2json/grobid2json/grobid/grobid.properties b/s2orc-doc2json/doc2json/grobid2json/grobid/grobid.properties new file mode 100644 index 0000000000000000000000000000000000000000..59076ec1ed879dbf411e43be809bfd877abb5640 --- /dev/null +++ b/s2orc-doc2json/doc2json/grobid2json/grobid/grobid.properties @@ -0,0 +1,59 @@ +#-------------------- resource directories --------------------- +# properties of where to find directories necessary for GROBID +# EACH KEY REFERENCING A PATH HAS TO ENDS WITH ".path" +grobid.resource.path=./resources +grobid.temp.path=./tmp +grobid.bin.path=./bin + +#-------------------- 
external/native libs --------------------- +#path to folder containing native libraries of 3rd parties +grobid.nativelibrary.path=./lib +grobid.3rdparty.pdf2xml.path=./pdf2xml +grobid.3rdparty.pdf2xml.memory.limit.mb=6096 +grobid.3rdparty.pdf2xml.timeout.sec=60 +#------------------------------------------------------------- + +#-------------------- consolidation -------------------- +# Define the bibliographical data consolidation service to be used, either "crossref" for CrossRef REST API or "glutton" for https://github.com/kermitt2/biblio-glutton +grobid.consolidation.service=crossref +#grobid.consolidation.service=glutton +#org.grobid.glutton.host=cloud.science-miner.com/glutton +#org.grobid.glutton.port=0 +org.grobid.glutton.host=localhost +org.grobid.glutton.port=8070 +#org.grobid.crossref.mailto=toto@titi.tutu +#org.grobid.crossref.token=yourmysteriouscrossrefmetadataplusauthorizationtokentobeputhere + +#-------------------- proxy -------------------- +#proxy to be used for external call to the crossref REST API service or Glutton service if not deployed under proxy ("null" when no proxy) +grobid.proxy_host=null +grobid.proxy_port=null +#------------------------------------------------------ + +#-------------------- runtime ------------------ +grobid.crf.engine=wapiti +#grobid.crf.engine=delft +#grobid.crf.engine=crfpp +grobid.delft.install=../delft +grobid.delft.useELMo=false +grobid.pdf.blocks.max=100000 +grobid.pdf.tokens.max=1000000 + +#-------------------- training ------------------ +#number of threads for training the wapiti models (0 to use all available processors) +grobid.nb_threads=0 + +#-------------------- language identification ------------------ +#property for using or not the language identifier (true|false) +grobid.use_language_id=true +grobid.language_detector_factory=org.grobid.core.lang.impl.CybozuLanguageDetectorFactory +#determines if properties like the firstnames, lastnames country codes and dictionaries are supposed to be read from 
$GROBID_HOME path or not (possible values (true|false) dafault is false) +grobid.resources.inHome=true +#------------------------------------------------------ + +#-------------------- pooling ------------------- +# Maximum parallel connections allowed +org.grobid.max.connections=72 +# Maximum time wait to get a connection when the pool is full (in seconds) +org.grobid.pool.max.wait=1 +#------------------------------------------------------ diff --git a/s2orc-doc2json/doc2json/grobid2json/grobid/grobid_client.py b/s2orc-doc2json/doc2json/grobid2json/grobid/grobid_client.py new file mode 100644 index 0000000000000000000000000000000000000000..640c39791df1314ecc3b16aeaf0a92eda24e1e92 --- /dev/null +++ b/s2orc-doc2json/doc2json/grobid2json/grobid/grobid_client.py @@ -0,0 +1,249 @@ +import os +import io +import json +import argparse +import time +import glob +from doc2json.grobid2json.grobid.client import ApiClient +import ntpath +from typing import List + +''' +This version uses the standard ProcessPoolExecutor for parallelizing the concurrent calls to the GROBID services. +Given the limits of ThreadPoolExecutor (input stored in memory, blocking Executor.map until the whole input +is acquired), it works with batches of PDF of a size indicated in the config.json file (default is 1000 entries). +We are moving from first batch to the second one only when the first is entirely processed - which means it is +slightly sub-optimal, but should scale better. However acquiring a list of million of files in directories would +require something scalable too, which is not implemented for the moment. 
DEFAULT_GROBID_CONFIG = {
    "grobid_server": "localhost",
    "grobid_port": "8070",
    "batch_size": 1000,
    "sleep_time": 5,
    "generateIDs": False,
    "consolidate_header": False,
    "consolidate_citations": False,
    "include_raw_citations": True,
    "include_raw_affiliations": False,
    "max_workers": 2,
}


class GrobidClient(ApiClient):
    """Client for the GROBID REST services, configured from a plain dict."""

    def __init__(self, config=None):
        """
        :param config: dict with the keys of DEFAULT_GROBID_CONFIG; a falsy value selects the defaults
        """
        # NOTE: ApiClient.__init__ is deliberately not called here (unchanged
        # from the original), so no base_url/timeout attributes are set.
        self.config = config or DEFAULT_GROBID_CONFIG
        self.generate_ids = self.config["generateIDs"]
        self.consolidate_header = self.config["consolidate_header"]
        self.consolidate_citations = self.config["consolidate_citations"]
        self.include_raw_citations = self.config["include_raw_citations"]
        self.include_raw_affiliations = self.config["include_raw_affiliations"]
        self.max_workers = self.config["max_workers"]
        self.grobid_server = self.config["grobid_server"]
        self.grobid_port = self.config["grobid_port"]
        self.sleep_time = self.config["sleep_time"]

    def _service_url(self, service):
        """Build the URL of one GROBID API service."""
        # str() lets a config file give the port as a number as well as a string
        return 'http://' + self.grobid_server + ":" + str(self.grobid_port) + "/api/" + service

    @staticmethod
    def _flag(enabled):
        """Encode a boolean option the way the request form data carries it."""
        return '1' if enabled else '0'

    @staticmethod
    def _without_pdf_suffix(path):
        """Drop a trailing '.pdf' from path, leaving every other character alone."""
        return path[:-len(".pdf")] if path.endswith(".pdf") else path

    def _post_until_available(self, **post_kwargs):
        """POST via self.post, sleeping and retrying for as long as the status is 503."""
        while True:
            res, status = self.post(**post_kwargs)
            if status != 503:
                return res, status
            time.sleep(self.sleep_time)

    def _process_text(self, service, the_data, log_file, log_label, raw_string):
        """POST a text payload to a service; log the input and return None on failure."""
        res, status = self._post_until_available(
            url=self._service_url(service),
            data=the_data,
            headers={'Accept': 'text/plain'}
        )
        if status != 200:
            with open(log_file, "a+") as failed:
                failed.write("-- " + log_label + " --\n")
                failed.write(raw_string + "\n\n")
            return None
        return res.text

    def process(self, input: str, output: str, service: str):
        """
        Process every '*.pdf' file directly inside a directory, batch by batch
        :param input: directory containing the PDF files
        :param output: directory receiving the '.tei.xml' files
        :param service: name of the GROBID service to call
        :return:
        """
        batch_size_pdf = self.config['batch_size']
        pdf_files = []

        for pdf_file in glob.glob(input + "/*.pdf"):
            pdf_files.append(pdf_file)

            if len(pdf_files) == batch_size_pdf:
                self.process_batch(pdf_files, output, service)
                pdf_files = []

        # last batch
        if len(pdf_files) > 0:
            self.process_batch(pdf_files, output, service)

    def process_batch(self, pdf_files: List[str], output: str, service: str) -> None:
        """
        Process a list of PDF files one after the other
        :param pdf_files:
        :param output:
        :param service:
        :return:
        """
        print(len(pdf_files), "PDF files to process")
        for pdf_file in pdf_files:
            self.process_pdf(pdf_file, output, service)

    def process_pdf_stream(self, pdf_file: str, pdf_strm: bytes, output: str, service: str) -> str:
        """
        Send one PDF, given as bytes, to a GROBID service
        :param pdf_file: name of the PDF, used as upload file name and in the failure log
        :param pdf_strm: content of the PDF
        :param output: directory in which 'failed.log' is appended on failure
        :param service: name of the GROBID service to call
        :return: text of the response, or "" when the service did not answer 200
        """
        # process the stream
        files = {
            'input': (
                pdf_file,
                pdf_strm,
                'application/pdf',
                {'Expires': '0'}
            )
        }

        # set the GROBID parameters
        the_data = {
            'generateIDs': self._flag(self.generate_ids),
            'consolidateHeader': self._flag(self.consolidate_header),
            'consolidateCitations': self._flag(self.consolidate_citations),
            'includeRawAffiliations': self._flag(self.include_raw_affiliations),
            'includeRawCitations': self._flag(self.include_raw_citations),
        }

        # A loop replaces the former recursive retry, which dropped the
        # `output` argument and therefore raised TypeError on the first 503.
        res, status = self._post_until_available(
            url=self._service_url(service),
            files=files,
            data=the_data,
            headers={'Accept': 'text/plain'}
        )

        if status != 200:
            # make sure the failure can be logged even if nobody created `output` yet
            if output:
                os.makedirs(output, exist_ok=True)
            with open(os.path.join(output, "failed.log"), "a+") as failed:
                # str.strip(".pdf") would eat any of '.', 'p', 'd', 'f' from both ends
                failed.write(self._without_pdf_suffix(pdf_file) + "\n")
            print('Processing failed with error ' + str(status))
            return ""

        return res.text

    def process_pdf(self, pdf_file: str, output: str, service: str) -> None:
        """
        Process one PDF file and write '<name>.tei.xml' into the output directory
        :param pdf_file: path of the PDF
        :param output: directory receiving the TEI file
        :param service: name of the GROBID service to call
        :return:
        """
        # check if TEI file is already produced
        # we use ntpath here to be sure it will work on Windows too
        pdf_file_name = ntpath.basename(pdf_file)
        filename = os.path.join(output, os.path.splitext(pdf_file_name)[0] + '.tei.xml')
        if os.path.isfile(filename):
            return

        print(pdf_file)
        # context manager so the handle is closed right after reading
        with open(pdf_file, 'rb') as pdf_handle:
            pdf_strm = pdf_handle.read()
        tei_text = self.process_pdf_stream(pdf_file, pdf_strm, output, service)

        # writing TEI file
        if tei_text:
            with io.open(filename, 'w+', encoding='utf8') as tei_file:
                tei_file.write(tei_text)

    def process_citation(self, bib_string: str, log_file: str) -> str:
        """
        Parse one raw citation string with the processCitation service
        :param bib_string: raw citation
        :param log_file: file to which the citation is appended on a non-200, non-503 answer
        :return: text of the response, or None when no attempt succeeded
        """
        # process citation raw string and return corresponding dict
        the_data = {
            'citations': bib_string,
            'consolidateCitations': '0'
        }

        the_url = self._service_url("processCitation")

        # best effort: at most 5 attempts, request errors count as an attempt
        for _ in range(5):
            try:
                res, status = self.post(
                    url=the_url,
                    data=the_data,
                    headers={'Accept': 'text/plain'}
                )
                if status == 503:
                    time.sleep(self.sleep_time)
                    continue
                elif status != 200:
                    with open(log_file, "a+") as failed:
                        failed.write("-- BIBSTR --\n")
                        failed.write(bib_string + "\n\n")
                    break
                else:
                    return res.text
            except Exception:
                continue
        return None

    def process_header_names(self, header_string: str, log_file: str) -> str:
        """
        Parse author names with the processHeaderNames service
        :param header_string: raw author string
        :param log_file: file to which the input is appended on failure
        :return: text of the response, or None on failure
        """
        # process author names from header string
        return self._process_text(
            "processHeaderNames", {'names': header_string}, log_file, "AUTHOR", header_string
        )

    def process_affiliations(self, aff_string: str, log_file: str) -> str:
        """
        Parse affiliations with the processAffiliations service
        :param aff_string: raw affiliation string
        :param log_file: file to which the input is appended on failure
        :return: text of the response, or None on failure
        """
        # process affiliation from input string
        return self._process_text(
            "processAffiliations", {'affiliations': aff_string}, log_file, "AFFILIATION", aff_string
        )


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Client for GROBID services")
    parser.add_argument("service", help="one of [processFulltextDocument, processHeaderDocument, processReferences]")
    parser.add_argument("--input", default=None, help="path to the directory containing PDF to process")
    parser.add_argument("--output", default=None, help="path to the directory where to put the results")
    parser.add_argument("--config", default=None, help="path to the config file, default is ./config.json")

    args = parser.parse_args()

    input_path = args.input
    if args.config:
        # close the config file once it is parsed
        with open(args.config) as config_file:
            config = json.load(config_file)
    else:
        config = DEFAULT_GROBID_CONFIG
    output_path = args.output
    service = args.service

    client = GrobidClient(config=config)

    start_time = time.time()

    client.process(input_path, output_path, service)

    runtime = round(time.time() - start_time, 3)
    print("runtime: %s seconds " % (runtime))
BASE_TEMP_DIR = 'temp'
BASE_OUTPUT_DIR = 'output'
BASE_LOG_DIR = 'log'


def process_pdf_stream(input_file: str, sha: str, input_stream: bytes, grobid_config: Optional[Dict] = None) -> Dict:
    """
    Process PDF stream
    :param input_file: name of the PDF, passed on to the Grobid client and the TEI converter
    :param sha: passed on to the TEI converter
    :param input_stream: content of the PDF
    :param grobid_config: configuration for GrobidClient (None selects its defaults)
    :return: result of release_json('pdf') on the converted paper
    """
    # process PDF through Grobid -> TEI.XML
    client = GrobidClient(grobid_config)
    tei_text = client.process_pdf_stream(input_file, input_stream, 'temp', "processFulltextDocument")

    # make soup
    soup = BeautifulSoup(tei_text, "xml")

    # get paper
    paper = convert_tei_xml_soup_to_s2orc_json(soup, input_file, sha)

    return paper.release_json('pdf')


def process_pdf_file(
        input_file: str,
        temp_dir: str = BASE_TEMP_DIR,
        output_dir: str = BASE_OUTPUT_DIR,
        grobid_config: Optional[Dict] = None
) -> str:
    """
    Process a PDF file and get JSON representation
    :param input_file: path of the PDF
    :param temp_dir: directory receiving the intermediate '.tei.xml' file
    :param output_dir: directory receiving the '.json' file
    :param grobid_config: configuration for GrobidClient (None selects its defaults)
    :return: path of the JSON file written
    """
    os.makedirs(temp_dir, exist_ok=True)
    os.makedirs(output_dir, exist_ok=True)

    # get paper id as the name of the file; splitext/basename is the same rule
    # the Grobid client uses to name the TEI file, so both agree even for
    # names without an extension or starting with a dot
    paper_id = os.path.splitext(os.path.basename(input_file))[0]
    tei_file = os.path.join(temp_dir, f'{paper_id}.tei.xml')
    output_file = os.path.join(output_dir, f'{paper_id}.json')

    # check if input file exists and output file doesn't
    if not os.path.exists(input_file):
        raise FileNotFoundError(f"{input_file} doesn't exist")
    if os.path.exists(output_file):
        # only reported: processing goes on and the file is rewritten below
        print(f'{output_file} already exists!')

    # process PDF through Grobid -> TEI.XML
    client = GrobidClient(grobid_config)
    # TODO: compute PDF hash
    # TODO: add grobid version number to output
    client.process_pdf(input_file, temp_dir, "processFulltextDocument")

    # process TEI.XML -> JSON
    assert os.path.exists(tei_file), f'Grobid produced no TEI file for {input_file} (expected {tei_file})'
    paper = convert_tei_xml_file_to_s2orc_json(tei_file)

    # write to file
    with open(output_file, 'w') as outf:
        json.dump(paper.release_json(), outf, indent=4, sort_keys=False)

    return output_file


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Run S2ORC PDF2JSON")
    parser.add_argument("-i", "--input", default=None, help="path to the input PDF file")
    parser.add_argument("-t", "--temp", default=BASE_TEMP_DIR, help="path to the temp dir for putting tei xml files")
    parser.add_argument("-o", "--output", default=BASE_OUTPUT_DIR, help="path to the output dir for putting json files")
    parser.add_argument("-k", "--keep", action='store_true')

    args = parser.parse_args()

    input_path = args.input
    temp_path = args.temp
    output_path = args.output
    # parsed but not used anywhere below
    keep_temp = args.keep

    start_time = time.time()

    os.makedirs(temp_path, exist_ok=True)
    os.makedirs(output_path, exist_ok=True)

    process_pdf_file(input_path, temp_path, output_path)

    runtime = round(time.time() - start_time, 3)
    print("runtime: %s seconds " % (runtime))
    print('done.')
def extract_formulas_from_tei_xml(sp: BeautifulSoup) -> None:
    """
    Replace all formulas with the text
    :param sp: soup to modify in place
    :return:
    """
    for eq in sp.find_all('formula'):
        eq.replace_with(sp.new_string(eq.text.strip()))


def table_to_html(table: bs4.element.Tag) -> str:
    """
    Sub table tags with html table tags
    :param table: table element; every direct child that is not a 'row' is removed from it
    :return: the table serialised with the REPLACE_TABLE_TOKS substitutions applied
    """
    # Iterate over a snapshot: removing a child while walking the live child
    # list makes the iterator skip the sibling that follows it.
    for tag in list(table.contents):
        if tag.name != 'row':
            print(f'Unknown table subtag: {tag.name}')
            # extract() is available on every kind of node, text nodes included
            tag.extract()
    table_str = str(table)
    for token, subtoken in REPLACE_TABLE_TOKS.items():
        table_str = table_str.replace(token, subtoken)
    return table_str


def extract_figures_and_tables_from_tei_xml(sp: BeautifulSoup) -> Dict[str, Dict]:
    """
    Generate figure and table dicts
    :param sp: soup; every processed figure element is removed from it
    :return: dict mapping the normalized xml:id of each figure/table to its description
    """
    ref_map = dict()

    for fig in sp.find_all('figure'):
        try:
            if fig.name and fig.get('xml:id'):
                if fig.get('type') == 'table':
                    table_el = fig.table
                    ref_map[normalize_grobid_id(fig.get('xml:id'))] = {
                        "text": fig.figDesc.text.strip() if fig.figDesc else fig.head.text.strip() if fig.head else "",
                        "latex": None,
                        "type": "table",
                        # a table figure without a <table> child used to raise
                        # TypeError, which the handler below does not catch
                        "content": table_to_html(table_el) if table_el is not None else ""
                    }
                else:
                    ref_map[normalize_grobid_id(fig.get('xml:id'))] = {
                        "text": fig.figDesc.text.strip() if fig.figDesc else "",
                        "latex": None,
                        "type": "figure",
                        "content": ""
                    }
        except AttributeError:
            # the figure is skipped and, as before, left in the soup
            continue
        fig.decompose()

    return ref_map


def check_if_citations_are_bracket_style(sp: BeautifulSoup) -> bool:
    """
    Check if the document has bracket style citations
    :param sp:
    :return: True when more than BRACKET_STYLE_THRESHOLD citation strings match BRACKET_REGEX
    """
    cite_strings = []
    if sp.body:
        for div in sp.body.find_all('div'):
            # divs that contain a head element are not inspected
            if div.head:
                continue
            for rtag in div.find_all('ref'):
                ref_type = rtag.get('type')
                if ref_type == 'bibr':
                    cite_strings.append(rtag.text.strip())

        # check how many match bracket style
        bracket_style = [bool(BRACKET_REGEX.match(cite_str)) for cite_str in cite_strings]

        # return true if the number of bracket style citations exceeds the threshold
        if sum(bracket_style) > BRACKET_STYLE_THRESHOLD:
            return True

    return False
def sub_all_note_tags(sp: BeautifulSoup) -> BeautifulSoup:
    """
    Swap every note element of the soup for a p element holding the note's stripped text
    :param sp: soup, modified in place
    :return: the same soup
    """
    for note_el in sp.find_all('note'):
        replacement = sp.new_tag('p')
        replacement.string = note_el.text.strip()
        note_el.replace_with(replacement)
    return sp


def process_formulas_in_paragraph(para_el: BeautifulSoup, sp: BeautifulSoup) -> None:
    """
    Turn every formula of the paragraph into plain text, followed by its label if it has one
    :param para_el: paragraph element, modified in place
    :param sp: soup used to create the replacement strings
    :return:
    """
    for formula_el in para_el.find_all('formula'):
        label_el = formula_el.label
        suffix = ''
        if label_el:
            # a space separates the formula from its label; the label element
            # goes away first so its text is not counted twice
            suffix = ' ' + label_el.text
            label_el.decompose()
        formula_el.replace_with(sp.new_string(f'{formula_el.text.strip()}{suffix}'))


def process_references_in_paragraph(para_el: BeautifulSoup, sp: BeautifulSoup, refs: Dict) -> Dict:
    """
    Replace the table and figure references of a paragraph with unique tokens
    :param para_el: paragraph element, modified in place
    :param sp: soup used to create the replacement strings
    :param refs: known figures/tables, keyed by normalized id
    :return: dict mapping each token to (normalized ref id or None, surface form, ref type)
    """
    token_source = UniqTokenGenerator('REFTOKEN')
    found = dict()
    for ref_el in para_el.find_all('ref'):
        try:
            kind = ref_el.get('type')
            # citations are not handled here
            if kind == 'bibr':
                continue
            if kind not in ('table', 'figure'):
                # any other reference: keep only its surface form
                ref_el.replace_with(sp.new_string(ref_el.text.strip()))
                continue
            target = ref_el.get('target')
            # the id is kept only when it resolves to a known figure/table
            resolved = (
                normalize_grobid_id(target)
                if target and normalize_grobid_id(target) in refs
                else None
            )
            token = token_source.next()
            found[token] = (resolved, ref_el.text.strip(), kind)
            ref_el.replace_with(sp.new_string(f" {token} "))
        except AttributeError:
            continue
    return found
def process_citations_in_paragraph(para_el: BeautifulSoup, sp: BeautifulSoup, bibs: Dict, bracket: bool) -> Dict:
    """
    Process all citations in paragraph and generate a dict for surface forms.
    Every processed ref element is replaced in the tree by a unique CITETOKEN
    string (or by its plain surface form), so para_el is modified in place.
    :param para_el: paragraph element whose ref elements are processed
    :param sp: soup used to create the replacement strings
    :param bibs: bibliography entries; only membership of ref ids is tested
    :param bracket: if True, only bracket-looking surface forms are kept as citations
        and ranges such as [1]-[3] are expanded
    :return: dict mapping each token to (ref id or None, surface form)
    """
    # CHECK if range between two surface forms is appropriate for bracket style expansion
    def _get_surface_range(start_surface, end_surface):
        span1_match = SINGLE_BRACKET_REGEX.match(start_surface)
        span2_match = SINGLE_BRACKET_REGEX.match(end_surface)
        if span1_match and span2_match:
            # get numbers corresponding to citations
            span1_num = int(span1_match.group(1))
            span2_num = int(span2_match.group(1))
            # expand only if the difference is strictly between 1 and 20
            if 1 < span2_num - span1_num < 20:
                return span1_num, span2_num
        return None

    # CREATE BIBREF range between two reference ids, e.g. BIBREF1-BIBREF4 -> BIBREF1 BIBREF2 BIBREF3 BIBREF4
    def _create_ref_id_range(start_ref_id, end_ref_id):
        # [6:] drops the first six characters, i.e. the 'BIBREF' prefix
        start_ref_num = int(start_ref_id[6:])
        end_ref_num = int(end_ref_id[6:])
        return [f'BIBREF{curr_ref_num}' for curr_ref_num in range(start_ref_num, end_ref_num + 1)]

    # CREATE surface form range between two bracket strings, e.g. [1]-[4] -> [1] [2] [3] [4]
    def _create_surface_range(start_number, end_number):
        return [f'[{n}]' for n in range(start_number, end_number + 1)]

    # create citation dict with keywords
    cite_map = dict()
    tokgen = UniqTokenGenerator('CITETOKEN')

    for rtag in para_el.find_all('ref'):
        try:
            # get surface span, e.g. [3]
            surface_span = rtag.text.strip()

            # check if target is available (#b2 -> BID2)
            if rtag.get('target'):
                # normalize reference string
                rtag_ref_id = normalize_grobid_id(rtag.get('target'))

                # skip if rtag ref_id not in bibliography
                # (the span is still recorded, with None as its ref id)
                if rtag_ref_id not in bibs:
                    cite_key = tokgen.next()
                    rtag.replace_with(sp.new_string(f" {cite_key} "))
                    cite_map[cite_key] = (None, surface_span)
                    continue

                # if bracket style, only keep if surface form is bracket
                if bracket:
                    # valid bracket span: starts with '[' or ends with ']' or ','
                    if surface_span and (surface_span[0] == '[' or surface_span[-1] == ']' or surface_span[-1] == ','):
                        pass
                    # invalid, replace tag with surface form and continue to next ref tag
                    else:
                        rtag.replace_with(sp.new_string(f" {surface_span} "))
                        continue
                # not bracket, add cite span and move on
                else:
                    cite_key = tokgen.next()
                    rtag.replace_with(sp.new_string(f" {cite_key} "))
                    cite_map[cite_key] = (rtag_ref_id, surface_span)
                    continue

                ### EXTRA PROCESSING FOR BRACKET STYLE CITATIONS; EXPAND RANGES ###
                # look backward for range marker, e.g. [1]-*[3]*
                # (collects the text between this ref and the previous ref; stops at any other element)
                backward_between_span = ""
                for sib in rtag.previous_siblings:
                    if sib.name == 'ref':
                        break
                    elif type(sib) == NavigableString:
                        backward_between_span += sib
                    else:
                        break

                # check if there's a backwards expansion, e.g. need to expand [1]-[3] -> [1] [2] [3]
                if is_expansion_string(backward_between_span):
                    # get surface number range
                    # (if there is no previous ref, the AttributeError raised here is caught below)
                    surface_num_range = _get_surface_range(
                        rtag.find_previous_sibling('ref').text.strip(),
                        surface_span
                    )
                    # if the surface number range is reasonable (range < 20, in order), EXPAND
                    if surface_num_range:
                        # delete previous ref tag and anything in between (i.e. delete "-" and extra spaces)
                        for sib in rtag.previous_siblings:
                            if sib.name == 'ref':
                                break
                            elif type(sib) == NavigableString:
                                sib.replace_with(sp.new_string(""))
                            else:
                                break

                        # get ref id of previous ref, e.g. [1] (#b0 -> BID0)
                        previous_rtag = rtag.find_previous_sibling('ref')
                        previous_rtag_ref_id = normalize_grobid_id(previous_rtag.get('target'))
                        previous_rtag.decompose()

                        # replace this ref tag with the full range expansion, e.g. [3] (#b2 -> BID1 BID2)
                        id_range = _create_ref_id_range(previous_rtag_ref_id, rtag_ref_id)
                        surface_range = _create_surface_range(surface_num_range[0], surface_num_range[1])
                        replace_string = ''
                        # zip stops at the shorter list if the id range and the surface range differ in length
                        for range_ref_id, range_surface_form in zip(id_range, surface_range):
                            # only replace if ref id is in bibliography, else add none
                            if range_ref_id in bibs:
                                cite_key = tokgen.next()
                                cite_map[cite_key] = (range_ref_id, range_surface_form)
                            else:
                                cite_key = tokgen.next()
                                cite_map[cite_key] = (None, range_surface_form)
                            replace_string += cite_key + ' '
                        rtag.replace_with(sp.new_string(f" {replace_string} "))
                    # ELSE do not expand backwards and replace previous and current rtag with appropriate ref id
                    else:
                        # add mapping between ref id and surface form for previous ref tag
                        previous_rtag = rtag.find_previous_sibling('ref')
                        previous_rtag_ref_id = normalize_grobid_id(previous_rtag.get('target'))
                        previous_rtag_surface = previous_rtag.text.strip()
                        cite_key = tokgen.next()
                        previous_rtag.replace_with(sp.new_string(f" {cite_key} "))
                        cite_map[cite_key] = (previous_rtag_ref_id, previous_rtag_surface)

                        # add mapping between ref id and surface form for current reftag
                        cite_key = tokgen.next()
                        rtag.replace_with(sp.new_string(f" {cite_key} "))
                        cite_map[cite_key] = (rtag_ref_id, surface_span)
                else:
                    # look forward and see if expansion string, e.g. *[1]*-[3]
                    forward_between_span = ""
                    for sib in rtag.next_siblings:
                        if sib.name == 'ref':
                            break
                        elif type(sib) == NavigableString:
                            forward_between_span += sib
                        else:
                            break
                    # look forward for range marker (if is a range, continue -- range will be expanded
                    # when we get to the second value)
                    if is_expansion_string(forward_between_span):
                        continue
                    # else treat like normal reference
                    else:
                        cite_key = tokgen.next()
                        rtag.replace_with(sp.new_string(f" {cite_key} "))
                        cite_map[cite_key] = (rtag_ref_id, surface_span)

            else:
                # no target attribute: record the span without a ref id
                cite_key = tokgen.next()
                rtag.replace_with(sp.new_string(f" {cite_key} "))
                cite_map[cite_key] = (None, surface_span)
        except AttributeError:
            # the ref is skipped; whatever was already replaced for it stays replaced
            continue

    return cite_map


def process_paragraph(
        sp: BeautifulSoup,
        para_el: bs4.element.Tag,
        section_names: List[Tuple],
        bib_dict: Dict,
        ref_dict: Dict,
        bracket: bool
) -> Dict:
    """
    Process one paragraph: formulas, figure/table references and citations are
    replaced inside para_el, then the text and the span offsets are produced.
    :param sp: soup used to create replacement strings
    :param para_el: paragraph element, modified in place
    :param section_names: stored unchanged under 'section' in the result
    :param bib_dict: bibliography entries, used to resolve citations
    :param ref_dict: figures and tables, used to resolve references
    :param bracket: if bracket style, expand and clean up citations
    :return: dict with keys 'text', 'cite_spans', 'ref_spans', 'eq_spans', 'section'
    """
    # return empty paragraph if no text
    if not para_el.text:
        return {
            'text': "",
            'cite_spans': [],
            'ref_spans': [],
            'eq_spans': [],
            'section': section_names
        }

    # replace formulas with formula text
    process_formulas_in_paragraph(para_el, sp)

    # get references to tables and figures
    ref_map = process_references_in_paragraph(para_el, sp, ref_dict)

    # generate citation map for paragraph element (keep only cite spans with bib entry or unlinked)
    cite_map = process_citations_in_paragraph(para_el, sp, bib_dict, bracket)

    # substitute space characters
    # (the first call collapses every whitespace run into one space)
    para_text = re.sub(r'\s+', ' ', para_el.text)
    para_text = re.sub(r'\s', ' ', para_text)

    # get all cite and ref spans
    # each entry is (start offset, end offset, token, surface form)
    all_spans_to_replace = []
    for span in re.finditer(r'(CITETOKEN\d+)', para_text):
        uniq_token = span.group()
        ref_id, surface_text = cite_map[uniq_token]
        all_spans_to_replace.append((
            span.start(),
            span.start() + len(uniq_token),
            uniq_token,
            surface_text
        ))
    for span in re.finditer(r'(REFTOKEN\d+)', para_text):
        uniq_token = span.group()
        ref_id, surface_text, ref_type = ref_map[uniq_token]
        all_spans_to_replace.append((
            span.start(),
            span.start() + len(uniq_token),
            uniq_token,
            surface_text
        ))

    # replace cite and ref spans and create json blobs
    para_text, all_spans_to_replace = sub_spans_and_update_indices(all_spans_to_replace, para_text)

    cite_span_blobs = [{
        "start": start,
        "end": end,
        "text": surface,
        "ref_id": cite_map[token][0]
    } for start, end, token, surface in all_spans_to_replace if token.startswith('CITETOKEN')]

    ref_span_blobs = [{
        "start": start,
        "end": end,
        "text": surface,
        "ref_id": ref_map[token][0]
    } for start, end, token, surface in all_spans_to_replace if token.startswith('REFTOKEN')]

    # sanity checks: every span must cover exactly its surface form in the final text
    for cite_blob in cite_span_blobs:
        assert para_text[cite_blob["start"]:cite_blob["end"]] == cite_blob["text"]

    for ref_blob in ref_span_blobs:
        assert para_text[ref_blob["start"]:ref_blob["end"]] == ref_blob["text"]

    return {
        'text': para_text,
        'cite_spans': cite_span_blobs,
        'ref_spans': ref_span_blobs,
        'eq_spans': [],
        'section': section_names
    }
all_spans_to_replace.append(( + span.start(), + span.start() + len(uniq_token), + uniq_token, + surface_text + )) + for span in re.finditer(r'(REFTOKEN\d+)', para_text): + uniq_token = span.group() + ref_id, surface_text, ref_type = ref_map[uniq_token] + all_spans_to_replace.append(( + span.start(), + span.start() + len(uniq_token), + uniq_token, + surface_text + )) + + # replace cite and ref spans and create json blobs + para_text, all_spans_to_replace = sub_spans_and_update_indices(all_spans_to_replace, para_text) + + cite_span_blobs = [{ + "start": start, + "end": end, + "text": surface, + "ref_id": cite_map[token][0] + } for start, end, token, surface in all_spans_to_replace if token.startswith('CITETOKEN')] + + ref_span_blobs = [{ + "start": start, + "end": end, + "text": surface, + "ref_id": ref_map[token][0] + } for start, end, token, surface in all_spans_to_replace if token.startswith('REFTOKEN')] + + for cite_blob in cite_span_blobs: + assert para_text[cite_blob["start"]:cite_blob["end"]] == cite_blob["text"] + + for ref_blob in ref_span_blobs: + assert para_text[ref_blob["start"]:ref_blob["end"]] == ref_blob["text"] + + return { + 'text': para_text, + 'cite_spans': cite_span_blobs, + 'ref_spans': ref_span_blobs, + 'eq_spans': [], + 'section': section_names + } + + +def extract_abstract_from_tei_xml( + sp: BeautifulSoup, + bib_dict: Dict, + ref_dict: Dict, + cleanup_bracket: bool +) -> List[Dict]: + """ + Parse abstract from soup + :param sp: + :param bib_dict: + :param ref_dict: + :param cleanup_bracket: + :return: + """ + abstract_text = [] + if sp.abstract: + # process all divs + if sp.abstract.div: + for div in sp.abstract.find_all('div'): + if div.text: + if div.p: + for para in div.find_all('p'): + if para.text: + abstract_text.append( + process_paragraph(sp, para, [(None, "Abstract")], bib_dict, ref_dict, cleanup_bracket) + ) + else: + if div.text: + abstract_text.append( + process_paragraph(sp, div, [(None, "Abstract")], bib_dict, ref_dict, 
def extract_body_text_from_div(
        sp: BeautifulSoup,
        div: bs4.element.Tag,
        sections: List[Tuple],
        bib_dict: Dict,
        ref_dict: Dict,
        cleanup_bracket: bool
) -> List[Dict]:
    """
    Parse body text from a div, recursing into nested divs.

    Nested divs are handled first (extending the section path when the div has a head),
    then the direct children of ``div``: 'p' tags become paragraph blobs and 'formula'
    tags become standalone 'EQUATION' chunks.

    :param sp: soup object passed through to process_paragraph
    :param div: element whose content is extracted; heads and formula labels are decomposed
    :param sections: section path (list of (number, title) tuples) of the enclosing divs
    :param bib_dict: bibliography entries keyed by ref id
    :param ref_dict: figure / table entries keyed by ref id
    :param cleanup_bracket: passed through to process_paragraph
    :return: list of paragraph / equation chunks in processing order
    """
    chunks = []
    # check if nested divs; recursively process
    if div.div:
        # Direct children only: each recursion level visits its own child divs. The
        # default (recursive) find_all also returned grandchildren here, so a div nested
        # two or more levels deep was processed once by its parent and again from every
        # ancestor, producing duplicate chunks with a truncated section path.
        for subdiv in div.find_all('div', recursive=False):
            # has header, add to section list and process
            if subdiv.head:
                chunks += extract_body_text_from_div(
                    sp,
                    subdiv,
                    sections + [(subdiv.head.get('n', None), subdiv.head.text.strip())],
                    bib_dict,
                    ref_dict,
                    cleanup_bracket
                )
                subdiv.head.decompose()
            # no header, process with same section list
            else:
                chunks += extract_body_text_from_div(
                    sp,
                    subdiv,
                    sections,
                    bib_dict,
                    ref_dict,
                    cleanup_bracket
                )
    # process tags individuals
    for tag in div:
        try:
            if tag.name == 'p':
                if tag.text:
                    chunks.append(process_paragraph(
                        sp, tag, sections, bib_dict, ref_dict, cleanup_bracket
                    ))
            elif tag.name == 'formula':
                # the label (equation number) is removed so it is not part of the raw text
                label = tag.label.text
                tag.label.decompose()
                eq_text = tag.text
                chunks.append({
                    'text': 'EQUATION',
                    'cite_spans': [],
                    'ref_spans': [],
                    'eq_spans': [
                        {
                            "start": 0,
                            "end": 8,
                            "text": "EQUATION",
                            "ref_id": "EQREF",
                            "raw_str": eq_text,
                            "eq_num": label
                        }
                    ],
                    'section': sections
                })
        except AttributeError:
            # e.g. a formula without a label: fall back to treating the tag as plain text
            if tag.text:
                chunks.append(process_paragraph(
                    sp, tag, sections, bib_dict, ref_dict, cleanup_bracket
                ))

    return chunks
def convert_tei_xml_file_to_s2orc_json(tei_file: str, pdf_hash: str = "") -> Paper:
    """
    Convert a TEI XML file to S2ORC JSON
    :param tei_file: path to the TEI XML file; the paper id is the file name up to its first '.'
    :param pdf_hash: hash of the source PDF, passed through to the Paper
    :return: the Paper built by convert_tei_xml_soup_to_s2orc_json
    :raises FileNotFoundError: if tei_file does not exist
    """
    if not os.path.exists(tei_file):
        raise FileNotFoundError("Input TEI XML file doesn't exist")
    # basename also copes with platform-specific separators, unlike splitting on '/'
    paper_id = os.path.basename(tei_file).split('.')[0]
    # read inside a context manager so the file handle is closed deterministically
    with open(tei_file, "rb") as tei_in:
        soup = BeautifulSoup(tei_in.read(), "xml")
    return convert_tei_xml_soup_to_s2orc_json(soup, paper_id, pdf_hash)
def process_body_tag(body_tag, soup) -> Dict:
    """
    Turn the body tag into a list of paragraph blobs.

    xref and sup/sub tags are first replaced in place by string placeholders. Top-level
    'sec' tags are then parsed recursively; when the body has no sections at all, every
    paragraph in the body is parsed directly.

    Returns a dict with the single key 'body_text'.
    """
    # replace all xref tags with string placeholders
    replace_xref_with_string_placeholders(soup_tag=body_tag, soup=soup)

    # replace all sup/sub tags with string placeholders
    replace_sup_sub_tags_with_string_placeholders(soup_tag=body_tag, soup=soup)

    # some articles (like PMC2844102) have no sections
    sec_tags = body_tag.find_all('sec', recursive=False)

    # try looking in article tag
    if not sec_tags:
        # Explicit None check instead of a bare `except: pass`, which also hid
        # unrelated errors (and KeyboardInterrupt) raised while searching.
        article_tag = body_tag.article
        if article_tag is not None:
            sec_tags = article_tag.find_all('sec', recursive=False)

    if sec_tags:
        all_par_blobs = []
        for sec_tag in sec_tags:
            # note; most sections dont have this 'sec-type' attribute
            if sec_tag.get('sec-type') == 'supplementary-material':
                # hopefully all the important supplementary content already extracted above in previous step
                continue
            all_par_blobs.extend(recurse_parse_section(sec_tag=sec_tag))
    else:
        all_par_blobs = parse_all_paragraphs_in_section(body_tag)

    return {
        'body_text': all_par_blobs,
    }
glossary = {} + # if back_tag.find('glossary'): + # for def_item_tag in back_tag.find('glossary').find_all('def-item'): + # glossary[def_item_tag.find('term').text] = def_item_tag.find('def').text + + # TODO: author contrib and COIs + # notes = [] + # for notes_tag in back_tag.find_all('notes'): + # pass + + # TODO: PMC2778891 has back tag that looks like: Acknowledgements

Supported by the Austrian Science Fund (P-20670 and W11).

def postprocess_front_tags_for_s2orc(init_front_dict: Dict):
    """
    Fix authors and year for S2ORC format.

    Mutates and returns ``init_front_dict``:
    - every author gets an 'affiliation' dict (built from the author's first affiliation
      id when a matching affiliation exists, otherwise empty) and loses the
      'affiliation_ids', 'corresponding' and 'orcid' keys;
    - the top-level 'affiliations' entry is removed;
    - 'year' (a dict of dates by kind) is replaced by the first non-empty value among
      'epub', 'accepted', 'collection', 'received', or None. The value is passed
      through unchanged; no type conversion happens here.
    """
    affiliations = init_front_dict.get('affiliations') or []

    # Make authors in front tags look like S2ORC
    for author in init_front_dict['authors']:
        author['affiliation'] = {}
        # pop with a default: authors lacking one of these optional keys used to raise KeyError
        affiliation_ids = author.pop('affiliation_ids', None)
        author.pop('corresponding', None)
        author.pop('orcid', None)
        # get affiliation if available (only the first listed id is used)
        if affiliation_ids:
            first_id = affiliation_ids[0]
            affil_text = [affil['text'] for affil in affiliations if affil['id'] == first_id]
            if affil_text:
                author['affiliation'] = {
                    'laboratory': "",
                    'institution': affil_text[0],
                    'location': {}
                }
    init_front_dict.pop('affiliations', None)

    # Pick best year: first kind, in priority order, that has a non-empty value
    dates = init_front_dict['year']
    init_front_dict['year'] = next(
        (dates[kind] for kind in ('epub', 'accepted', 'collection', 'received') if dates.get(kind)),
        None
    )

    return init_front_dict
def convert_jats_xml_to_s2orc_json(jats_file: str, log_dir: str):
    """
    Convert JATS XML to S2ORC JSON
    :param jats_file: path to the JATS XML file; the paper id is the file name up to its first '.'
    :param log_dir: not used by this function
    :return: Paper with metadata, abstract, body text, back matter, bib entries and ref entries
    """
    # get file id (PMC id usually)
    file_id = jats_file.split('/')[-1].split('.')[0]

    # read JATS XML
    with open(jats_file, 'r') as f_in:
        soup = BeautifulSoup(f_in, 'lxml')
        destroy_unimportant_tags_inplace(soup, tags_to_remove=['bold', 'italic', 'graphic'])

    # all the XML files have their own wonky reference IDs. we want to standardize them, but need to remember the old->new mapping
    old_key_to_new_key = {}

    # REFERENCES
    table_blobs = extract_table_blobs(soup)
    figure_blobs = extract_fig_blobs(soup)
    # TODO: not current represented in S2ORC, keep for later
    # NOTE(review): result unused; the call is kept because, like the two extractors
    # above, it presumably removes the tags from the soup - confirm before dropping it
    suppl_blobs = extract_suppl_blobs(soup)
    # TODO: for S2ORC, need to process them into a single ref dict. need to construct new IDs to match ID conventions. and update all cite spans.
    # also, S2ORC table captions are free text without detected reference/citation mentions
    # TODO: may want to keep table representations around
    ref_entries = {}
    for i, (old_table_key, table_blob) in enumerate(sorted(table_blobs.items())):
        # TODO: PMC2557072 table `tbl5` has no label. skip.
        # TODO: PMC3137981 table `tab1` has no caption text. skip.
        if not table_blob['label'] or not table_blob['caption']:
            continue
        table_text = table_blob['label'] + ': ' + ' '.join(
            [c['text'] for c in table_blob['caption']]
        ) + '\n' + ' '.join([f['text'] for f in table_blob['footnote']])
        new_table_key = f'TABREF{i}'
        old_key_to_new_key[old_table_key] = new_table_key
        # TODO: skipping over any citations or references in the table for now
        # Reset per table: previously `table_content` was only assigned when the table had
        # parsed XML, so a table without it either raised UnboundLocalError (first table)
        # or silently reused the previous table's content.
        table_content = table_blob['xml'][0]['text'] if table_blob['xml'] else ''
        ref_entries[new_table_key] = {'text': table_text, 'content': table_content, 'type': 'table'}
    for i, (old_figure_key, figure_blob) in enumerate(sorted(figure_blobs.items())):
        # TODO: double-check, but it seems like figure blobs dont have footnotes parsed out? might be bug
        # TODO: PMC1326260 first figure has no ['label']. just skip these for now (because no inline references)
        # TODO: PMC2403743 has null-valued caption in `fig1`. also skip here. fix later.
        if not figure_blob['label'] or not figure_blob['caption']:
            continue
        figure_text = figure_blob['label'] + ': ' + ' '.join([c['text'] for c in figure_blob['caption']])
        new_figure_key = f'FIGREF{i}'
        old_key_to_new_key[old_figure_key] = new_figure_key
        ref_entries[new_figure_key] = {'text': figure_text, 'type': 'figure'}

    # FRONT TAGS
    front_tag = soup.find('front').extract()
    front_dict = process_front_tag(front_tag=front_tag, soup=soup)
    front_dict = postprocess_front_tags_for_s2orc(front_dict)
    # note: the abstract is remapped before any bib keys are added to the mapping below
    front_dict['abstract'] = convert_paragraphs_to_s2orc(front_dict['abstract'], old_key_to_new_key)

    # BACK TAGS
    back_tag = soup.find('back')
    back_dict = {}
    bib_entries = {}
    # PMC1139917 doesnt have 'back' tag
    if back_tag is not None:
        back_dict = process_back_tag(back_tag=back_tag)
        # TODO: format bib entries to S2ORC format. we're already very close, but need a couple changes:
        #   - author blobs include a 'suffix' which defaults to empty string
        #   - issn defaults to empty string
        #   - rename all the bib IDs
        for i, (old_bib_key, bib_entry) in enumerate(sorted(back_dict['bib_entries'].items())):
            del bib_entry['ref_id']
            new_bib_key = f'BIBREF{i}'
            old_key_to_new_key[old_bib_key] = new_bib_key
            bib_entries[new_bib_key] = bib_entry

    if back_dict and back_dict.get('acknowledgements'):
        back_dict['acknowledgements'] = convert_acks_to_s2orc(back_dict['acknowledgements'])

    # BODY TAGS
    body_tag = soup.find('body')
    # PMC1240684 doesnt have 'body' tag
    if body_tag is not None:
        body_dict = process_body_tag(body_tag=body_tag, soup=soup)
        body_text = body_dict['body_text']
    else:
        # Has no body: /disk2/gorpus/20200101/pmc/Br_Foreign_Med_Chir_Rev/PMC5163425.nxml
        body_text = []

    body_text = convert_paragraphs_to_s2orc(body_text, old_key_to_new_key)

    metadata = {
        "title": front_dict['title'],
        "authors": front_dict['authors'],
        "year": front_dict['year'],
        "venue": front_dict['journal_name'],
        "identifiers": {
            "doi": front_dict['doi'],
            "pubmed_id": front_dict['pubmed_id'],
            "pmc_id": front_dict['pmc_id']
        }
    }

    return Paper(
        paper_id=file_id,
        pdf_hash="",
        metadata=metadata,
        abstract=front_dict['abstract'],
        body_text=body_text,
        back_matter=back_dict.get('acknowledgements', []),
        bib_entries=bib_entries,
        ref_entries=ref_entries
    )
def recurse_parse_section(
    sec_tag,
    # suppl_blobs: Dict
) -> List[Dict]:
    """Recursive function for getting paragraph blobs to look like
        {
            'text': ...,
            ...,
            'section': SUBSUBSECTION_NAME :: SUBSECTION_NAME :: SECTION_NAME
        }

    A section without direct 'sec' children is parsed as a leaf. Otherwise each child is
    parsed recursively and this section's title is appended to every returned blob's
    'section' string. A section without a title contributes an empty name, the same
    fallback `parse_all_paragraphs_in_section` uses for leaf sections.
    """
    subsections = sec_tag.find_all("sec", recursive=False)
    if not subsections:
        return parse_all_paragraphs_in_section(
            sec_tag=sec_tag
        )  # , suppl_blobs=suppl_blobs)

    # NOTE(review): only the child sections are visited here, so paragraphs sitting
    # directly in a section that also has subsections do not appear in the output -
    # confirm whether that is intended.

    # Look the title up once; `.find("title").text` raised AttributeError when the
    # section had no title at all.
    title_tag = sec_tag.find("title")
    parent_title = title_tag.text if title_tag is not None else ""

    outputs = []
    for child in subsections:
        child_blobs = recurse_parse_section(
            sec_tag=child
        )  # , suppl_blobs=suppl_blobs)
        for blob in child_blobs:
            # PMC373254 - process blob['section'] to remove any span markers left in there
            for t in ALL_TOKENS:
                blob['section'] = blob['section'].replace(t, '')
            blob["section"] = blob["section"] + " :: " + parent_title
        outputs.extend(child_blobs)
    return outputs
def parse_formulas(
        para_el,
        sp,
        replace
):
    """Collect the inline formulas of a paragraph and substitute them in place.

    Every 'inline-formula' tag under ``para_el`` gets a key ``INLINEFORM<n>``. When
    ``replace`` is true the tag is replaced by that key (padded with spaces); otherwise
    it is replaced by the formula text, unless no usable text was found.

    Returns a dict mapping each key to ``(text, latex, mathml, id attribute)``.
    Formulas that raise AttributeError while being processed are skipped, but still
    consume an index.
    """
    # This module never imports latex2mathml, so the LaTeX -> MathML fallback below used
    # to fail with NameError. Import it here and degrade to "no MathML" when the package
    # is not installed.
    try:
        import latex2mathml.converter as latex_converter
    except ImportError:
        latex_converter = None

    # sub and get corresponding spans of inline formulas
    formula_dict = dict()
    eq_ind = 0
    for ftag in para_el.find_all('inline-formula'):
        try:
            formula_key = f'INLINEFORM{eq_ind}'
            eq_ind += 1
            try:
                formula_text = ftag.find('mml:math').text
            except AttributeError:
                # no mml:math child; use the raw text unless it is a full LaTeX document
                if 'begin{document}' not in ftag.text:
                    formula_text = ftag.text
                else:
                    formula_text = "FORMULA"
            formula_latex = get_latex_from_formula(ftag)
            formula_mathml = get_mathml_from_formula(ftag)
            if not formula_mathml and formula_latex and latex_converter is not None:
                formula_mathml = latex_converter.convert(formula_latex)
            formula_dict[formula_key] = (formula_text, formula_latex, formula_mathml, ftag.get('id'))
            if replace:
                ftag.replace_with(sp.new_string(f" {formula_key} "))
            else:
                # replace with mathml text if available
                if formula_text != 'FORMULA':
                    ftag.replace_with(sp.new_string(f" {formula_text} "))
        except AttributeError:
            continue

    return formula_dict
Placeholder tags are merged away, and related spans are registered. + stack = [] + full_text = [] + pos = 0 + disable_count = False + for token in tokens: + if token in START_TOKENS: + stack.append(token) + stack.append(pos) + stack.append(token.replace('start', 'sep')) + elif token in SEP_TOKENS: + assert stack + stack.append(token) + disable_count = True + elif token in END_TOKENS: + assert stack + disable_count = False + args = _reduce_args(stack, token) + start_pos = args[0][0] + text = "".join(args[1]) + assert len(args) == 2 or len(args) == 4 + if len(args) == 2: + ref_id, ref_type = None, None + elif len(args) == 4: + ref_id = args[2] and args[2][0] + ref_type = args[3] and args[3][0] + stack.append(text) + _add_spans( + token, + start_pos, + text, + ref_id, + ref_type, + cite_spans, + fig_spans, + table_spans, + sup_spans, + sub_spans, + ) + else: # just normal text + stack.append(token) + if not disable_count: # metadata appearing after a separator + full_text.append(token) + pos += len(token) + + full_text = "".join(full_text) + assert pos == len(full_text) + + title = sec_tag.find("title") + title = title.text if title else "" + + # get all equation spans + eq_spans = [] + for span in itertools.chain( + re.finditer(r'(INLINEFORM\d+)', full_text), + re.finditer(r'(DISPLAYFORM\d+)', full_text) + ): + try: + matching_formula = formula_dict[span.group()] + eq_spans.append({ + "start": span.start(), + "end": span.start() + len(span.group()), + "text": matching_formula[0], + "latex": matching_formula[1], + "mathml": matching_formula[2], + "ref_id": span.group() + }) + except KeyError: + continue + + outputs.append( + { + "text": full_text, + 'cite_spans': cite_spans, + 'fig_spans': fig_spans, + 'table_spans': table_spans, + # 'suppl_spans': suppl_spans, + 'sup_spans': sup_spans, + 'sub_spans': sub_spans, + 'eq_spans': eq_spans, + "section": title, + } + ) + return outputs diff --git a/s2orc-doc2json/doc2json/jats2json/pmc_utils/back_tag_utils.py 
def parse_authors(authors_tag) -> List:
    """The PMC XML has a slightly different format than authors listed in front tag.

    Builds one dict per direct 'name' child of ``authors_tag`` with the keys 'first',
    'middle' (list), 'last' and 'suffix'. The first given name becomes 'first', the rest
    'middle'; missing parts default to '' (or [] for 'middle'). Returns [] when
    ``authors_tag`` is falsy.
    """
    if not authors_tag:
        return []

    authors = []
    for name_tag in authors_tag.find_all('name', recursive=False):
        surname = name_tag.find('surname')
        given_names_tag = name_tag.find('given-names')
        suffix = name_tag.find('suffix')
        # split() without a separator drops the empty strings that split(' ') produced
        # for leading, trailing or repeated whitespace (e.g. a '' first name)
        given_names = given_names_tag.text.split() if given_names_tag is not None else []
        authors.append({
            'first': given_names[0] if given_names else '',
            'middle': given_names[1:],
            'last': surname.text if surname is not None else '',
            'suffix': suffix.text if suffix is not None else ''
        })
    return authors
def extract_fig_blobs(body_tag) -> Dict:
    """Remove every 'fig' tag from ``body_tag`` and return the figures keyed by their id.

    Each value holds the figure's 'label' text (None when there is no label tag) and its
    'caption'; the captions are post-processed by `_update_fig_blobs` before returning.
    Figures without an 'id' attribute are still removed from the tree but are not
    returned, mirroring how `extract_table_blobs` treats tables without an id.
    """
    fig_blobs = {}
    for fig_tag in body_tag.find_all('fig'):
        fig = fig_tag.extract()
        fig_id = fig.get('id')
        # `fig['id']` raised KeyError for figures that carry no id attribute
        if not fig_id:
            continue
        label = fig.find('label')
        fig_blobs[fig_id] = {
            'label': label and label.text,
            'caption': fig.find('caption')
        }
    _update_fig_blobs(fig_blobs)
    return fig_blobs
non-p tags w/ p tags in figure caption (mostly dealing with title tags, which weren't being extracted before) + for tag in fig_blob['caption']: + if type(tag) == bs4.element.Tag and tag.name != 'p': + tag.name = 'p' + par_blobs = parse_all_paragraphs_in_section(sec_tag=fig_blob['caption'], replace_formula=False) + for par_blob in par_blobs: + del par_blob['section'] + fig_blob['caption'] = par_blobs + + +def extract_table_blobs(body_tag) -> Dict: + # note 1: footnotes dont always exist for each table; hence the if statement + # note 2: we want to preserve the XML tags for tables, but also need to run it through the regex cleaner for xrefs and other spans + # hence, wrapping all of the table XML text into a fake

paragraph tag + table_blobs = {} + for table_tag in body_tag.find_all('table-wrap'): + table = table_tag.extract() + label = table.find('label') + # TODO: currently restricting to tables with identifiers. might want to include unreferenced tables once we care more. + if table.get('id'): + table_blobs[table['id']] = { + 'label': label and label.text, + 'caption': table.find('caption'), + 'footnote': table.find('table-wrap-foot') if table.find('table-wrap-foot') else BeautifulSoup('

', 'xml'), + 'xml': BeautifulSoup('

' + str(table.find('table')) + '

', 'xml') + } + _update_table_blobs(table_blobs) + return table_blobs + + +def _update_table_blobs(table_blobs: Dict): + for table_blob in table_blobs.values(): + if table_blob['caption'] is not None: + # replace non-p tags w/ p tags in table caption (mostly dealing with title tags, which weren't being extracted before) + for tag in table_blob['caption']: + if type(tag) == bs4.element.Tag and tag.name != 'p': + tag.name = 'p' + par_blobs = parse_all_paragraphs_in_section(sec_tag=table_blob['caption'], replace_formula=False) + for par_blob in par_blobs: + del par_blob['section'] + table_blob['caption'] = par_blobs + if table_blob['footnote'] is not None: + par_blobs = parse_all_paragraphs_in_section(sec_tag=table_blob['footnote'], replace_formula=False) + for par_blob in par_blobs: + del par_blob['section'] + table_blob['footnote'] = par_blobs + # note: if we dont include `par_to_text` function, the parser will convert all

tags to text via `par_tag.text` + # which actually removes all XML tags we wanted to preserve in table. + # by passing in str(), we ensure to keep all of those tags + if table_blob['xml'] is not None: + par_blobs = parse_all_paragraphs_in_section(sec_tag=table_blob['xml'], par_to_text=str, replace_formula=False) + for par_blob in par_blobs: + del par_blob['section'] + table_blob['xml'] = par_blobs + + +def extract_suppl_blobs(body_tag) -> Dict: + suppl_blobs = {} + for suppl_tag in body_tag.find_all('supplementary-material'): + suppl = suppl_tag.extract() + # We only care about supplementary material that can be referenced (like figures/tables) + # for example, we dont care about PMC1139917 which has supplementary material but without an ID + if 'id' in suppl: + label = suppl.find('label') + suppl_blobs[suppl['id']] = { + 'label': label and label.text, + 'caption': suppl.find('caption') + } + _update_suppl_blobs(suppl_blobs) + return suppl_blobs + + +def _update_suppl_blobs(suppl_blobs: Dict): + for suppl_blob in suppl_blobs.values(): + if suppl_blob['caption'] is None: + continue + par_blobs = parse_all_paragraphs_in_section(sec_tag=suppl_blob['caption']) + for par_blob in par_blobs: + del par_blob['section'] + suppl_blob['caption'] = par_blobs diff --git a/s2orc-doc2json/doc2json/jats2json/pmc_utils/front_tag_utils.py b/s2orc-doc2json/doc2json/jats2json/pmc_utils/front_tag_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..d192185cc4c30aa6a77f01b25f33872f1dc6567d --- /dev/null +++ b/s2orc-doc2json/doc2json/jats2json/pmc_utils/front_tag_utils.py @@ -0,0 +1,381 @@ +""" + +Functions for parsing specific `front_tag` soup tags + +""" + +from typing import Dict, List, Optional + +from collections import Counter + +import re + + +from doc2json.jats2json.pmc_utils.all_tag_utils import recurse_parse_section, parse_all_paragraphs_in_section, \ + replace_sup_sub_tags_with_string_placeholders, replace_xref_with_string_placeholders + + +class 
NoAuthorNamesError(Exception): + """Known papers that trigger: + - PMC3462967 + """ + pass + + +def parse_journal_id_tag(front_tag) -> str: + """ + front_tag.find_all('journal-id') returns: + [ + Neurosci J, + Neurosci J, + NEUROSCIENCE + ] + [ + BMC Biochem + BMC Biochem + ] + """ + c = Counter() + for tag in front_tag.find_all('journal-id'): + c[tag.text] += 1 + tag.decompose() + journal_id, n = c.most_common(1)[0] + return journal_id + + +def parse_journal_name_tag(front_tag) -> str: + """ + Examples: + # Paper 1 + + BMC Biochemistry + + # Paper 2 + + Neuroscience Journal + + + But not all titles are contained within a `journal-title-group`. See PMC1079901 + + + Biomed Eng Online + + + BioMedical Engineering OnLine + + ... + """ + if len(front_tag.find_all('journal-title')) > 1: + raise Exception('Multiple journal titles?!') + return front_tag.find('journal-title').extract().text + + +def parse_pubmed_id_tag(front_tag) -> Optional[str]: + """Not every PMC paper has a PMID """ + pmid_tag = front_tag.find('article-id', {'pub-id-type': 'pmid'}) + if pmid_tag is None: + return None + else: + return pmid_tag.extract().text + + +def parse_pmc_id_tag(front_tag) -> str: + return f"PMC{front_tag.find('article-id', {'pub-id-type': 'pmc'}).extract().text}" + + +def parse_doi_tag(front_tag) -> Optional[str]: + """Not all papers have a DOI""" + doi_tag = front_tag.find('article-id', {'pub-id-type': 'doi'}) + if doi_tag is not None: + return doi_tag.extract().text + else: + return None + + +def parse_title_tag(front_tag) -> str: + """ + Examples: + # Paper 1 + + Role of the highly conserved G68 residue in the yeast phosphorelay protein Ypd1: implications for interactions between histidine phosphotransfer (HPt) and response regulator proteins + + # Paper 2 + + Association of Strength and Physical Functions in People with Parkinson's Disease + + + Want to restrict to `title-group` because sometimes title shows up in under self-citation + """ + title_group = 
front_tag.find('title-group').extract() + if len(title_group.find_all('article-title')) > 1: + raise Exception('Multiple article titles?!') + return title_group.find('article-title').text + + +def parse_category_tag(front_tag) -> List[str]: + """ + Examples: + # Paper 1 + + + Research Article + + + # Paper 2 + + + Research Article + + + """ + if len(front_tag.find_all('subj-group')) > 1 or len(front_tag.find_all('subject')) > 1: + raise Exception('Multiple categories?!') + article_categories = front_tag.find('article-categories').extract() + return article_categories.find('subject').text + + +def parse_date_tag(front_tag) -> Dict: + """ + Two sets of tags contain dates: + + 2018 + + + 12 + 12 + 2018 + + And: + + + 15 + 10 + 2018 + + + 20 + 11 + 2018 + + + 26 + 11 + 2018 + + + + PMC2557072 has `date` tag with no `day`, only `year` and `month` + """ + out = {} + for pub_date in front_tag.find_all('pub-date'): + year = pub_date.find('year') + month = pub_date.find('month') + day = pub_date.find('day') + out[pub_date.get('pub-type', 'MISSING_PUB_TYPE')] = '-'.join([tag.text for tag in [year, month, day] if tag is not None]) + pub_date.decompose() + for date in front_tag.find_all('date'): + year = date.find('year') + month = date.find('month') + day = date.find('day') + out[date.get('date-type', 'MISSING_DATE_TYPE')] = '-'.join([tag.text for tag in [year, month, day] if tag is not None]) + date.decompose() + return out + + +def parse_funding_groups(front_tag) -> List[str]: + outs = [] + for tag in front_tag.find_all(): + + # AND statement skips cases where the two tag types nest within each other; we only process the inner one + if (tag.name == 'funding-source' or tag.name == 'funding-statement') and tag.find('funding-source') is None and tag.find('funding-statement') is None: + + out = { + 'name': None, + 'doi': None, + 'notes': None, + # 'raw': str(tag) # for debugging + } + + # handle institution + institution_id_tag = tag.find('institution-id') + if 
institution_id_tag: + out['doi'] = institution_id_tag.extract().text.replace('http://dx.doi.org/', '') + institution_tag = tag.find('institution') + if institution_tag: + out['name'] = tag.find('institution').extract().text + + # handle named content + funder_name_tag = tag.find('named-content', {'content-type': 'funder-name'}) + if funder_name_tag: + out['name'] = funder_name_tag.extract().text + + funder_id_tag = tag.find('named-content', {'content-type': 'funder-identifier'}) + if funder_id_tag: + out['doi'] = funder_id_tag.extract().text.replace('http://dx.doi.org/', '') + + # handle urls + if tag.get('xlink:href'): + out['doi'] = tag['xlink:href'] + + # fix DOIs with URLs in them + if out['doi']: + match = re.search(r'http(s?)://dx.doi.org/(.+)', out['doi']) + if match: + out['doi'] = match.group(2) + + # remainder text is either a name or a full statement + text = tag.text + if tag.name == 'funding-statement' or ('fund' in text or 'support' in text or 'provide' in text): + out['notes'] = text + else: + # what if something already in 'name'? observed it's typically empty string; so ignore. 
+ if not out['name']: + out['name'] = text + + # if DOI link is in the name, remove it and parse (PMC5407128) + if out['name'] and not out['doi']: + pattern = r'\s*http(s?)://dx.doi.org/(.+)$' + match = re.search(pattern, out['name']) + if match: + out['doi'] = match.group(2) + out['name'] = re.sub(pattern, r'', out['name']) + + outs.append(out) + return outs + + +# TODO: didnt want to handle group names; seemed rare and inconsistent; focus on with and +def parse_authors(front_tag) -> List[Dict]: + authors = [] + for contrib_tag in front_tag.find_all('contrib'): + + # skip nesting; just process children (individual authors) + if contrib_tag.find_all('contrib'): + continue + + # skip contribs without a name; these should be ones that consist of tag + if contrib_tag.find('name') is None: + continue + + # corresponding tag + if (contrib_tag.get('corresp') == 'yes') or (contrib_tag.find('xref', {'ref-type': 'corresp'})): + is_corresp = True + else: + is_corresp = False + + # orcid ID is sometimes a URL or just a number. standardize as hyphenized number. + if contrib_tag.find('contrib-id'): + orcid_id = contrib_tag.find('contrib-id').text + match = re.search(r'http(s?)://orcid.org/(.+)', orcid_id) + if match: + orcid_id = match.group(2) + # A very small number of articles have ID type CATS, which we don't handle. For example: + # /disk2/gorpus/20200101/pmc/Change/PMC6176774.nxml + if len(orcid_id) != 19: + orcid_id = None + else: + orcid_id = None + + # Email may or may not be present. + email = contrib_tag.find('email') + email = email.text if email else None + + # Get the name info for the author. + name_info = {name_tag.name: name_tag.text for name_tag in contrib_tag.find('name').find_all()} + # TODO: PMC3462967 is an Erratum. It does not have ['given-names']. 
not sure we care about those, so try-catch for now + try: + given_names = name_info['given-names'].split(' ') + except KeyError as e: + raise NoAuthorNamesError + + authors.append({ + 'first': given_names[0] if given_names else None, + 'middle': given_names[1:] if given_names else None, + 'last': name_info['surname'], + 'suffix': name_info.get('suffix', ''), + 'email': email, + 'affiliation_ids': [xref_tag.get('rid') for xref_tag in contrib_tag.find_all('xref', {'ref-type': 'aff'})], + 'corresponding': is_corresp, + 'orcid': orcid_id + }) + + # authors.append(str(contrib_tag.extract())) + return authors + + +def parse_affiliations(front_tag) -> List[Dict]: + """ + Sometimes affiliations is nested within '' along with + authors. Sometimes, they're not and listed outside as multiple tags. + + Not all have IDs. For example: + St. Paul, Minnesota + """ + outs = [] + for aff_tag in front_tag.find_all('aff'): + if aff_tag.find('label'): # get rid of unused markers so `.text` is cleaner + aff_tag.find('label').decompose() + if aff_tag.find('sup'): + aff_tag.find('sup').decompose() # same treatment as label + + aff_id = aff_tag.get('id') + + # it looks like we want to go to the full affiliation surface form without worrying about all possible handlings of and other fields + # BUT, we do want to keep ISNI and GRID IDs when they occur. 
They seem to occur typically within + # so let's handle those if they exist; safely decompose the tags (because they dont contribute to surface form); then grab remaining affiliation surface form + + # implicit in this approach is that we dont need to actually handle tags because only one per affiliation + if len(aff_tag.find_all('institution-wrap')) > 1: + import pdb; pdb.set_trace() + id_type_to_id = {} + for institution_id_tag in aff_tag.find_all('institution-id'): + id_type_to_id[institution_id_tag['institution-id-type']] = institution_id_tag.text + institution_id_tag.decompose() + + # TODO: processing of text: there are a lot of random newline chars (cuz XML preserves page layout) + # --> replace them with whitespace if there's preceding punctuation char + # --> otherwise, replace them with comma + text = aff_tag.text + + outs.append({ + 'id': aff_id, + 'other_ids': id_type_to_id, + 'text': text + }) + + return outs + + +def parse_abstract_tag(front_tag, soup) -> List[Dict]: + """Not every paper has an abstract + + Furthermore, note very abstract is structured into sections. + Some abstracts (see PMC1914226) look like: + +

...

+

...

+ + """ + # TODO: are there cases where text

text ? + abstract: List[Dict] = [] + if front_tag.find('abstract'): + abstract_tag = front_tag.find('abstract').extract() + + # replace all xref tags with string placeholders + replace_xref_with_string_placeholders(soup_tag=abstract_tag, soup=soup) + + # replace all sup/sub tags with string placeholders + replace_sup_sub_tags_with_string_placeholders(soup_tag=abstract_tag, soup=soup) + + if abstract_tag.find('sec'): + all_par_blobs = [] + for sec_tag in abstract_tag.find_all('sec', recursive=False): + par_blobs = recurse_parse_section(sec_tag=sec_tag) + all_par_blobs.extend(par_blobs) + else: + all_par_blobs = parse_all_paragraphs_in_section(sec_tag=abstract_tag) + for par_blob in all_par_blobs: + # these 'sections' typically show up as empty string + par_blob['section'] = 'Abstract' + abstract.append(par_blob) + return abstract \ No newline at end of file diff --git a/s2orc-doc2json/doc2json/jats2json/pmc_utils/tests.py b/s2orc-doc2json/doc2json/jats2json/pmc_utils/tests.py new file mode 100644 index 0000000000000000000000000000000000000000..f296c141efa6f87651b014a4cb0cafcfa1a4f652 --- /dev/null +++ b/s2orc-doc2json/doc2json/jats2json/pmc_utils/tests.py @@ -0,0 +1,347 @@ + +funding_tags_and_parsed_dicts = [ + # is typically the top-level tag + # + # within, we see and as containing the main information we want + # + # here, we see with an 'id' attribute. we can ignore these. + (""" + + Wellcome Trust + + """, None), + # sometimes, there are also tags, but we can ignore these. they're funding-group specific. + (""" + + US Department of Energy's Office of Science, Biological and Environmental Research Program + DE-AC02-05CH11231 + DE-AC52-07NA27344 + DE-AC02-06NA25396 + DE-AC05-00OR22725 + + + German Research Foundation + INST 599/1-2 + + """, None), + + # is a less structured alternative to + (""" + No sources of funding were used to assist in the preparation of this study. + """, None), + + # Rarely, there is nesting! ignore parents. 
+ (""" + + This work was supported by the Swedish Association for Sexuality Education (RFSU). + + """, None), + + + # Sometimes both can occur, sort of duplicating the same information. + # For example "Cornell" is mentioned as both a and a + (""" + + + Cornell University Institute for the Social Sciences + + + The research was supported by a grant from the Cornell University Institute for the Social Sciences. + """, None), + + # many + (""" + + Brien Holden Vision Institute + + + Australian Federal Government + + + International Postgraduate Research Scholarship (Cathleen Fedtke) + + + University of New South Wales, Australia + + + National Institutes of Health + P30EY14801 + + + Florida Lions Eye Bank + + + Bascom Palmer Eye Institute + + """, None), + + # institutions can optionally occur within + # 'institution-id-type' is common, but also optional + # regardless of the institution ID type, it looks like the ID is always a DOI (or URL to a DOI) + (""" + + + + http://dx.doi.org/10.13039/100000025 + National Institute of Mental Health + + + R01MH107333 + + KimWoong-Ki + + + """, None), + (""" + + + + Deutsche Forschungsgemeinschaft + http://search.crossref.org/fundref?q=501100001659 + + + Re 628/16-1 + GRK 1216 + + """, None), + (""" + + + + National Institutes of Health + 10.13039/100000002 + + + + """, None), + + # handing + (""" + + + Austrian Science Fund + 10.13039/501100002428 + + P 27625 + + This work was supported by Austrian Science Fund [grant number P 27625]. + """, None), + + # handling xlink:href attributes + (""" + + Economic and Social Research Council + RES-360-25-0032 + + + Wellcome Trust + 106542/Z/14/Z + + """, None) +] + +acknowledgement_tags_and_parsed_dicts = [ + # variants with may/may not have a . always have <p> but may/may not have <p id>. <title> never has attributes. + # the <p> text might contain <funding-source> or <ext-link> tags. 
+ # the <ext-link> tags have required attributes 'ext-link-type' and 'xlink:href', and optional attribute 'id'. all the <ext-links> are URLs. + ("""<ack id=\"ack0005\"> + <title>Acknowledgements +

The authors thank the BBSRC (Project Grants BB/M025349/1 and BB/P011969/1) for its continued support, and appreciate the helpful comments of Dr Rob Young, Cardiff University School of Optometry and Vision Sciences.

+ """, { + 'text': 'The authors thank the BBSRC (Project Grants BB/M025349/1 and BB/P011969/1) for its continued support, and appreciate the helpful comments of Dr Rob Young, Cardiff University School of Optometry and Vision Sciences.', + 'funding': [{'text': 'BBSRC', 'id': 'gs0005'}], + 'url': None}), + (""" +

Supported by AA-11431 and AA-12908 from the National Institutes of Health and the Tobacco-Related Disease Research Program Grant 17RT-0171.

+
""", { + 'text': 'Supported by AA-11431 and AA-12908 from the National Institutes of Health and the Tobacco-Related Disease Research Program Grant 17RT-0171.', + 'funding': [], + 'url': None}), + (""" + Acknowledgements +

This work was supported by the National Institutes of Health,National Cancer Institute grants R01CA196967 and R01CA209886.

+
""", { + 'text': 'This work was supported by the National Institutes of Health,National Cancer Institute grants R01CA196967 and R01CA209886.', + 'funding': [], + 'url': None}), + (""" + Data accessibility +

The data used is included in the RepeatABEL package available at https://cran.r-project.org/web/packages/RepeatABEL.

+
""", { + 'text': 'The data used is included in the RepeatABEL package available at https://cran.r-project.org/web/packages/RepeatABEL.', + 'funding': [], + 'url': 'https://cran.r-project.org/web/packages/RepeatABEL'}), + # variants with are similar to the above. + (""" + Acknowledgments +

D.B.K. thanks Prof. Nigel Harper for a very useful discussion. We also thank the referees and the journal editors for exceptionally careful and thoughtful reviews that helped improve the manuscript considerably.

+
""", { + 'text': 'D.B.K. thanks Prof. Nigel Harper for a very useful discussion. We also thank the referees and the journal editors for exceptionally careful and thoughtful reviews that helped improve the manuscript considerably.', + 'funding': [], + 'url': None}), + (""" + Conflict of interest +

The authors declare there is no conflict of interest associated with this manuscript.

+
""", { + 'text': 'The authors declare there is no conflict of interest associated with this manuscript.', + 'funding': [], + 'url': None}) +] + +affiliation_tags_and_parsed_dicts = [ + # mix of tags with and without IDs + ("""Department of Internal Medicine, Division of Cardiology, Inha University Hospital, Incheon, South Korea""", None), + ("""Department of Cardiology, Atatürk Chest Diseases and Chest Surgery Training and Research Hospital; Ankara-Turkey""", None), + # there can exist a