Spaces:

nianlong
/

memsum-arxiv-summarizer

Sleeping

App Files Files Community

nianlonggu commited on Apr 8

Commit

02ae0bf

•

1 Parent(s): a57888c

init

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitignore +3 -0
Dockerfile +55 -0
docker-compose.yaml +7 -0
requirements.txt +21 -0
s2orc-doc2json/LICENSE +201 -0
s2orc-doc2json/README.md +138 -0
s2orc-doc2json/doc2json.egg-info/PKG-INFO +4 -0
s2orc-doc2json/doc2json.egg-info/SOURCES.txt +42 -0
s2orc-doc2json/doc2json.egg-info/dependency_links.txt +1 -0
s2orc-doc2json/doc2json.egg-info/not-zip-safe +1 -0
s2orc-doc2json/doc2json.egg-info/top_level.txt +1 -0
s2orc-doc2json/doc2json/__init__.py +0 -0
s2orc-doc2json/doc2json/config.py +2 -0
s2orc-doc2json/doc2json/flask/app.py +57 -0
s2orc-doc2json/doc2json/flask/static/style.css +40 -0
s2orc-doc2json/doc2json/flask/templates/home.html +18 -0
s2orc-doc2json/doc2json/grobid2json/__init__.py +0 -0
s2orc-doc2json/doc2json/grobid2json/grobid/Readme.md +92 -0
s2orc-doc2json/doc2json/grobid2json/grobid/__init__.py +0 -0
s2orc-doc2json/doc2json/grobid2json/grobid/client.py +225 -0
s2orc-doc2json/doc2json/grobid2json/grobid/config.yaml +36 -0
s2orc-doc2json/doc2json/grobid2json/grobid/grobid.properties +59 -0
s2orc-doc2json/doc2json/grobid2json/grobid/grobid_client.py +249 -0
s2orc-doc2json/doc2json/grobid2json/pdf_to_tei.py +7 -0
s2orc-doc2json/doc2json/grobid2json/process_pdf.py +104 -0
s2orc-doc2json/doc2json/grobid2json/tei_to_json.py +750 -0
s2orc-doc2json/doc2json/jats2json/__init__.py +0 -0
s2orc-doc2json/doc2json/jats2json/jats_to_json.py +341 -0
s2orc-doc2json/doc2json/jats2json/pmc_utils/__init__.py +0 -0
s2orc-doc2json/doc2json/jats2json/pmc_utils/all_tag_utils.py +300 -0
s2orc-doc2json/doc2json/jats2json/pmc_utils/back_tag_utils.py +56 -0
s2orc-doc2json/doc2json/jats2json/pmc_utils/extract_utils.py +106 -0
s2orc-doc2json/doc2json/jats2json/pmc_utils/front_tag_utils.py +381 -0
s2orc-doc2json/doc2json/jats2json/pmc_utils/tests.py +347 -0
s2orc-doc2json/doc2json/jats2json/process_jats.py +104 -0
s2orc-doc2json/doc2json/s2orc.py +527 -0
s2orc-doc2json/doc2json/spp2json/__init__.py +0 -0
s2orc-doc2json/doc2json/spp2json/process_pdf.py +72 -0
s2orc-doc2json/doc2json/spp2json/spp/__init__.py +0 -0
s2orc-doc2json/doc2json/spp2json/spp/spp_client.py +32 -0
s2orc-doc2json/doc2json/spp2json/spp/spp_json_to_s2orc_json.py +7 -0
s2orc-doc2json/doc2json/tex2json/__init__.py +0 -0
s2orc-doc2json/doc2json/tex2json/process_tex.py +127 -0
s2orc-doc2json/doc2json/tex2json/tex_to_xml.py +201 -0
s2orc-doc2json/doc2json/tex2json/xml_to_json.py +1396 -0
s2orc-doc2json/doc2json/utils/__init__.py +0 -0
s2orc-doc2json/doc2json/utils/citation_util.py +75 -0
s2orc-doc2json/doc2json/utils/grobid_util.py +388 -0
s2orc-doc2json/doc2json/utils/latex_util.py +204 -0
s2orc-doc2json/doc2json/utils/refspan_util.py +115 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,3 @@

+.ipynb_checkpoints/
+*.gz
+*.pdf

Dockerfile ADDED Viewed

	@@ -0,0 +1,55 @@

+FROM ubuntu:22.04
+# Set Environment Variable
+ENV HOME="/root"
+ENV JAVA_TOOL_OPTIONS="-Dhttps.protocols=TLSv1.2"
+ENV PDF2JSON_HOME="/app/src/s2orc-doc2json"
+# install system-wide build tools, git, and Java (OpenJDK 8)
+RUN apt-get -yqq update && \
+    apt-get -yqq install software-properties-common curl wget zip screen git gcc build-essential openjdk-8-jdk
+# Install Miniconda
+RUN curl -LO https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
+    bash Miniconda3-latest-Linux-x86_64.sh -p /miniconda -b && \
+    rm Miniconda3-latest-Linux-x86_64.sh
+ENV PATH=/miniconda/bin:${PATH}
+# Create a Python 3.10 environment
+RUN conda create -n my_env python=3.10
+SHELL ["conda", "run", "-n", "my_env", "/bin/bash", "-c"]
+WORKDIR /app/src
+COPY ./requirements.txt .
+RUN pip install -r requirements.txt
+WORKDIR $PDF2JSON_HOME
+COPY ./s2orc-doc2json/ .
+RUN python setup.py develop
+WORKDIR $HOME
+RUN wget https://github.com/kermitt2/grobid/archive/0.6.1.zip && \
+    unzip 0.6.1.zip && \
+    rm 0.6.1.zip
+WORKDIR $HOME/grobid-0.6.1
+RUN ./gradlew clean install && \
+    cp $PDF2JSON_HOME/doc2json/grobid2json/grobid/config.yaml $HOME/grobid-0.6.1/grobid-service/config/config.yaml && \
+    cp $PDF2JSON_HOME/doc2json/grobid2json/grobid/grobid.properties $HOME/grobid-0.6.1/grobid-home/config/grobid.properties
+WORKDIR /app/models/
+# Download necessary model checkpoint
+RUN python -c "from huggingface_hub import snapshot_download; model_folder = '/app/models/'; snapshot_download('nianlong/memsum-word-embedding', local_dir = model_folder + 'word_embedding'); snapshot_download('nianlong/memsum-arxiv-summarization', local_dir = model_folder + 'memsum_arxiv' )"
+WORKDIR /app/src
+COPY ./Dockerfile .
+WORKDIR /app/src/services
+RUN git clone https://github.com/nianlonggu/MemSum
+COPY ./services/ .
+# start app
+# CMD is in exec form, so it runs plain bash directly instead of the conda-wrapped shell set by the SHELL instruction above
+CMD [ "bash", "./start_service.sh" ]

docker-compose.yaml ADDED Viewed

	@@ -0,0 +1,7 @@

+version: '3'
+services:
+    summarization_service:
+        build: .
+        ports:
+            - 7860:7860

requirements.txt ADDED Viewed

	@@ -0,0 +1,21 @@

+tqdm
+beautifulsoup4==4.7.1
+boto3==1.9.147
+requests==2.21.0
+flask==2.3.2
+flask_cors==4.0.0
+python-magic==0.4.18
+latex2mathml==2.16.2
+gunicorn==20.1.0
+lxml==4.9.0
+unidecode
+nltk==3.7
+jsonschema==4.17.3
+six==1.16.0
+numpy==1.21.6
+ujson==5.2.0
+more-itertools==9.1.0
+dateparser==1.1.8
+streamlit
+transformers==4.30.0
+torch==2.2.2

s2orc-doc2json/LICENSE ADDED Viewed

	@@ -0,0 +1,201 @@

+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright [yyyy] [name of copyright owner]
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

s2orc-doc2json/README.md ADDED Viewed

	@@ -0,0 +1,138 @@

+# Convert scientific papers to S2ORC JSON
+This project is a part of [S2ORC](https://github.com/allenai/s2orc). For S2ORC, we convert PDFs to JSON using Grobid and a custom TEI.XML to JSON parser. That TEI.XML to JSON parser (`grobid2json`) is made available here. We additionally process LaTeX dumps from arXiv. That parser (`tex2json`) is also made available here.
+The S2ORC github page includes a JSON schema, but it may be easier to understand that schema based on the python classes in `doc2json/s2orc.py`.
+This custom JSON schema is also used for the [CORD-19](https://github.com/allenai/cord19) project, so those who have interacted with CORD-19 may find this format familiar.
+Possible future components (no promises):
+- Linking bibliography entries (bibliography consolidation) to papers in S2ORC
+## Setup your environment
+NOTE: Conda is shown but any other python env manager should be fine
+Go [here](https://docs.conda.io/en/latest/miniconda.html) to install the latest version of miniconda.
+Then, create an environment:
+```console
+conda create -n doc2json python=3.8 pytest
+conda activate doc2json
+pip install -r requirements.txt
+python setup.py develop
+```
+## PDF Processing
+The current `grobid2json` tool uses Grobid to first process each PDF into XML, then extracts paper components from the XML.
+### Install Grobid
+You will need to have Java installed on your machine. Then, you can install your own version of Grobid and get it running, or you can run the following script:
+```console
+bash scripts/setup_grobid.sh
+```
+This will setup Grobid, currently hard-coded as version 0.6.1. Then run:
+```console
+bash scripts/run_grobid.sh
+```
+to start the Grobid server. Don't worry if it gets stuck at 87%; this is normal and means Grobid is ready to process PDFs.
+The expected port for the Grobid service is 8070, but you can change this as well. Make sure to edit the port in both the Grobid config file as well as `grobid/grobid_client.py`.
+### Process a PDF
+There are a couple of test PDFs in `tests/pdf/` if you'd like to try with those.
+For example, you can try:
+```console
+python doc2json/grobid2json/process_pdf.py -i tests/pdf/N18-3011.pdf -t temp_dir/ -o output_dir/
+```
+This will generate a JSON file in the specified `output_dir`. If unspecified, the file will be in the `output/` directory from your path.
+## LaTeX Processing
+If you want to process LaTeX, you also need to install the following libraries:
+- [latexpand](https://ctan.org/pkg/latexpand?lang=en) (`apt install texlive-extra-utils`)
+- [tralics](http://www-sop.inria.fr/marelle/tralics/) (`apt install tralics`)
+To process LaTeX, all files must be in a zip file, similar to the `*.gz` files you can download from arXiv.
+A few examples are available under `tests/latex/`. For example, you can try:
+```console
+python doc2json/tex2json/process_tex.py -i tests/latex/1911.02782.gz -t temp_dir/ -o output_dir/
+```
+Again, this will produce a JSON file in the specified `output_dir`.
+## PMC JATS XML Processing
+To process JATS XML, try:
+```console
+python doc2json/jats2json/process_jats.py -i tests/jats/PMC5828200.nxml -o output_dir/
+```
+This will create a JSON file with the same paper id in the specified output directory.
+## Loading a S2ORC JSON file
+The format of S2ORC releases has drifted over time. Use the `load_s2orc` function in `doc2json/s2orc.py` to try to load historic and current S2ORC JSON.
+## Run a Flask app and process documents through a web service
+To process PDFs, you will first need to start Grobid (defaults to port 8070). If you are processing LaTeX, no need for this step.
+```console
+bash scripts/run_grobid.sh
+```
+Then, start the Flask app (defaults to port 8080).
+```console
+python doc2json/flask/app.py
+```
+Go to [localhost:8080](http://localhost:8080) to upload and process papers.
+Or alternatively, you can do things like:
+```console
+curl localhost:8080/ -F file=@tests/pdf/N18-3011.pdf
+```
+## Citation
+If you use this utility in your research, please cite:
+```
+@inproceedings{lo-wang-2020-s2orc,
+    title = "{S}2{ORC}: The Semantic Scholar Open Research Corpus",
+    author = "Lo, Kyle  and Wang, Lucy Lu  and Neumann, Mark  and Kinney, Rodney  and Weld, Daniel",
+    booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics",
+    month = jul,
+    year = "2020",
+    address = "Online",
+    publisher = "Association for Computational Linguistics",
+    url = "https://www.aclweb.org/anthology/2020.acl-main.447",
+    doi = "10.18653/v1/2020.acl-main.447",
+    pages = "4969--4983"
+}
+```
+## Contact
+Contributions are welcome. Note the embarrassingly poor test coverage. Also, please note this pipeline is not perfect. It will miss text or make errors on most PDFs. The current PDF to JSON step uses Grobid; we may replace this with a different model in the future.
+Issues: contact `lucyw@allenai.org` or `kylel@allenai.org`

s2orc-doc2json/doc2json.egg-info/PKG-INFO ADDED Viewed

	@@ -0,0 +1,4 @@

+Metadata-Version: 2.1
+Name: doc2json
+Version: 0.1
+License-File: LICENSE

s2orc-doc2json/doc2json.egg-info/SOURCES.txt ADDED Viewed

	@@ -0,0 +1,42 @@

+LICENSE
+README.md
+setup.py
+doc2json/__init__.py
+doc2json/config.py
+doc2json/s2orc.py
+doc2json.egg-info/PKG-INFO
+doc2json.egg-info/SOURCES.txt
+doc2json.egg-info/dependency_links.txt
+doc2json.egg-info/not-zip-safe
+doc2json.egg-info/top_level.txt
+doc2json/grobid2json/__init__.py
+doc2json/grobid2json/pdf_to_tei.py
+doc2json/grobid2json/process_pdf.py
+doc2json/grobid2json/tei_to_json.py
+doc2json/grobid2json/grobid/__init__.py
+doc2json/grobid2json/grobid/client.py
+doc2json/grobid2json/grobid/grobid_client.py
+doc2json/jats2json/__init__.py
+doc2json/jats2json/jats_to_json.py
+doc2json/jats2json/process_jats.py
+doc2json/jats2json/pmc_utils/__init__.py
+doc2json/jats2json/pmc_utils/all_tag_utils.py
+doc2json/jats2json/pmc_utils/back_tag_utils.py
+doc2json/jats2json/pmc_utils/extract_utils.py
+doc2json/jats2json/pmc_utils/front_tag_utils.py
+doc2json/jats2json/pmc_utils/tests.py
+doc2json/spp2json/__init__.py
+doc2json/spp2json/process_pdf.py
+doc2json/spp2json/spp/__init__.py
+doc2json/spp2json/spp/spp_client.py
+doc2json/spp2json/spp/spp_json_to_s2orc_json.py
+doc2json/tex2json/__init__.py
+doc2json/tex2json/process_tex.py
+doc2json/tex2json/tex_to_xml.py
+doc2json/tex2json/xml_to_json.py
+doc2json/utils/__init__.py
+doc2json/utils/citation_util.py
+doc2json/utils/grobid_util.py
+doc2json/utils/latex_util.py
+doc2json/utils/refspan_util.py
+doc2json/utils/soup_utils.py

s2orc-doc2json/doc2json.egg-info/dependency_links.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+

s2orc-doc2json/doc2json.egg-info/not-zip-safe ADDED Viewed

	@@ -0,0 +1 @@


1	+

s2orc-doc2json/doc2json.egg-info/top_level.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ doc2json

s2orc-doc2json/doc2json/__init__.py ADDED Viewed

File without changes

s2orc-doc2json/doc2json/config.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ S2ORC_NAME_STRING = 'S2ORC'
2	+ S2ORC_VERSION_STRING = '1.0.0'

s2orc-doc2json/doc2json/flask/app.py ADDED Viewed

	@@ -0,0 +1,57 @@

+"""
+Flask app for S2ORC pdf2json utility
+"""
+import hashlib
+from flask import Flask, request, jsonify, flash, url_for, redirect, render_template, send_file
+from doc2json.grobid2json.process_pdf import process_pdf_stream
+from doc2json.tex2json.process_tex import process_tex_stream
+from doc2json.jats2json.process_jats import process_jats_stream
+# Flask application object; the route decorators below register their handlers on it.
+app = Flask(__name__)
+# NOTE(review): this set is not referenced anywhere else in this module -- upload_file tests filename suffixes directly.
+ALLOWED_EXTENSIONS = {'pdf', 'gz', 'nxml'}
+@app.route('/')
+def home():
+    return render_template("home.html")
+@app.route('/', methods=['POST'])
+def upload_file():
+    uploaded_file = request.files['file']
+    if uploaded_file.filename != '':
+        filename = uploaded_file.filename
+        # read pdf file
+        if filename.endswith('pdf'):
+            pdf_stream = uploaded_file.stream
+            pdf_content = pdf_stream.read()
+            # compute hash
+            pdf_sha = hashlib.sha1(pdf_content).hexdigest()
+            # get results
+            results = process_pdf_stream(filename, pdf_sha, pdf_content)
+            return jsonify(results)
+        # read latex file
+        elif filename.endswith('gz'):
+            zip_stream = uploaded_file.stream
+            zip_content = zip_stream.read()
+            # get results
+            results = process_tex_stream(filename, zip_content)
+            return jsonify(results)
+        # read nxml file (jats)
+        elif filename.endswith('nxml'):
+            xml_stream = uploaded_file.stream
+            xml_content = xml_stream.read()
+            # get results
+            results = process_jats_stream(filename, xml_content)
+            return jsonify(results)
+        # unknown
+        else:
+            return {
+                "Error": "Unknown file type!"
+            }
+    return redirect(url_for('index'))
+if __name__ == '__main__':
+    # When run as a script, serve the app on port 8080, bound to all interfaces (0.0.0.0).
+    app.run(port=8080, host='0.0.0.0')

s2orc-doc2json/doc2json/flask/static/style.css ADDED Viewed

	@@ -0,0 +1,40 @@

+html {
+    box-sizing: border-box;
+  }
+  * {
+    box-sizing: inherit;
+    font-family: Calibri, Arial, sans-serif !important;
+  }
+  h1 {
+    font-size: 32px;
+  }
+  h2, h3 {
+    font-size: 24px;
+  }
+  body {
+    margin: 20px;
+    font-size: 125%;
+    line-height: 1.4;
+    max-width: 800px;
+    margin: 0 auto;
+  }
+  footer {
+    margin-top: 50px;
+    border-top: 1px solid silver;
+    font-size: 0.8em;
+  }
+  footer ol {
+    padding-left: 20px;
+  }
+  .p {
+    text-align: center;
+    font-size: .75em;
+    padding-top: 150px;
+  }

s2orc-doc2json/doc2json/flask/templates/home.html ADDED Viewed

	@@ -0,0 +1,18 @@

+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <title>S2ORC doc2json</title>
+    <link rel="stylesheet" href="{{ url_for('static', filename='style.css') }}">
+</head>
+<body>
+    <h1>S2ORC doc2json utility</h1>
+    <p>Upload a scientific PDF, LaTeX zip file, or JATS XML file and get back a JSON: </p>
+    <p>(Accepted file extensions: *.pdf, *.gz, *.nxml)</p>
+    <form method=post enctype=multipart/form-data>
+        <p><input type="file" name="file" accept=".pdf,.gz,.nxml"></p>
+        <p><input type="submit" value="Upload"></p>
+    </form>
+    <p>Please wait, processing takes time...</p>
+</body>
+</html>

s2orc-doc2json/doc2json/grobid2json/__init__.py ADDED Viewed

File without changes

s2orc-doc2json/doc2json/grobid2json/grobid/Readme.md ADDED Viewed

	@@ -0,0 +1,92 @@

+# Simple python client for GROBID REST services
+**NOTE: This README is adapted from GROBID**
+This Python client can be used to process a set of PDFs in a given directory efficiently and concurrently with the [GROBID](https://github.com/kermitt2/grobid) service. Results are written to a given output directory and include the resulting XML TEI representation of each PDF.
+## Build and run
+You need first to install and start the *grobid* service, latest stable version, see the [documentation](http://grobid.readthedocs.io/). It is assumed that the server will run on the address `http://localhost:8070`. You can change the server address by editing the file `config.json`.
+## Requirements
+This client has been developed and tested with Python 3.5.
+## Install
+Get the github repo:
+> git clone https://github.com/kermitt2/grobid-client-python
+> cd grobid-client-python
+It is advised to setup first a virtual environment to avoid falling into one of these gloomy python dependency marshlands:
+> virtualenv --system-site-packages -p python3 env
+> source env/bin/activate
+## Usage and options
+```
+usage: grobid-client.py [-h] [--input INPUT] [--config CONFIG]
+                        [--output OUTPUT] [--n N]
+                        service
+Client for GROBID services
+positional arguments:
+  service               one of [processFulltextDocument,
+                        processHeaderDocument, processReferences]
+optional arguments:
+  -h, --help            show this help message and exit
+  --input INPUT         path to the directory containing PDF to process
+  --output OUTPUT       path to the directory where to put the results
+  --config CONFIG       path to the config file, default is ./config.json
+  --n N                 concurrency for service usage
+  --generateIDs         generate random xml:id to textual XML elements of the
+                        result files
+  --consolidate_header  call GROBID with consolidation of the metadata
+                        extracted from the header
+  --consolidate_citations
+                        call GROBID with consolidation of the extracted
+                        bibliographical references
+```
+Examples:
+> python3 grobid-client.py --input ~/tmp/in2 --output ~/tmp/out processFulltextDocument
+This command will process all the PDF files present in the input directory (files with extension `.pdf` only) with the `processFulltextDocument` service of GROBID, and write the resulting XML TEI files under the output directory, reusing the file name with a different file extension (`.tei.xml`), using the default `10` concurrent workers.
+> python3 grobid-client.py --input ~/tmp/in2 --output ~/tmp/out --n 20 processHeaderDocument
+This command will process all the PDF files present in the input directory (files with extension `.pdf` only) with the `processHeaderDocument` service of GROBID, and write the resulting XML TEI files under the output directory, reusing the file name with a different file extension (`.tei.xml`), using `20` concurrent workers.
+## Benchmarking
+Full text processing of __136 PDF__ (total 3443 pages, on average 25 pages per PDF) on Intel Core i7-4790K CPU 4.00GHz, 4 cores (8 threads), 16GB memory, `n` being the concurrency parameter:
+| n  | runtime (s)| s/PDF | PDF/s |
+|----|------------|-------|-------|
+| 1  | 209.0 | 1.54       | 0.65 |
+| 2  | 112.0 | 0.82       | 1.21 |
+| 3  | 80.4  | 0.59       | 1.69 |
+| 5  | 62.9  | 0.46       | 2.16 |
+| 8  | 55.7  | 0.41       | 2.44 |
+| 10 | 55.3  | 0.40       | 2.45 |
+![Runtime Plot](resources/20180928112135.png)
+As complementary info, GROBID processing of header of the 136 PDF and with `n=10` takes 3.74 s (15 times faster than the complete full text processing because only the two first pages of the PDF are considered), 36 PDF/s. In similar conditions, extraction and structuring of bibliographical references takes 26.9 s (5.1 PDF/s).
+## Todo
+Benchmarking with more files (e.g. million ISTEX PDF). Also implement existing GROBID services for text input (date, name, affiliation/address, raw bibliographical references, etc.). Better support for parameters (including elements where to put coordinates).
+## License and contact
+Distributed under [Apache 2.0 license](http://www.apache.org/licenses/LICENSE-2.0).
+Main author and contact: Patrice Lopez (<patrice.lopez@science-miner.com>)

s2orc-doc2json/doc2json/grobid2json/grobid/__init__.py ADDED Viewed

File without changes

s2orc-doc2json/doc2json/grobid2json/grobid/client.py ADDED Viewed

	@@ -0,0 +1,225 @@

+""" Generic API Client """
+from copy import deepcopy
+import json
+import requests
+try:
+    from urlparse import urljoin
+except ImportError:
+    from urllib.parse import urljoin
+class ApiClient(object):
+    """ Client to interact with a generic Rest API.
+    Subclasses should implement functionality accordingly with the provided
+    service methods, i.e. ``get``, ``post``, ``put`` and ``delete``.
+    """
+    accept_type = 'application/xml'
+    api_base = None
+    def __init__(
+            self,
+            base_url,
+            username=None,
+            api_key=None,
+            status_endpoint=None,
+            timeout=60
+    ):
+        """ Initialise client.
+        Args:
+            base_url (str): The base URL to the service being used.
+            username (str): The username to authenticate with.
+            api_key (str): The API key to authenticate with.
+            timeout (int): Maximum time before timing out.
+        """
+        self.base_url = base_url
+        self.username = username
+        self.api_key = api_key
+        self.status_endpoint = urljoin(self.base_url, status_endpoint)
+        self.timeout = timeout
+    @staticmethod
+    def encode(request, data):
+        """ Add request content data to request body, set Content-type header.
+        Should be overridden by subclasses if not using JSON encoding.
+        Args:
+            request (HTTPRequest): The request object.
+            data (dict, None): Data to be encoded.
+        Returns:
+            HTTPRequest: The request object.
+        """
+        if data is None:
+            return request
+        request.add_header('Content-Type', 'application/json')
+        request.data = json.dumps(data)
+        return request
+    @staticmethod
+    def decode(response):
+        """ Decode the returned data in the response.
+        Should be overridden by subclasses if something else than JSON is
+        expected.
+        Args:
+            response (HTTPResponse): The response object.
+        Returns:
+            dict or None.
+        """
+        try:
+            return response.json()
+        except ValueError as e:
+            return e.message
+    def get_credentials(self):
+        """ Returns parameters to be added to authenticate the request.
+        This lives on its own to make it easier to re-implement it if needed.
+        Returns:
+            dict: A dictionary containing the credentials.
+        """
+        return {"username": self.username, "api_key": self.api_key}
+    def call_api(
+            self,
+            method,
+            url,
+            headers=None,
+            params=None,
+            data=None,
+            files=None,
+            timeout=None,
+    ):
+        """ Call API.
+        This returns object containing data, with error details if applicable.
+        Args:
+            method (str): The HTTP method to use.
+            url (str): Resource location relative to the base URL.
+            headers (dict or None): Extra request headers to set.
+            params (dict or None): Query-string parameters.
+            data (dict or None): Request body contents for POST or PUT requests.
+            files (dict or None: Files to be passed to the request.
+            timeout (int): Maximum time before timing out.
+        Returns:
+            ResultParser or ErrorParser.
+        """
+        headers = deepcopy(headers) or {}
+        headers['Accept'] = self.accept_type
+        params = deepcopy(params) or {}
+        data = data or {}
+        files = files or {}
+        #if self.username is not None and self.api_key is not None:
+        #    params.update(self.get_credentials())
+        r = requests.request(
+            method,
+            url,
+            headers=headers,
+            params=params,
+            files=files,
+            data=data,
+            timeout=timeout,
+        )
+        return r, r.status_code
+    def get(self, url, params=None, **kwargs):
+        """ Call the API with a GET request.
+        Args:
+            url (str): Resource location relative to the base URL.
+            params (dict or None): Query-string parameters.
+        Returns:
+            ResultParser or ErrorParser.
+        """
+        return self.call_api(
+            "GET",
+            url,
+            params=params,
+            **kwargs
+        )
+    def delete(self, url, params=None, **kwargs):
+        """ Call the API with a DELETE request.
+        Args:
+            url (str): Resource location relative to the base URL.
+            params (dict or None): Query-string parameters.
+        Returns:
+            ResultParser or ErrorParser.
+        """
+        return self.call_api(
+            "DELETE",
+            url,
+            params=params,
+            **kwargs
+        )
+    def put(self, url, params=None, data=None, files=None, **kwargs):
+        """ Call the API with a PUT request.
+        Args:
+            url (str): Resource location relative to the base URL.
+            params (dict or None): Query-string parameters.
+            data (dict or None): Request body contents.
+            files (dict or None: Files to be passed to the request.
+        Returns:
+            An instance of ResultParser or ErrorParser.
+        """
+        return self.call_api(
+            "PUT",
+            url,
+            params=params,
+            data=data,
+            files=files,
+            **kwargs
+        )
+    def post(self, url, params=None, data=None, files=None, **kwargs):
+        """ Call the API with a POST request.
+        Args:
+            url (str): Resource location relative to the base URL.
+            params (dict or None): Query-string parameters.
+            data (dict or None): Request body contents.
+            files (dict or None: Files to be passed to the request.
+        Returns:
+            An instance of ResultParser or ErrorParser.
+        """
+        return self.call_api(
+            method="POST",
+            url=url,
+            params=params,
+            data=data,
+            files=files,
+            **kwargs
+        )
+    def service_status(self, **kwargs):
+        """ Call the API to get the status of the service.
+        Returns:
+            An instance of ResultParser or ErrorParser.
+        """
+        return self.call_api(
+            'GET',
+            self.status_endpoint,
+            params={'format': 'json'},
+            **kwargs
+        )

s2orc-doc2json/doc2json/grobid2json/grobid/config.yaml ADDED Viewed

	@@ -0,0 +1,36 @@

+grobid:
+  # NOTE: change these values to absolute paths when running on production
+  grobidHome: "grobid-home"
+  # how to load the models,
+  # false -> models are loaded when needed (default), which avoids putting useless models in memory
+  # true -> all the models are loaded into memory at server startup, which slows the start of the services, and models not
+  # used will take some memory
+  modelPreload: true
+server:
+    type: custom
+    applicationConnectors:
+    - type: http
+      port: 8070
+    adminConnectors:
+    - type: http
+      port: 8071
+    registerDefaultExceptionMappers: false
+logging:
+  level: WARN
+  loggers:
+    org.apache.pdfbox.pdmodel.font.PDSimpleFont: "OFF"
+  appenders:
+    - type: console
+      threshold: ALL
+      timeZone: UTC
+#    - type: file
+#      currentLogFilename: logs/grobid-service.log
+#      threshold: ALL
+#      archive: true
+#      archivedLogFilenamePattern: logs/grobid-service-%d.log
+#      archivedFileCount: 5
+#      timeZone: UTC

s2orc-doc2json/doc2json/grobid2json/grobid/grobid.properties ADDED Viewed

	@@ -0,0 +1,59 @@

+#-------------------- resource directories ---------------------
+# properties of where to find directories necessary for GROBID
+# EACH KEY REFERENCING A PATH HAS TO END WITH ".path"
+grobid.resource.path=./resources
+grobid.temp.path=./tmp
+grobid.bin.path=./bin
+#-------------------- external/native libs ---------------------
+#path to folder containing native libraries of 3rd parties
+grobid.nativelibrary.path=./lib
+grobid.3rdparty.pdf2xml.path=./pdf2xml
+grobid.3rdparty.pdf2xml.memory.limit.mb=6096
+grobid.3rdparty.pdf2xml.timeout.sec=60
+#-------------------------------------------------------------
+#-------------------- consolidation --------------------
+# Define the bibliographical data consolidation service to be used, either "crossref" for CrossRef REST API or "glutton" for https://github.com/kermitt2/biblio-glutton
+grobid.consolidation.service=crossref
+#grobid.consolidation.service=glutton
+#org.grobid.glutton.host=cloud.science-miner.com/glutton
+#org.grobid.glutton.port=0
+org.grobid.glutton.host=localhost
+org.grobid.glutton.port=8070
+#org.grobid.crossref.mailto=toto@titi.tutu
+#org.grobid.crossref.token=yourmysteriouscrossrefmetadataplusauthorizationtokentobeputhere
+#-------------------- proxy --------------------
+#proxy to be used for external call to the crossref REST API service or Glutton service if not deployed under proxy ("null" when no proxy)
+grobid.proxy_host=null
+grobid.proxy_port=null
+#------------------------------------------------------
+#-------------------- runtime ------------------
+grobid.crf.engine=wapiti
+#grobid.crf.engine=delft
+#grobid.crf.engine=crfpp
+grobid.delft.install=../delft
+grobid.delft.useELMo=false
+grobid.pdf.blocks.max=100000
+grobid.pdf.tokens.max=1000000
+#-------------------- training ------------------
+#number of threads for training the wapiti models (0 to use all available processors)
+grobid.nb_threads=0
+#-------------------- language identification  ------------------
+#property for using or not the language identifier (true|false)
+grobid.use_language_id=true
+grobid.language_detector_factory=org.grobid.core.lang.impl.CybozuLanguageDetectorFactory
+#determines if properties like the firstnames, lastnames, country codes and dictionaries are supposed to be read from the $GROBID_HOME path or not (possible values: true|false, default is false)
+grobid.resources.inHome=true
+#------------------------------------------------------
+#-------------------- pooling -------------------
+# Maximum parallel connections allowed
+org.grobid.max.connections=72
+# Maximum time wait to get a connection when the pool is full (in seconds)
+org.grobid.pool.max.wait=1
+#------------------------------------------------------

s2orc-doc2json/doc2json/grobid2json/grobid/grobid_client.py ADDED Viewed

	@@ -0,0 +1,249 @@

+import os
+import io
+import json
+import argparse
+import time
+import glob
+from doc2json.grobid2json.grobid.client import ApiClient
+import ntpath
+from typing import List
+'''
+NOTE: this description was inherited from a version that parallelized the concurrent calls to the GROBID services
+with a standard executor (ProcessPoolExecutor/ThreadPoolExecutor). Given the limits of such executors (input stored
+in memory, blocking Executor.map until the whole input is acquired), it works with batches of PDFs whose size is
+given by the "batch_size" configuration entry (default is 1000 entries). In this file each batch is processed
+sequentially. We move from one batch to the next only when the current one is entirely processed - which means it is
+slightly sub-optimal, but should scale better. However, acquiring a list of millions of files in directories would
+require something scalable too, which is not implemented for the moment.
+'''
+DEFAULT_GROBID_CONFIG = {
+    "grobid_server": "localhost",
+    "grobid_port": "8070",
+    "batch_size": 1000,
+    "sleep_time": 5,
+    "generateIDs": False,
+    "consolidate_header": False,
+    "consolidate_citations": False,
+    "include_raw_citations": True,
+    "include_raw_affiliations": False,
+    "max_workers": 2,
+}
+class GrobidClient(ApiClient):
+    def __init__(self, config=None):
+        self.config = config or DEFAULT_GROBID_CONFIG
+        self.generate_ids = self.config["generateIDs"]
+        self.consolidate_header = self.config["consolidate_header"]
+        self.consolidate_citations = self.config["consolidate_citations"]
+        self.include_raw_citations = self.config["include_raw_citations"]
+        self.include_raw_affiliations = self.config["include_raw_affiliations"]
+        self.max_workers = self.config["max_workers"]
+        self.grobid_server = self.config["grobid_server"]
+        self.grobid_port = self.config["grobid_port"]
+        self.sleep_time = self.config["sleep_time"]
+    def process(self, input: str, output: str, service: str):
+        batch_size_pdf = self.config['batch_size']
+        pdf_files = []
+        for pdf_file in glob.glob(input + "/*.pdf"):
+            pdf_files.append(pdf_file)
+            if len(pdf_files) == batch_size_pdf:
+                self.process_batch(pdf_files, output, service)
+                pdf_files = []
+        # last batch
+        if len(pdf_files) > 0:
+            self.process_batch(pdf_files, output, service)
+    def process_batch(self, pdf_files: List[str], output: str, service: str) -> None:
+        print(len(pdf_files), "PDF files to process")
+        for pdf_file in pdf_files:
+            self.process_pdf(pdf_file, output, service)
+    def process_pdf_stream(self, pdf_file: str, pdf_strm: bytes, output: str, service: str) -> str:
+        # process the stream
+        files = {
+            'input': (
+                pdf_file,
+                pdf_strm,
+                'application/pdf',
+                {'Expires': '0'}
+            )
+        }
+        the_url = 'http://' + self.grobid_server
+        the_url += ":" + self.grobid_port
+        the_url += "/api/" + service
+        # set the GROBID parameters
+        the_data = {}
+        if self.generate_ids:
+            the_data['generateIDs'] = '1'
+        else:
+            the_data['generateIDs'] = '0'
+        if self.consolidate_header:
+            the_data['consolidateHeader'] = '1'
+        else:
+            the_data['consolidateHeader'] = '0'
+        if self.consolidate_citations:
+            the_data['consolidateCitations'] = '1'
+        else:
+            the_data['consolidateCitations'] = '0'
+        if self.include_raw_affiliations:
+            the_data['includeRawAffiliations'] = '1'
+        else:
+            the_data['includeRawAffiliations'] = '0'
+        if self.include_raw_citations:
+            the_data['includeRawCitations'] = '1'
+        else:
+            the_data['includeRawCitations'] = '0'
+        res, status = self.post(
+            url=the_url,
+            files=files,
+            data=the_data,
+            headers={'Accept': 'text/plain'}
+        )
+        if status == 503:
+            time.sleep(self.sleep_time)
+            return self.process_pdf_stream(pdf_file, pdf_strm, service)
+        elif status != 200:
+            with open(os.path.join(output, "failed.log"), "a+") as failed:
+                failed.write(pdf_file.strip(".pdf") + "\n")
+            print('Processing failed with error ' + str(status))
+            return ""
+        else:
+            return res.text
+    def process_pdf(self, pdf_file: str, output: str, service: str) -> None:
+        # check if TEI file is already produced
+        # we use ntpath here to be sure it will work on Windows too
+        pdf_file_name = ntpath.basename(pdf_file)
+        filename = os.path.join(output, os.path.splitext(pdf_file_name)[0] + '.tei.xml')
+        if os.path.isfile(filename):
+            return
+        print(pdf_file)
+        pdf_strm = open(pdf_file, 'rb').read()
+        tei_text = self.process_pdf_stream(pdf_file, pdf_strm, output, service)
+        # writing TEI file
+        if tei_text:
+            with io.open(filename, 'w+', encoding='utf8') as tei_file:
+                tei_file.write(tei_text)
+    def process_citation(self, bib_string: str, log_file: str) -> str:
+        # process citation raw string and return corresponding dict
+        the_data = {
+            'citations': bib_string,
+            'consolidateCitations': '0'
+        }
+        the_url = 'http://' + self.grobid_server
+        the_url += ":" + self.grobid_port
+        the_url += "/api/processCitation"
+        for _ in range(5):
+            try:
+                res, status = self.post(
+                    url=the_url,
+                    data=the_data,
+                    headers={'Accept': 'text/plain'}
+                )
+                if status == 503:
+                    time.sleep(self.sleep_time)
+                    continue
+                elif status != 200:
+                    with open(log_file, "a+") as failed:
+                        failed.write("-- BIBSTR --\n")
+                        failed.write(bib_string + "\n\n")
+                    break
+                else:
+                    return res.text
+            except Exception:
+                continue
+    def process_header_names(self, header_string: str, log_file: str) -> str:
+        # process author names from header string
+        the_data = {
+            'names': header_string
+        }
+        the_url = 'http://' + self.grobid_server
+        the_url += ":" + self.grobid_port
+        the_url += "/api/processHeaderNames"
+        res, status = self.post(
+            url=the_url,
+            data=the_data,
+            headers={'Accept': 'text/plain'}
+        )
+        if status == 503:
+            time.sleep(self.sleep_time)
+            return self.process_header_names(header_string, log_file)
+        elif status != 200:
+            with open(log_file, "a+") as failed:
+                failed.write("-- AUTHOR --\n")
+                failed.write(header_string + "\n\n")
+        else:
+            return res.text
+    def process_affiliations(self, aff_string: str, log_file: str) -> str:
+        # process affiliation from input string
+        the_data = {
+            'affiliations': aff_string
+        }
+        the_url = 'http://' + self.grobid_server
+        the_url += ":" + self.grobid_port
+        the_url += "/api/processAffiliations"
+        res, status = self.post(
+            url=the_url,
+            data=the_data,
+            headers={'Accept': 'text/plain'}
+        )
+        if status == 503:
+            time.sleep(self.sleep_time)
+            return self.process_affiliations(aff_string, log_file)
+        elif status != 200:
+            with open(log_file, "a+") as failed:
+                failed.write("-- AFFILIATION --\n")
+                failed.write(aff_string + "\n\n")
+        else:
+            return res.text
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Client for GROBID services")
+    parser.add_argument("service", help="one of [processFulltextDocument, processHeaderDocument, processReferences]")
+    parser.add_argument("--input", default=None, help="path to the directory containing PDF to process")
+    parser.add_argument("--output", default=None, help="path to the directory where to put the results")
+    parser.add_argument("--config", default=None, help="path to the config file, default is ./config.json")
+    args = parser.parse_args()
+    input_path = args.input
+    config = json.load(open(args.config)) if args.config else DEFAULT_GROBID_CONFIG
+    output_path = args.output
+    service = args.service
+    client = GrobidClient(config=config)
+    start_time = time.time()
+    client.process(input_path, output_path, service)
+    runtime = round(time.time() - start_time, 3)
+    print("runtime: %s seconds " % (runtime))

s2orc-doc2json/doc2json/grobid2json/pdf_to_tei.py ADDED Viewed

	@@ -0,0 +1,7 @@

+import os
+import sys
+from typing import Dict, List
+from PyPDF2 import PdfFileReader

s2orc-doc2json/doc2json/grobid2json/process_pdf.py ADDED Viewed

	@@ -0,0 +1,104 @@

+import os
+import json
+import argparse
+import time
+from bs4 import BeautifulSoup
+from typing import Optional, Dict
+from doc2json.grobid2json.grobid.grobid_client import GrobidClient
+from doc2json.grobid2json.tei_to_json import convert_tei_xml_file_to_s2orc_json, convert_tei_xml_soup_to_s2orc_json
+BASE_TEMP_DIR = 'temp'
+BASE_OUTPUT_DIR = 'output'
+BASE_LOG_DIR = 'log'
+def process_pdf_stream(input_file: str, sha: str, input_stream: bytes, grobid_config: Optional[Dict] = None) -> Dict:
+    """
+    Process PDF stream
+    :param input_file:
+    :param sha:
+    :param input_stream:
+    :return:
+    """
+    # process PDF through Grobid -> TEI.XML
+    client = GrobidClient(grobid_config)
+    tei_text = client.process_pdf_stream(input_file, input_stream, 'temp', "processFulltextDocument")
+    # make soup
+    soup = BeautifulSoup(tei_text, "xml")
+    # get paper
+    paper = convert_tei_xml_soup_to_s2orc_json(soup, input_file, sha)
+    return paper.release_json('pdf')
+def process_pdf_file(
+        input_file: str,
+        temp_dir: str = BASE_TEMP_DIR,
+        output_dir: str = BASE_OUTPUT_DIR,
+        grobid_config: Optional[Dict] = None
+) -> str:
+    """
+    Process a PDF file and get JSON representation
+    :param input_file:
+    :param temp_dir:
+    :param output_dir:
+    :return:
+    """
+    os.makedirs(temp_dir, exist_ok=True)
+    os.makedirs(output_dir, exist_ok=True)
+    # get paper id as the name of the file
+    paper_id = '.'.join(input_file.split('/')[-1].split('.')[:-1])
+    tei_file = os.path.join(temp_dir, f'{paper_id}.tei.xml')
+    output_file = os.path.join(output_dir, f'{paper_id}.json')
+    # check if input file exists and output file doesn't
+    if not os.path.exists(input_file):
+        raise FileNotFoundError(f"{input_file} doesn't exist")
+    if os.path.exists(output_file):
+        print(f'{output_file} already exists!')
+    # process PDF through Grobid -> TEI.XML
+    client = GrobidClient(grobid_config)
+    # TODO: compute PDF hash
+    # TODO: add grobid version number to output
+    client.process_pdf(input_file, temp_dir, "processFulltextDocument")
+    # process TEI.XML -> JSON
+    assert os.path.exists(tei_file)
+    paper = convert_tei_xml_file_to_s2orc_json(tei_file)
+    # write to file
+    with open(output_file, 'w') as outf:
+        json.dump(paper.release_json(), outf, indent=4, sort_keys=False)
+    return output_file
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description="Run S2ORC PDF2JSON")
+    parser.add_argument("-i", "--input", default=None, help="path to the input PDF file")
+    parser.add_argument("-t", "--temp", default=BASE_TEMP_DIR, help="path to the temp dir for putting tei xml files")
+    parser.add_argument("-o", "--output", default=BASE_OUTPUT_DIR, help="path to the output dir for putting json files")
+    parser.add_argument("-k", "--keep", action='store_true')
+    args = parser.parse_args()
+    input_path = args.input
+    temp_path = args.temp
+    output_path = args.output
+    keep_temp = args.keep
+    start_time = time.time()
+    os.makedirs(temp_path, exist_ok=True)
+    os.makedirs(output_path, exist_ok=True)
+    process_pdf_file(input_path, temp_path, output_path)
+    runtime = round(time.time() - start_time, 3)
+    print("runtime: %s seconds " % (runtime))
+    print('done.')

s2orc-doc2json/doc2json/grobid2json/tei_to_json.py ADDED Viewed

	@@ -0,0 +1,750 @@

+#!/usr/bin/env python
+import os
+import sys
+import bs4
+import re
+from bs4 import BeautifulSoup, NavigableString
+from typing import List, Dict, Tuple
+from doc2json.s2orc import Paper
+from doc2json.utils.grobid_util import parse_bib_entry, extract_paper_metadata_from_grobid_xml
+from doc2json.utils.citation_util import SINGLE_BRACKET_REGEX, BRACKET_REGEX, BRACKET_STYLE_THRESHOLD
+from doc2json.utils.citation_util import is_expansion_string, _clean_empty_and_duplicate_authors_from_grobid_parse
+from doc2json.utils.refspan_util import sub_spans_and_update_indices
+# Literal substring substitutions applied by table_to_html to the serialized
+# table: each key is replaced by its value (row -> tr, cell -> td, cols -> colspan).
+REPLACE_TABLE_TOKS = {
+    "<row>": "<tr>",
+    "<row/>": "<tr/>",
+    "</row>": "</tr>",
+    "<cell>": "<td>",
+    "<cell/>": "<td/>",
+    "</cell>": "</td>",
+    "<cell ": "<td ",
+    "cols=": "colspan="
+}
+class UniqTokenGenerator:
+    """
+    Generate unique token
+    """
+    def __init__(self, tok_string):
+        self.tok_string = tok_string
+        self.ind = 0
+    def __iter__(self):
+        return self
+    def __next__(self):
+        return self.next()
+    def next(self):
+        new_token = f'{self.tok_string}{self.ind}'
+        self.ind += 1
+        return new_token
+def normalize_grobid_id(grobid_id: str):
+    """
+    Normalize grobid object identifiers
+    :param grobid_id:
+    :return:
+    """
+    str_norm = grobid_id.upper().replace('_', '').replace('#', '')
+    if str_norm.startswith('B'):
+        return str_norm.replace('B', 'BIBREF')
+    if str_norm.startswith('TAB'):
+        return str_norm.replace('TAB', 'TABREF')
+    if str_norm.startswith('FIG'):
+        return str_norm.replace('FIG', 'FIGREF')
+    if str_norm.startswith('FORMULA'):
+        return str_norm.replace('FORMULA', 'EQREF')
+    return str_norm
+def parse_bibliography(soup: BeautifulSoup) -> List[Dict]:
+    """
+    Finds all bibliography entries in a grobid xml.
+    """
+    bibliography = soup.listBibl
+    if bibliography is None:
+        return []
+    entries = bibliography.find_all("biblStruct")
+    structured_entries = []
+    for entry in entries:
+        bib_entry = parse_bib_entry(entry)
+        # add bib entry only if it has a title
+        if bib_entry['title']:
+            structured_entries.append(bib_entry)
+    bibliography.decompose()
+    return structured_entries
+def extract_formulas_from_tei_xml(sp: BeautifulSoup) -> None:
+    """
+    Replace all formulas with the text
+    :param sp:
+    :return:
+    """
+    for eq in sp.find_all('formula'):
+        eq.replace_with(sp.new_string(eq.text.strip()))
+def table_to_html(table: bs4.element.Tag) -> str:
+    """
+    Sub table tags with html table tags
+    :param table_str:
+    :return:
+    """
+    for tag in table:
+        if tag.name != 'row':
+            print(f'Unknown table subtag: {tag.name}')
+            tag.decompose()
+    table_str = str(table)
+    for token, subtoken in REPLACE_TABLE_TOKS.items():
+        table_str = table_str.replace(token, subtoken)
+    return table_str
+def extract_figures_and_tables_from_tei_xml(sp: BeautifulSoup) -> Dict[str, Dict]:
+    """
+    Generate figure and table dicts and remove the ``figure`` elements from the soup
+    :param sp: soup to read from and modify
+    :return: dict mapping the normalized ``xml:id`` of each figure/table to a
+        dict with the keys "text", "latex", "type" and "content"
+    """
+    ref_map = dict()
+    for fig in sp.find_all('figure'):
+        try:
+            # figures without an xml:id get no entry but are still removed below
+            if fig.name and fig.get('xml:id'):
+                if fig.get('type') == 'table':
+                    ref_map[normalize_grobid_id(fig.get('xml:id'))] = {
+                        "text": fig.figDesc.text.strip() if fig.figDesc else fig.head.text.strip() if fig.head else "",
+                        "latex": None,
+                        "type": "table",
+                        "content": table_to_html(fig.table)
+                    }
+                else:
+                    ref_map[normalize_grobid_id(fig.get('xml:id'))] = {
+                        "text": fig.figDesc.text.strip() if fig.figDesc else "",
+                        "latex": None,
+                        "type": "figure",
+                        "content": ""
+                    }
+        except AttributeError:
+            # NOTE(review): this skips the decompose() below, so a figure that
+            # raised stays in the soup - confirm that this is intended
+            continue
+        fig.decompose()
+    return ref_map
+def check_if_citations_are_bracket_style(sp: BeautifulSoup) -> bool:
+    """
+    Check if the document has bracket style citations
+    :param sp:
+    :return:
+    """
+    cite_strings = []
+    if sp.body:
+        for div in sp.body.find_all('div'):
+            if div.head:
+                continue
+            for rtag in div.find_all('ref'):
+                ref_type = rtag.get('type')
+                if ref_type == 'bibr':
+                    cite_strings.append(rtag.text.strip())
+        # check how many match bracket style
+        bracket_style = [bool(BRACKET_REGEX.match(cite_str)) for cite_str in cite_strings]
+        # return true if
+        if sum(bracket_style) > BRACKET_STYLE_THRESHOLD:
+            return True
+    return False
+def sub_all_note_tags(sp: BeautifulSoup) -> BeautifulSoup:
+    """
+    Sub all note tags with p tags
+    :param sp: soup to modify in place
+    :return: the same soup object
+    """
+    for ntag in sp.find_all('note'):
+        p_tag = sp.new_tag('p')
+        # only the stripped text of the note is carried over, not its markup
+        p_tag.string = ntag.text.strip()
+        ntag.replace_with(p_tag)
+    return sp
+def process_formulas_in_paragraph(para_el: BeautifulSoup, sp: BeautifulSoup) -> None:
+    """
+    Process all formulas in paragraph and replace with text and label
+    :param para_el: element whose ``formula`` descendants are replaced in place
+    :param sp: soup used to create the replacement strings
+    :return:
+    """
+    for ftag in para_el.find_all('formula'):
+        # get label if exists and insert a space between formula and label
+        if ftag.label:
+            label = ' ' + ftag.label.text
+            # drop the label element so its text is not repeated in ftag.text below
+            ftag.label.decompose()
+        else:
+            label = ''
+        ftag.replace_with(sp.new_string(f'{ftag.text.strip()}{label}'))
+def process_references_in_paragraph(para_el: BeautifulSoup, sp: BeautifulSoup, refs: Dict) -> Dict:
+    """
+    Process all references in paragraph and generate a dict mapping each
+    inserted REFTOKEN key to a tuple (ref_id or None, surface_form, type)
+    :param para_el: element whose ``ref`` descendants are processed in place
+    :param sp: soup used to create the replacement strings
+    :param refs: container of the known normalized figure/table ids (only membership is tested)
+    :return: the dict described above
+    """
+    tokgen = UniqTokenGenerator('REFTOKEN')
+    ref_dict = dict()
+    for rtag in para_el.find_all('ref'):
+        try:
+            ref_type = rtag.get('type')
+            # skip if citation
+            if ref_type == 'bibr':
+                continue
+            if ref_type == 'table' or ref_type == 'figure':
+                ref_id = rtag.get('target')
+                if ref_id and normalize_grobid_id(ref_id) in refs:
+                    # normalize reference string
+                    rtag_string = normalize_grobid_id(ref_id)
+                else:
+                    # unknown or missing target: keep the mention, without an id
+                    rtag_string = None
+                # add to ref set
+                ref_key = tokgen.next()
+                ref_dict[ref_key] = (rtag_string, rtag.text.strip(), ref_type)
+                # the tag is replaced by its token, padded with spaces
+                rtag.replace_with(sp.new_string(f" {ref_key} "))
+            else:
+                # replace with surface form
+                rtag.replace_with(sp.new_string(rtag.text.strip()))
+        except AttributeError:
+            continue
+    return ref_dict
+def process_citations_in_paragraph(para_el: BeautifulSoup, sp: BeautifulSoup, bibs: Dict, bracket: bool) -> Dict:
+    """
+    Replace the citation <ref> tags of a paragraph with placeholder tokens and return the token map.
+    The paragraph element is modified in place: each handled tag is swapped for a string holding
+    either a generated cite token or the tag's plain surface text.
+    :param para_el: paragraph element whose 'ref' tags are processed
+    :param sp: soup used to create the replacement strings (via new_string)
+    :param bibs: bibliography keyed by normalized reference id; only key membership is checked here
+    :param bracket: if True, apply bracket-style handling: tags whose surface form does not look
+        like a bracket citation are reduced to plain text, and ranges such as [1]-[4] are expanded
+    :return: dict mapping each cite token to a tuple (ref id or None, surface form)
+    """
+    # CHECK if range between two surface forms is appropriate for bracket style expansion.
+    # Returns (start number, end number), or None when either surface form does not match
+    # SINGLE_BRACKET_REGEX or the numbers do not form an acceptable range.
+    def _get_surface_range(start_surface, end_surface):
+        span1_match = SINGLE_BRACKET_REGEX.match(start_surface)
+        span2_match = SINGLE_BRACKET_REGEX.match(end_surface)
+        if span1_match and span2_match:
+            # get numbers corresponding to citations
+            span1_num = int(span1_match.group(1))
+            span2_num = int(span2_match.group(1))
+            # expand only if the difference is strictly between 1 and 20 (i.e. 2..19);
+            # adjacent, equal or descending numbers are rejected
+            if 1 < span2_num - span1_num < 20:
+                return span1_num, span2_num
+        return None
+    # CREATE BIBREF range between two reference ids, e.g. BIBREF1-BIBREF4 -> BIBREF1 BIBREF2 BIBREF3 BIBREF4
+    # The first six characters of each id are skipped (the length of 'BIBREF'); the rest must parse as int.
+    def _create_ref_id_range(start_ref_id, end_ref_id):
+        start_ref_num = int(start_ref_id[6:])
+        end_ref_num = int(end_ref_id[6:])
+        return [f'BIBREF{curr_ref_num}' for curr_ref_num in range(start_ref_num, end_ref_num + 1)]
+    # CREATE surface form range between two bracket strings, e.g. [1]-[4] -> [1] [2] [3] [4]
+    def _create_surface_range(start_number, end_number):
+        return [f'[{n}]' for n in range(start_number, end_number + 1)]
+    # create citation dict with keywords
+    cite_map = dict()
+    tokgen = UniqTokenGenerator('CITETOKEN')
+    for rtag in para_el.find_all('ref'):
+        try:
+            # get surface span, e.g. [3]
+            surface_span = rtag.text.strip()
+            # check if target is available (#b2 -> BID2)
+            if rtag.get('target'):
+                # normalize reference string
+                rtag_ref_id = normalize_grobid_id(rtag.get('target'))
+                # target is not in the bibliography: keep the span but record it as unlinked (None)
+                if rtag_ref_id not in bibs:
+                    cite_key = tokgen.next()
+                    rtag.replace_with(sp.new_string(f" {cite_key} "))
+                    cite_map[cite_key] = (None, surface_span)
+                    continue
+                # if bracket style, only keep if surface form is bracket
+                if bracket:
+                    # valid bracket span: starts with '[' or ends with ']' or ','
+                    if surface_span and (surface_span[0] == '[' or surface_span[-1] == ']' or surface_span[-1] == ','):
+                        pass
+                    # invalid, replace tag with surface form and continue to next ref tag
+                    # (no token is generated, so this mention does not appear in the returned map)
+                    else:
+                        rtag.replace_with(sp.new_string(f" {surface_span} "))
+                        continue
+                # not bracket, add cite span and move on
+                else:
+                    cite_key = tokgen.next()
+                    rtag.replace_with(sp.new_string(f" {cite_key} "))
+                    cite_map[cite_key] = (rtag_ref_id, surface_span)
+                    continue
+                ### EXTRA PROCESSING FOR BRACKET STYLE CITATIONS; EXPAND RANGES ###
+                # look backward for range marker, e.g. [1]-*[3]*
+                # collect the plain text between this tag and the previous ref tag; stop at the
+                # first ref tag or at any other non-string sibling
+                backward_between_span = ""
+                for sib in rtag.previous_siblings:
+                    if sib.name == 'ref':
+                        break
+                    elif type(sib) == NavigableString:
+                        backward_between_span += sib
+                    else:
+                        break
+                # check if there's a backwards expansion, e.g. need to expand [1]-[3] -> [1] [2] [3]
+                if is_expansion_string(backward_between_span):
+                    # get surface number range
+                    # NOTE(review): find_previous_sibling('ref') returns None when there is no earlier
+                    # ref sibling; the resulting AttributeError is swallowed by the except below
+                    surface_num_range = _get_surface_range(
+                        rtag.find_previous_sibling('ref').text.strip(),
+                        surface_span
+                    )
+                    # if the surface number range is reasonable (range < 20, in order), EXPAND
+                    if surface_num_range:
+                        # delete previous ref tag and anything in between (i.e. delete "-" and extra spaces)
+                        for sib in rtag.previous_siblings:
+                            if sib.name == 'ref':
+                                break
+                            elif type(sib) == NavigableString:
+                                sib.replace_with(sp.new_string(""))
+                            else:
+                                break
+                        # get ref id of previous ref, e.g. [1] (#b0 -> BID0)
+                        previous_rtag = rtag.find_previous_sibling('ref')
+                        previous_rtag_ref_id = normalize_grobid_id(previous_rtag.get('target'))
+                        previous_rtag.decompose()
+                        # replace this ref tag with the full range expansion, e.g. [3] (#b2 -> BID1 BID2)
+                        id_range = _create_ref_id_range(previous_rtag_ref_id, rtag_ref_id)
+                        surface_range = _create_surface_range(surface_num_range[0], surface_num_range[1])
+                        replace_string = ''
+                        # zip stops at the shorter list, so ids and surface forms are paired positionally
+                        for range_ref_id, range_surface_form in zip(id_range, surface_range):
+                            # only replace if ref id is in bibliography, else add none
+                            if range_ref_id in bibs:
+                                cite_key = tokgen.next()
+                                cite_map[cite_key] = (range_ref_id, range_surface_form)
+                            else:
+                                cite_key = tokgen.next()
+                                cite_map[cite_key] = (None, range_surface_form)
+                            replace_string += cite_key + ' '
+                        rtag.replace_with(sp.new_string(f" {replace_string} "))
+                    # ELSE do not expand backwards and replace previous and current rtag with appropriate ref id
+                    else:
+                        # add mapping between ref id and surface form for previous ref tag
+                        previous_rtag = rtag.find_previous_sibling('ref')
+                        previous_rtag_ref_id = normalize_grobid_id(previous_rtag.get('target'))
+                        previous_rtag_surface = previous_rtag.text.strip()
+                        cite_key = tokgen.next()
+                        previous_rtag.replace_with(sp.new_string(f" {cite_key} "))
+                        cite_map[cite_key] = (previous_rtag_ref_id, previous_rtag_surface)
+                        # add mapping between ref id and surface form for current reftag
+                        cite_key = tokgen.next()
+                        rtag.replace_with(sp.new_string(f" {cite_key} "))
+                        cite_map[cite_key] = (rtag_ref_id, surface_span)
+                else:
+                    # look forward and see if expansion string, e.g. *[1]*-[3]
+                    forward_between_span = ""
+                    for sib in rtag.next_siblings:
+                        if sib.name == 'ref':
+                            break
+                        elif type(sib) == NavigableString:
+                            forward_between_span += sib
+                        else:
+                            break
+                    # look forward for range marker (if is a range, continue -- range will be expanded
+                    # when we get to the second value); this tag is deliberately left in place so the
+                    # later iteration can find it as a previous sibling
+                    if is_expansion_string(forward_between_span):
+                        continue
+                    # else treat like normal reference
+                    else:
+                        cite_key = tokgen.next()
+                        rtag.replace_with(sp.new_string(f" {cite_key} "))
+                        cite_map[cite_key] = (rtag_ref_id, surface_span)
+            # no target attribute: keep the span as an unlinked citation
+            else:
+                cite_key = tokgen.next()
+                rtag.replace_with(sp.new_string(f" {cite_key} "))
+                cite_map[cite_key] = (None, surface_span)
+        # best effort: an attribute access on a missing element skips this tag, leaving it
+        # (and anything not yet replaced for it) as-is in the paragraph
+        except AttributeError:
+            continue
+    return cite_map
+def process_paragraph(
+        sp: BeautifulSoup,
+        para_el: bs4.element.Tag,
+        section_names: List[Tuple],
+        bib_dict: Dict,
+        ref_dict: Dict,
+        bracket: bool
+) -> Dict:
+    """
+    Process one paragraph
+    :param sp:
+    :param para_el:
+    :param section_names:
+    :param bib_dict:
+    :param ref_dict:
+    :param bracket: if bracket style, expand and clean up citations
+    :return:
+    """
+    # return empty paragraph if no text
+    if not para_el.text:
+        return {
+            'text': "",
+            'cite_spans': [],
+            'ref_spans': [],
+            'eq_spans': [],
+            'section': section_names
+        }
+    # replace formulas with formula text
+    process_formulas_in_paragraph(para_el, sp)
+    # get references to tables and figures
+    ref_map = process_references_in_paragraph(para_el, sp, ref_dict)
+    # generate citation map for paragraph element (keep only cite spans with bib entry or unlinked)
+    cite_map = process_citations_in_paragraph(para_el, sp, bib_dict, bracket)
+    # substitute space characters
+    para_text = re.sub(r'\s+', ' ', para_el.text)
+    para_text = re.sub(r'\s', ' ', para_text)
+    # get all cite and ref spans
+    all_spans_to_replace = []
+    for span in re.finditer(r'(CITETOKEN\d+)', para_text):
+        uniq_token = span.group()
+        ref_id, surface_text = cite_map[uniq_token]
+        all_spans_to_replace.append((
+            span.start(),
+            span.start() + len(uniq_token),
+            uniq_token,
+            surface_text
+        ))
+    for span in re.finditer(r'(REFTOKEN\d+)', para_text):
+        uniq_token = span.group()
+        ref_id, surface_text, ref_type = ref_map[uniq_token]
+        all_spans_to_replace.append((
+            span.start(),
+            span.start() + len(uniq_token),
+            uniq_token,
+            surface_text
+        ))
+    # replace cite and ref spans and create json blobs
+    para_text, all_spans_to_replace = sub_spans_and_update_indices(all_spans_to_replace, para_text)
+    cite_span_blobs = [{
+        "start": start,
+        "end": end,
+        "text": surface,
+        "ref_id": cite_map[token][0]
+    } for start, end, token, surface in all_spans_to_replace if token.startswith('CITETOKEN')]
+    ref_span_blobs = [{
+        "start": start,
+        "end": end,
+        "text": surface,
+        "ref_id": ref_map[token][0]
+    } for start, end, token, surface in all_spans_to_replace if token.startswith('REFTOKEN')]
+    for cite_blob in cite_span_blobs:
+        assert para_text[cite_blob["start"]:cite_blob["end"]] == cite_blob["text"]
+    for ref_blob in ref_span_blobs:
+        assert para_text[ref_blob["start"]:ref_blob["end"]] == ref_blob["text"]
+    return {
+        'text': para_text,
+        'cite_spans': cite_span_blobs,
+        'ref_spans': ref_span_blobs,
+        'eq_spans': [],
+        'section': section_names
+    }
+def extract_abstract_from_tei_xml(
+        sp: BeautifulSoup,
+        bib_dict: Dict,
+        ref_dict: Dict,
+        cleanup_bracket: bool
+) -> List[Dict]:
+    """
+    Parse abstract from soup
+    :param sp:
+    :param bib_dict:
+    :param ref_dict:
+    :param cleanup_bracket:
+    :return:
+    """
+    abstract_text = []
+    if sp.abstract:
+        # process all divs
+        if sp.abstract.div:
+            for div in sp.abstract.find_all('div'):
+                if div.text:
+                    if div.p:
+                        for para in div.find_all('p'):
+                            if para.text:
+                                abstract_text.append(
+                                    process_paragraph(sp, para, [(None, "Abstract")], bib_dict, ref_dict, cleanup_bracket)
+                                )
+                    else:
+                        if div.text:
+                            abstract_text.append(
+                                process_paragraph(sp, div, [(None, "Abstract")], bib_dict, ref_dict, cleanup_bracket)
+                            )
+        # process all paragraphs
+        elif sp.abstract.p:
+            for para in sp.abstract.find_all('p'):
+                if para.text:
+                    abstract_text.append(
+                        process_paragraph(sp, para, [(None, "Abstract")], bib_dict, ref_dict, cleanup_bracket)
+                    )
+        # else just try to get the text
+        else:
+            if sp.abstract.text:
+                abstract_text.append(
+                    process_paragraph(sp, sp.abstract, [(None, "Abstract")], bib_dict, ref_dict, cleanup_bracket)
+                )
+        sp.abstract.decompose()
+    return abstract_text
+def extract_body_text_from_div(
+        sp: BeautifulSoup,
+        div: bs4.element.Tag,
+        sections: List[Tuple],
+        bib_dict: Dict,
+        ref_dict: Dict,
+        cleanup_bracket: bool
+) -> List[Dict]:
+    """
+    Parse body text from soup
+    :param sp:
+    :param div:
+    :param sections:
+    :param bib_dict:
+    :param ref_dict:
+    :param cleanup_bracket:
+    :return:
+    """
+    chunks = []
+    # check if nested divs; recursively process
+    if div.div:
+        for subdiv in div.find_all('div'):
+            # has header, add to section list and process
+            if subdiv.head:
+                chunks += extract_body_text_from_div(
+                    sp,
+                    subdiv,
+                    sections + [(subdiv.head.get('n', None), subdiv.head.text.strip())],
+                    bib_dict,
+                    ref_dict,
+                    cleanup_bracket
+                )
+                subdiv.head.decompose()
+            # no header, process with same section list
+            else:
+                chunks += extract_body_text_from_div(
+                    sp,
+                    subdiv,
+                    sections,
+                    bib_dict,
+                    ref_dict,
+                    cleanup_bracket
+                )
+    # process tags individuals
+    for tag in div:
+        try:
+            if tag.name == 'p':
+                if tag.text:
+                    chunks.append(process_paragraph(
+                        sp, tag, sections, bib_dict, ref_dict, cleanup_bracket
+                    ))
+            elif tag.name == 'formula':
+                # e.g. <formula xml:id="formula_0">Y = W T X.<label>(1)</label></formula>
+                label = tag.label.text
+                tag.label.decompose()
+                eq_text = tag.text
+                chunks.append({
+                    'text': 'EQUATION',
+                    'cite_spans': [],
+                    'ref_spans': [],
+                    'eq_spans': [
+                        {
+                            "start": 0,
+                            "end": 8,
+                            "text": "EQUATION",
+                            "ref_id": "EQREF",
+                            "raw_str": eq_text,
+                            "eq_num": label
+                        }
+                    ],
+                    'section': sections
+                })
+        except AttributeError:
+            if tag.text:
+                chunks.append(process_paragraph(
+                    sp, tag, sections, bib_dict, ref_dict, cleanup_bracket
+                ))
+    return chunks
+def extract_body_text_from_tei_xml(
+        sp: BeautifulSoup,
+        bib_dict: Dict,
+        ref_dict: Dict,
+        cleanup_bracket: bool
+) -> List[Dict]:
+    """
+    Parse body text from soup
+    :param sp:
+    :param bib_dict:
+    :param ref_dict:
+    :param cleanup_bracket:
+    :return:
+    """
+    body_text = []
+    if sp.body:
+        body_text = extract_body_text_from_div(sp, sp.body, [], bib_dict, ref_dict, cleanup_bracket)
+        sp.body.decompose()
+    return body_text
+def extract_back_matter_from_tei_xml(
+        sp: BeautifulSoup,
+        bib_dict: Dict,
+        ref_dict: Dict,
+        cleanup_bracket: bool
+) -> List[Dict]:
+    """
+    Parse back matter from soup
+    :param sp:
+    :param bib_dict:
+    :param ref_dict:
+    :param cleanup_bracket:
+    :return:
+    """
+    back_text = []
+    if sp.back:
+        for div in sp.back.find_all('div'):
+            if div.get('type'):
+                section_type = div.get('type')
+            else:
+                section_type = ''
+            for child_div in div.find_all('div'):
+                if child_div.head:
+                    section_title = child_div.head.text.strip()
+                    section_num = child_div.head.get('n', None)
+                    child_div.head.decompose()
+                else:
+                    section_title = section_type
+                    section_num = None
+                if child_div.text:
+                    if child_div.text:
+                        back_text.append(
+                            process_paragraph(sp, child_div, [(section_num, section_title)], bib_dict, ref_dict, cleanup_bracket)
+                        )
+        sp.back.decompose()
+    return back_text
+def convert_tei_xml_soup_to_s2orc_json(soup: BeautifulSoup, paper_id: str, pdf_hash: str) -> Paper:
+    """
+    Convert Grobid TEI XML to S2ORC json format
+    :param soup: BeautifulSoup of XML file content
+    :param paper_id: name of file
+    :param pdf_hash: hash of PDF
+    :return:
+    """
+    # extract metadata
+    metadata = extract_paper_metadata_from_grobid_xml(soup.fileDesc)
+    # clean metadata authors (remove dupes etc)
+    metadata['authors'] = _clean_empty_and_duplicate_authors_from_grobid_parse(metadata['authors'])
+    # parse bibliography entries (removes empty bib entries)
+    biblio_entries = parse_bibliography(soup)
+    bibkey_map = {
+        normalize_grobid_id(bib['ref_id']): bib for bib in biblio_entries
+    }
+    # # process formulas and replace with text
+    # extract_formulas_from_tei_xml(soup)
+    # extract figure and table captions
+    refkey_map = extract_figures_and_tables_from_tei_xml(soup)
+    # get bracket style
+    is_bracket_style = check_if_citations_are_bracket_style(soup)
+    # substitute all note tags with p tags
+    soup = sub_all_note_tags(soup)
+    # process abstract if possible
+    abstract_entries = extract_abstract_from_tei_xml(soup, bibkey_map, refkey_map, is_bracket_style)
+    # process body text
+    body_entries = extract_body_text_from_tei_xml(soup, bibkey_map, refkey_map, is_bracket_style)
+    # parse back matter (acks, author statements, competing interests, abbrevs etc)
+    back_matter = extract_back_matter_from_tei_xml(soup, bibkey_map, refkey_map, is_bracket_style)
+    # form final paper entry
+    return Paper(
+        paper_id=paper_id,
+        pdf_hash=pdf_hash,
+        metadata=metadata,
+        abstract=abstract_entries,
+        body_text=body_entries,
+        back_matter=back_matter,
+        bib_entries=bibkey_map,
+        ref_entries=refkey_map
+    )
+def convert_tei_xml_file_to_s2orc_json(tei_file: str, pdf_hash: str = "") -> Paper:
+    """
+    Convert a TEI XML file to S2ORC JSON
+    :param tei_file:
+    :param pdf_hash:
+    :return:
+    """
+    if not os.path.exists(tei_file):
+        raise FileNotFoundError("Input TEI XML file doesn't exist")
+    paper_id = tei_file.split('/')[-1].split('.')[0]
+    soup = BeautifulSoup(open(tei_file, "rb").read(), "xml")
+    paper = convert_tei_xml_soup_to_s2orc_json(soup, paper_id, pdf_hash)
+    return paper

s2orc-doc2json/doc2json/jats2json/__init__.py ADDED Viewed

File without changes

s2orc-doc2json/doc2json/jats2json/jats_to_json.py ADDED Viewed

	@@ -0,0 +1,341 @@

+"""
+Mostly copied from cite2vec paper_parsing.parse_nxml
+"""
+from typing import List, Set, Dict, Callable
+import os
+import json
+import re
+import multiprocessing
+from bs4 import BeautifulSoup
+from tqdm import tqdm
+from glob import glob
+from pprint import pprint
+from doc2json.utils.soup_utils import destroy_unimportant_tags_inplace
+from doc2json.jats2json.pmc_utils.front_tag_utils import parse_journal_id_tag, parse_journal_name_tag, \
+    parse_title_tag, parse_category_tag, parse_date_tag, parse_doi_tag, parse_pmc_id_tag, parse_pubmed_id_tag, \
+    parse_authors, parse_affiliations, parse_abstract_tag, parse_funding_groups, NoAuthorNamesError
+from doc2json.jats2json.pmc_utils.extract_utils import extract_fig_blobs, extract_table_blobs, extract_suppl_blobs
+from doc2json.jats2json.pmc_utils.all_tag_utils import replace_xref_with_string_placeholders, \
+    replace_sup_sub_tags_with_string_placeholders, recurse_parse_section
+from doc2json.jats2json.pmc_utils.all_tag_utils import parse_all_paragraphs_in_section
+from doc2json.jats2json.pmc_utils.back_tag_utils import parse_bib_entries
+from doc2json.s2orc import Paper
+def process_front_tag(front_tag, soup) -> Dict:
+    # process <journal-meta> tags
+    journal_id: str = parse_journal_id_tag(front_tag=front_tag)
+    journal_name: str = parse_journal_name_tag(front_tag=front_tag)
+    # process <article-meta> tags
+    title: str = parse_title_tag(front_tag=front_tag)
+    try:
+        authors: List[Dict] = parse_authors(front_tag=front_tag)
+    except NoAuthorNamesError:
+        authors: List[Dict] = []
+    affiliations: Dict = parse_affiliations(front_tag=front_tag)
+    dates: Dict = parse_date_tag(front_tag=front_tag)
+    pubmed_id: str = parse_pubmed_id_tag(front_tag=front_tag)
+    pmc_id: str = parse_pmc_id_tag(front_tag=front_tag)
+    doi: str = parse_doi_tag(front_tag=front_tag)
+    abstract: List[Dict] = parse_abstract_tag(front_tag=front_tag, soup=soup)
+    # categories: str = parse_category_tag(front_tag=front_tag)
+    funding_groups: List[str] = parse_funding_groups(front_tag=front_tag)
+    return {
+        'title': title,
+        'abstract': abstract,
+        'authors': authors,
+        'affiliations': affiliations,
+        'journal_id': journal_id,
+        'journal_name': journal_name,
+        'pubmed_id': pubmed_id,
+        'pmc_id': pmc_id,
+        'doi': doi,
+        'year': dates,
+        'funding_groups': funding_groups
+    }
+def process_body_tag(body_tag, soup) -> Dict:
+    # replace all xref tags with string placeholders
+    replace_xref_with_string_placeholders(soup_tag=body_tag, soup=soup)
+    # replace all sup/sub tags with string placeholders
+    replace_sup_sub_tags_with_string_placeholders(soup_tag=body_tag, soup=soup)
+    # some articles (like PMC2844102) have no sections
+    sec_tags = body_tag.find_all('sec', recursive=False)
+    # try looking in article tag
+    if not sec_tags:
+        try:
+            sec_tags = body_tag.article.find_all('sec', recursive=False)
+        except:
+            pass
+    if sec_tags:
+        all_par_blobs = []
+        for sec_tag in sec_tags:
+            # note; most sections dont have this 'sec-type' attribute
+            if sec_tag.get('sec-type') == 'supplementary-material':
+                # hopefully all the important supplementary content already extracted above in previous step
+                continue
+            else:
+                par_blobs = recurse_parse_section(sec_tag=sec_tag)
+                all_par_blobs.extend(par_blobs)
+    else:
+        all_par_blobs = parse_all_paragraphs_in_section(body_tag)
+    return {
+        'body_text': all_par_blobs,
+    }
+def process_back_tag(back_tag) -> Dict:
+    # glossary = {}
+    # if back_tag.find('glossary'):
+    #     for def_item_tag in back_tag.find('glossary').find_all('def-item'):
+    #         glossary[def_item_tag.find('term').text] = def_item_tag.find('def').text
+    # TODO: author contrib and COIs
+    # notes = []
+    # for notes_tag in back_tag.find_all('notes'):
+    #     pass
+    # TODO: PMC2778891 has back tag that looks like:  <back><sec><title>Acknowledgements</title><p>Supported by the Austrian Science Fund (P-20670 and W11).</p></sec></back>
+    #       that is, it doesn't have 'ack' section.
+    acknowledgements: List[Dict] = []
+    for ack_tag in back_tag.find_all('ack'):
+        title_tag = ack_tag.find('title')
+        for par_tag in ack_tag.find_all('p'):
+            acknowledgements.append({
+                'section': title_tag.text if title_tag is not None else None,
+                'text': par_tag.text,
+                'funding_sources': [fund_tag.text for fund_tag in par_tag.find_all('funding-source')],
+                'urls': [url_tag.text for url_tag in par_tag.find_all('ext-link')]
+            })
+    bib_entries = parse_bib_entries(back_tag)
+    return {
+        'acknowledgements': acknowledgements,
+        'bib_entries': bib_entries,
+    }
+def postprocess_front_tags_for_s2orc(init_front_dict: Dict):
+    """
+    Fix authors and year for S2ORC format
+    """
+    # Make authors in front tags look like S2ORC
+    for a in init_front_dict['authors']:
+        a['affiliation'] = {}
+        # get affiliation if available
+        if a['affiliation_ids']:
+            affil_id = a['affiliation_ids'][0]
+            affil_text = [affil['text'] for affil in init_front_dict['affiliations'] if affil['id'] == affil_id]
+            if affil_text:
+                a['affiliation'] = {
+                    'laboratory': "",
+                    'institution': affil_text[0],
+                    'location': {}
+                }
+        del a['affiliation_ids']
+        del a['corresponding']
+        del a['orcid']
+    del init_front_dict['affiliations']
+    # Pick best year and make year int in front tags
+    if init_front_dict['year'].get('epub'):
+        year = init_front_dict['year'].get('epub')
+    elif init_front_dict['year'].get('accepted'):
+        year = init_front_dict['year'].get('accepted')
+    elif init_front_dict['year'].get('collection'):
+        year = init_front_dict['year'].get('collection')
+    elif init_front_dict['year'].get('received'):
+        year = init_front_dict['year'].get('received')
+    else:
+        year = None
+    init_front_dict['year'] = year
+    return init_front_dict
+def convert_acks_to_s2orc(paragraphs: List) -> List[Dict]:
+    """
+    Convert acks to S2ORC paragraphs
+    """
+    for paragraph_blob in paragraphs:
+        paragraph_blob['cite_spans'] = []
+        paragraph_blob['ref_spans'] = []
+        del paragraph_blob['funding_sources']
+        del paragraph_blob['urls']
+    return paragraphs
+def convert_paragraphs_to_s2orc(paragraphs: List, old_to_new: Dict) -> List[Dict]:
+    """
+    Convert paragraphs into S2ORC format
+    """
+    # TODO: temp code to process body text into S2ORC format.  this includes getting rid of sub/superscript spans.
+    #       also combining fig & table spans into ref spans.
+    #       also remapping the reference / bib labels to the new ones defined earlier in this function.
+    #       temporarily, we cant support PMC xml parse bibs, so remove all links to the bibliography (cuz they'll be wrong)
+    for paragraph_blob in paragraphs:
+        del paragraph_blob['sup_spans']
+        del paragraph_blob['sub_spans']
+        paragraph_blob['ref_spans'] = []
+        for fig_tab_span in paragraph_blob['fig_spans'] + paragraph_blob['table_spans']:
+            # replace old ref_id with new ref_id.  default to None if null
+            # optional, just wanted to check if this ever happens
+            assert fig_tab_span['ref_id']
+            fig_tab_span['ref_id'] = old_to_new.get(fig_tab_span['ref_id'])
+            paragraph_blob['ref_spans'].append(fig_tab_span)
+        del paragraph_blob['fig_spans']
+        del paragraph_blob['table_spans']
+        for cite_span in paragraph_blob['cite_spans']:
+            # replace old cite ids with new cite ids.  again default to None if null
+            # optional, just wanted to check if this ever happens
+            assert cite_span['ref_id']
+            cite_span['ref_id'] = old_to_new.get(cite_span['ref_id'])
+    return paragraphs
+def convert_jats_xml_to_s2orc_json(jats_file: str, log_dir: str):
+    """
+    Convert JATS XML to S2ORC JSON
+    :param jats_file:
+    :param log_dir:
+    :return:
+    """
+    # get file id (PMC id usually)
+    file_id = jats_file.split('/')[-1].split('.')[0]
+    # read JATS XML
+    with open(jats_file, 'r') as f_in:
+        soup = BeautifulSoup(f_in, 'lxml')
+        destroy_unimportant_tags_inplace(soup, tags_to_remove=['bold', 'italic', 'graphic'])
+    # all the XML files have their own wonky reference IDs.  we want to standardize them, but need to remember the old->new mapping
+    old_key_to_new_key = {}
+    # REFERENCES
+    table_blobs = extract_table_blobs(soup)
+    figure_blobs = extract_fig_blobs(soup)
+    # TODO: not current represented in S2ORC, keep for later
+    suppl_blobs = extract_suppl_blobs(soup)
+    # TODO: for S2ORC, need to process them into a single ref dict.  need to construct new IDs to match ID conventions.  and update all cite spans.
+    #       also, S2ORC table captions are free text without detected reference/citation mentions
+    # TODO: may want to keep table representations around
+    ref_entries = {}
+    for i, (old_table_key, table_blob) in enumerate(sorted(table_blobs.items())):
+        # TODO: PMC2557072 table `tbl5` has no label.  skip.
+        # TODO: PMC3137981 table `tab1` has no caption text.  skip.
+        if not table_blob['label'] or not table_blob['caption']:
+            continue
+        table_text = table_blob['label'] + ': ' + ' '.join(
+            [c['text'] for c in table_blob['caption']]
+        ) + '\n' + ' '.join([f['text'] for f in table_blob['footnote']])
+        new_table_key = f'TABREF{i}'
+        old_key_to_new_key[old_table_key] = new_table_key
+        # TODO: skipping over any citations or references in the table for now
+        if table_blob['xml']:
+            table_content = table_blob['xml'][0]['text']
+        ref_entries[new_table_key] = {'text': table_text, 'content': table_content, 'type': 'table'}
+    for i, (old_figure_key, figure_blob) in enumerate(sorted(figure_blobs.items())):
+        # TODO: double-check, but it seems like figure blobs dont have footnotes parsed out? might be bug
+        # TODO: PMC1326260 first figure has no ['label'].  just skip these for now (because no inline references)
+        # TODO: PMC2403743 has null-valued caption in `fig1`.  also skip here. fix later.
+        if not figure_blob['label'] or not figure_blob['caption']:
+            continue
+        figure_text = figure_blob['label'] + ': ' + ' '.join([c['text'] for c in figure_blob['caption']])
+        new_figure_key = f'FIGREF{i}'
+        old_key_to_new_key[old_figure_key] = new_figure_key
+        ref_entries[new_figure_key] = {'text': figure_text, 'type': 'figure'}
+    # FRONT TAGS
+    front_tag = soup.find('front').extract()
+    front_dict = process_front_tag(front_tag=front_tag, soup=soup)
+    front_dict = postprocess_front_tags_for_s2orc(front_dict)
+    front_dict['abstract'] = convert_paragraphs_to_s2orc(front_dict['abstract'], old_key_to_new_key)
+    # BACK TAGS
+    back_tag = soup.find('back')
+    back_dict = {}
+    # PMC1139917 doesnt have 'back' tag
+    if back_tag is not None:
+        back_dict = process_back_tag(back_tag=back_tag)
+        # TODO: format bib entries to S2ORC format.  we're already very close, but need a couple changes:
+        #       - author blobs include a 'suffix' which defaults to empty string
+        #       - issn defaults to empty string
+        #       - rename all the bib IDs
+        bib_entries = {}
+        for i, (old_bib_key, bib_entry) in enumerate(sorted(back_dict['bib_entries'].items())):
+            del bib_entry['ref_id']
+            new_bib_key = f'BIBREF{i}'
+            old_key_to_new_key[old_bib_key] = new_bib_key
+            bib_entries[new_bib_key] = bib_entry
+    else:
+        bib_entries = {}
+    if back_dict and back_dict.get('acknowledgements'):
+        back_dict['acknowledgements'] = convert_acks_to_s2orc(back_dict['acknowledgements'])
+    # BODY TAGS
+    body_tag = soup.find('body')
+    # PMC1240684 doesnt have 'body' tag
+    if body_tag is not None:
+        body_dict = process_body_tag(body_tag=body_tag, soup=soup)
+        body_text = body_dict['body_text']
+    else:
+        # Has no body: /disk2/gorpus/20200101/pmc/Br_Foreign_Med_Chir_Rev/PMC5163425.nxml
+        body_text = []
+    body_text = convert_paragraphs_to_s2orc(body_text, old_key_to_new_key)
+    metadata = {
+        "title": front_dict['title'],
+        "authors": front_dict['authors'],
+        "year": front_dict['year'],
+        "venue": front_dict['journal_name'],
+        "identifiers": {
+            "doi": front_dict['doi'],
+            "pubmed_id": front_dict['pubmed_id'],
+            "pmc_id": front_dict['pmc_id']
+        }
+    }
+    return Paper(
+        paper_id=file_id,
+        pdf_hash="",
+        metadata=metadata,
+        abstract=front_dict['abstract'],
+        body_text=body_text,
+        back_matter=back_dict.get('acknowledgements', []),
+        bib_entries=bib_entries,
+        ref_entries=ref_entries
+    )
+if __name__ == '__main__':
+    jats_file = 'tests/jats/PMC5828200.nxml'
+    paper = convert_jats_xml_to_s2orc_json(jats_file, 'logs')
+    jats_file = 'tests/jats/PMC6398430.nxml'
+    paper = convert_jats_xml_to_s2orc_json(jats_file, 'logs')
+    jats_file = 'tests/jats/PMC7417471.nxml'
+    paper = convert_jats_xml_to_s2orc_json(jats_file, 'logs')
+    print('done.')

s2orc-doc2json/doc2json/jats2json/pmc_utils/__init__.py ADDED Viewed

File without changes

s2orc-doc2json/doc2json/jats2json/pmc_utils/all_tag_utils.py ADDED Viewed

	@@ -0,0 +1,300 @@

+from typing import Dict, List, Callable
+import re
+import itertools
+from bs4 import BeautifulSoup
+START_TOKENS = {"#!start#", "@!start@", "&!start&"}
+SEP_TOKENS = {"#!sep#"}
+END_TOKENS = {"#!end#", "@!end@", "&!end&"}
+ALL_TOKENS = START_TOKENS | SEP_TOKENS | END_TOKENS
+def replace_xref_with_string_placeholders(soup_tag, soup):
+    # replace all xref tags with string placeholders
+    for xref_tag in soup_tag.find_all("xref"):
+        rid = xref_tag['rid'] if 'rid' in xref_tag.attrs else None
+        ref_type = xref_tag['ref-type'] if 'ref-type' in xref_tag.attrs else None
+        xref_tag.replace_with(
+            soup.new_string(
+                f"#!start#{xref_tag.text}#!sep#{rid}#!sep#{ref_type}#!end#"
+            )
+        )
+def replace_sup_sub_tags_with_string_placeholders(soup_tag, soup):
+    # replace all sup/sub tags with string placeholders
+    for sup_tag in soup_tag.find_all("sup"):
+        sup_tag.replace_with(soup.new_string(f"@!start@{sup_tag.text}@!end@"))
+    for sub_tag in soup_tag.find_all("sub"):
+        sub_tag.replace_with(soup.new_string(f"&!start&{sub_tag.text}&!end&"))
+def recurse_parse_section(
+    sec_tag,
+    # suppl_blobs: Dict
+) -> List[Dict]:
+    """Recursive function for getting paragraph blobs to look like
+        {
+            'text': ...,
+            ...,
+            'section': SUBSUBSECTION_NAME :: SUBSECTION_NAME :: SECTION_NAME
+        }
+    """
+    subsections = sec_tag.find_all("sec", recursive=False)
+    if not subsections:
+        return parse_all_paragraphs_in_section(
+            sec_tag=sec_tag
+        )  # , suppl_blobs=suppl_blobs)
+    else:
+        outputs = []
+        for child in subsections:
+            child_blobs = recurse_parse_section(
+                sec_tag=child
+            )  # , suppl_blobs=suppl_blobs)
+            for blob in child_blobs:
+                # PMC373254 - process blob['section'] to remove any span markers left in there
+                for t in ALL_TOKENS:
+                    blob['section'] = blob['section'].replace(t, '')
+                blob["section"] = blob["section"] + " :: " + sec_tag.find("title").text
+            outputs.extend(child_blobs)
+        return outputs
+def _reduce_args(stack: List, end_token: str) -> List[List]:
+    """Helper function for `_parse_all_paragraphs_in_section`.
+    Pop arguments for the xref off the top of the stack and return a list of argument lists,
+    where the outer lists represent groups divided by separators."""
+    start_token = end_token.replace('end', 'start')
+    sep_token = end_token.replace('end', 'sep')
+    args = [[]]
+    while True:
+        token = stack.pop()
+        if token == start_token:
+            return args
+        elif token == sep_token:
+            args.insert(0, [])
+        else:
+            args[0].insert(0, token)
+def _add_spans(
+    end_token: str,
+    start_pos: int,
+    text: str,
+    ref_id,
+    ref_type,
+    cite_spans: List,
+    fig_spans: List,
+    table_spans: List,
+    sup_spans: List,
+    sub_spans: List,
+):
+    """Helper function used by `_parse_all_paragraphs_in_section`."""
+    if end_token.startswith("#"):  # process xref
+        blob = {
+            "start": start_pos,
+            "end": start_pos + len(text),
+            "mention": text,
+            "ref_id": ref_id,
+        }
+        if ref_type == "bibr":
+            cite_spans.append(blob)
+        elif ref_type == "fig":
+            fig_spans.append(blob)
+        elif ref_type == "table":
+            table_spans.append(blob)
+    else:
+        blob = {
+            "start": start_pos,
+            "end": start_pos + len(text),
+            "mention": text,
+        }
+        if end_token.startswith("@"):
+            sup_spans.append(blob)
+        else:
+            assert end_token.startswith("&")
+            sub_spans.append(blob)
+def get_latex_from_formula(
+    formula_tag
+):
+    if formula_tag.find('tex-math'):
+        latex_text = formula_tag.find('tex-math').text
+        match = re.search(r'\\begin\{document\}(.+)\\end\{document\}', latex_text)
+        if match:
+            return match.group(1).strip('$')
+    return None
+def get_mathml_from_formula(
+    formula_tag
+):
+    if formula_tag.find('mml:math'):
+        return str(formula_tag.find('mml:math'))
+    return None
+def parse_formulas(
+    para_el,
+    sp,
+    replace
+):
+    # sub and get corresponding spans of inline formulas
+    formula_dict = dict()
+    eq_ind = 0
+    for ftag in para_el.find_all('inline-formula'):
+        try:
+            formula_key = f'INLINEFORM{eq_ind}'
+            eq_ind += 1
+            try:
+                formula_text = ftag.find('mml:math').text
+            except:
+                if 'begin{document}' not in ftag.text:
+                    formula_text = ftag.text
+                else:
+                    formula_text = "FORMULA"
+            formula_latex = get_latex_from_formula(ftag)
+            formula_mathml = get_mathml_from_formula(ftag)
+            if not formula_mathml and formula_latex:
+                formula_mathml = latex2mathml.converter.convert(formula_latex)
+            formula_dict[formula_key] = (formula_text, formula_latex, formula_mathml, ftag.get('id'))
+            if replace:
+                ftag.replace_with(sp.new_string(f" {formula_key} "))
+            else:
+                # replace with mathml text if available
+                if formula_text != 'FORMULA':
+                    ftag.replace_with(sp.new_string(f" {formula_text} "))
+        except AttributeError:
+            continue
+    return formula_dict
+def parse_all_paragraphs_in_section(
+    sec_tag,
+    par_to_text: Callable = None,
+    replace_formula=True
+) -> List[Dict]:
+    """Internal function. Assumes section has no nested tags
+    `par_to_text` is an optional function that converts the `par` tag into a string.  by default, calls `par_tag.text`.
+    Returns one dict per <p> found anywhere under `sec_tag`, holding the cleaned paragraph text,
+    the cite/fig/table/sup/sub/eq span lists recovered from the placeholder tokens, and the text
+    of the first <title> under `sec_tag` ('' when there is none) as 'section'.
+    `replace_formula` is forwarded to `parse_formulas` as its `replace` argument.
+    Raises NotImplementedError when a paragraph contains a <display-formula> or <formula> tag.
+    """
+    outputs = []
+    # empty soup, only used as a factory for the new strings created in `parse_formulas`
+    sp = BeautifulSoup('', 'lxml')
+    for par_tag in sec_tag.find_all("p", recursive=True):
+        cite_spans = []
+        fig_spans = []
+        table_spans = []
+        # suppl_spans = []
+        sup_spans = []
+        sub_spans = []
+        # NOTE: re-initialised further down before it is filled
+        eq_spans = []
+        if par_tag.find('display-formula'):
+            raise NotImplementedError('Display formula!')
+        if par_tag.find('formula'):
+            raise NotImplementedError('Formula!')
+        formula_dict = parse_formulas(par_tag, sp, replace_formula)
+        par_text = par_to_text(par_tag) if par_to_text else par_tag.text
+        par_text = re.sub(
+            r"[^\S\n\t]", " ", par_text
+        )  # replaces whitespace but not newline or tab
+        par_text = re.sub(
+            r"  ", " ", par_text
+        )  # replaces two spaces w/ one
+        # Tokenize the text into normal text and special placeholder tokens.
+        pattern = r"(#!start#)|(#!sep#)|(#!end#)|(@!start@)|(@!end@)|(&!start&)|(&!end&)"
+        # re.split also yields None/'' entries for the capture groups; `if tok` drops them
+        tokens = [tok for tok in re.split(pattern, par_text) if tok]
+        # To handle nested structures, use a shift-reduce algorithm to consume the text. Placeholder tags are merged away, and related spans are registered.
+        stack = []
+        full_text = []
+        # `pos` is the length of the visible text emitted so far, i.e. the offset of the next character
+        pos = 0
+        disable_count = False
+        for token in tokens:
+            if token in START_TOKENS:
+                # shift: start marker, current offset, then a separator so that `_reduce_args`
+                # returns the offset as its own first group
+                stack.append(token)
+                stack.append(pos)
+                stack.append(token.replace('start', 'sep'))
+            elif token in SEP_TOKENS:
+                assert stack
+                stack.append(token)
+                # text following a separator is metadata and must not count towards the output text
+                disable_count = True
+            elif token in END_TOKENS:
+                assert stack
+                disable_count = False
+                # reduce: pop everything back to the matching start marker
+                args = _reduce_args(stack, token)
+                start_pos = args[0][0]
+                text = "".join(args[1])
+                # 2 groups: offset + text (sup/sub); 4 groups: offset + text + ref id + ref type (xref)
+                assert len(args) == 2 or len(args) == 4
+                if len(args) == 2:
+                    ref_id, ref_type = None, None
+                elif len(args) == 4:
+                    # `x and x[0]` yields the empty list itself when a group is empty
+                    ref_id = args[2] and args[2][0]
+                    ref_type = args[3] and args[3][0]
+                # push the mention text back so an enclosing placeholder still sees it
+                stack.append(text)
+                _add_spans(
+                    token,
+                    start_pos,
+                    text,
+                    ref_id,
+                    ref_type,
+                    cite_spans,
+                    fig_spans,
+                    table_spans,
+                    sup_spans,
+                    sub_spans,
+                )
+            else:  # just normal text
+                stack.append(token)
+                if not disable_count:  # metadata appearing after a separator
+                    full_text.append(token)
+                    pos += len(token)
+        full_text = "".join(full_text)
+        assert pos == len(full_text)
+        title = sec_tag.find("title")
+        title = title.text if title else ""
+        # get all equation spans
+        eq_spans = []
+        for span in itertools.chain(
+                re.finditer(r'(INLINEFORM\d+)', full_text),
+                re.finditer(r'(DISPLAYFORM\d+)', full_text)
+        ):
+            try:
+                matching_formula = formula_dict[span.group()]
+                eq_spans.append({
+                    "start": span.start(),
+                    "end": span.start() + len(span.group()),
+                    "text": matching_formula[0],
+                    "latex": matching_formula[1],
+                    "mathml": matching_formula[2],
+                    "ref_id": span.group()
+                })
+            except KeyError:
+                # key-looking text that `parse_formulas` did not register; ignore it
+                continue
+        outputs.append(
+            {
+                "text": full_text,
+                'cite_spans': cite_spans,
+                'fig_spans': fig_spans,
+                'table_spans': table_spans,
+                # 'suppl_spans': suppl_spans,
+                'sup_spans': sup_spans,
+                'sub_spans': sub_spans,
+                'eq_spans': eq_spans,
+                "section": title,
+            }
+        )
+    return outputs

s2orc-doc2json/doc2json/jats2json/pmc_utils/back_tag_utils.py ADDED Viewed

	@@ -0,0 +1,56 @@

+from typing import Dict, List
+def _wrap_text(tag):
+    return tag.text if tag else ''
+def parse_authors(authors_tag) -> List:
+    """The PMC XML has a slightly different format than authors listed in front tag."""
+    if not authors_tag:
+        return []
+    authors = []
+    for name_tag in authors_tag.find_all('name', recursive=False):
+        surname = name_tag.find('surname')
+        given_names = name_tag.find('given-names')
+        given_names = given_names.text.split(' ') if given_names else None
+        suffix = name_tag.find('suffix')
+        authors.append({
+            'first': given_names[0] if given_names else '',
+            'middle': given_names[1:] if given_names else [],
+            'last': surname.text if surname else '',
+            'suffix': suffix.text if suffix else ''
+        })
+    return authors
+def parse_bib_entries(back_tag) -> Dict:
+    bib_entries = {}
+    # TODO: PMC2778891 does not have 'ref-list' in its back_tag.  do we even need this, or can directly .find_all('ref')?
+    ref_list_tag = back_tag.find('ref-list')
+    if ref_list_tag:
+        for ref_tag in ref_list_tag.find_all('ref'):
+            # The ref ID and label are semantically swapped between CORD-19 and PMC, lol
+            ref_label = ref_tag['id']
+            ref_id = ref_tag.find('label')
+            authors_tag = ref_tag.find('person-group', {'person-group-type': 'author'})
+            year = ref_tag.find('year')
+            fpage = ref_tag.find('fpage')
+            lpage = ref_tag.find('lpage')
+            pages = f'{fpage.text}-{lpage.text}' if fpage and lpage else None
+            dois = [tag.text for tag in ref_tag.find_all('pub-id', {'pub-id-type': 'doi'})]
+            bib_entries[ref_label] = {
+                'ref_id': _wrap_text(ref_id),
+                'title': _wrap_text(ref_tag.find('article-title')),
+                'authors': parse_authors(authors_tag),
+                'year': int(year.text) if year and year.text.isdigit() else None,
+                'venue': _wrap_text(ref_tag.find('source')),
+                'volume': _wrap_text(ref_tag.find('volume')),
+                'issn': _wrap_text(ref_tag.find('issue')),
+                'pages': pages,
+                'other_ids': {
+                    'DOI': dois,
+                }
+            }
+    return bib_entries

s2orc-doc2json/doc2json/jats2json/pmc_utils/extract_utils.py ADDED Viewed

	@@ -0,0 +1,106 @@

+from typing import Dict
+import bs4
+from bs4 import BeautifulSoup
+from doc2json.jats2json.pmc_utils.all_tag_utils import parse_all_paragraphs_in_section
+def extract_fig_blobs(body_tag) -> Dict:
+    fig_blobs = {}
+    for fig_tag in body_tag.find_all('fig'):
+        fig = fig_tag.extract()
+        label = fig.find('label')
+        fig_blobs[fig['id']] = {
+            'label': label and label.text,
+            'caption': fig.find('caption')
+        }
+    _update_fig_blobs(fig_blobs)
+    return fig_blobs
+def _update_fig_blobs(fig_blobs: Dict):
+    for fig_blob in fig_blobs.values():
+        if fig_blob['caption'] is None:
+            continue
+        # replace non-p tags w/ p tags in figure caption (mostly dealing with title tags, which weren't being extracted before)
+        for tag in fig_blob['caption']:
+            if type(tag) == bs4.element.Tag and tag.name != 'p':
+                tag.name = 'p'
+        par_blobs = parse_all_paragraphs_in_section(sec_tag=fig_blob['caption'], replace_formula=False)
+        for par_blob in par_blobs:
+            del par_blob['section']
+        fig_blob['caption'] = par_blobs
+def extract_table_blobs(body_tag) -> Dict:
+    # note 1: footnotes dont always exist for each table; hence the if statement
+    # note 2: we want to preserve the XML tags for tables, but also need to run it through the regex cleaner for xrefs and other spans
+    #         hence, wrapping all of the table XML text into a fake <p> paragraph tag
+    table_blobs = {}
+    for table_tag in body_tag.find_all('table-wrap'):
+        table = table_tag.extract()
+        label = table.find('label')
+        # TODO: currently restricting to tables with identifiers.  might want to include unreferenced tables once we care more.
+        if table.get('id'):
+            table_blobs[table['id']] = {
+                'label': label and label.text,
+                'caption': table.find('caption'),
+                'footnote': table.find('table-wrap-foot') if table.find('table-wrap-foot') else BeautifulSoup('<p></p>', 'xml'),
+                'xml': BeautifulSoup('<p>' + str(table.find('table')) + '</p>', 'xml')
+            }
+    _update_table_blobs(table_blobs)
+    return table_blobs
+def _update_table_blobs(table_blobs: Dict):
+    for table_blob in table_blobs.values():
+        if table_blob['caption'] is not None:
+            # replace non-p tags w/ p tags in table caption (mostly dealing with title tags, which weren't being extracted before)
+            for tag in table_blob['caption']:
+                if type(tag) == bs4.element.Tag and tag.name != 'p':
+                    tag.name = 'p'
+            par_blobs = parse_all_paragraphs_in_section(sec_tag=table_blob['caption'], replace_formula=False)
+            for par_blob in par_blobs:
+                del par_blob['section']
+            table_blob['caption'] = par_blobs
+        if table_blob['footnote'] is not None:
+            par_blobs = parse_all_paragraphs_in_section(sec_tag=table_blob['footnote'], replace_formula=False)
+            for par_blob in par_blobs:
+                del par_blob['section']
+            table_blob['footnote'] = par_blobs
+        # note: if we dont include `par_to_text` function, the parser will convert all <p> tags to text via `par_tag.text`
+        #       which actually removes all XML tags we wanted to preserve in table.
+        #       by passing in str(), we ensure to keep all of those tags
+        if table_blob['xml'] is not None:
+            par_blobs = parse_all_paragraphs_in_section(sec_tag=table_blob['xml'], par_to_text=str, replace_formula=False)
+            for par_blob in par_blobs:
+                del par_blob['section']
+            table_blob['xml'] = par_blobs
+def extract_suppl_blobs(body_tag) -> Dict:
+    suppl_blobs = {}
+    for suppl_tag in body_tag.find_all('supplementary-material'):
+        suppl = suppl_tag.extract()
+        # We only care about supplementary material that can be referenced (like figures/tables)
+        # for example, we dont care about PMC1139917 which has supplementary material but without an ID
+        if 'id' in suppl:
+            label = suppl.find('label')
+            suppl_blobs[suppl['id']] = {
+                'label': label and label.text,
+                'caption': suppl.find('caption')
+            }
+    _update_suppl_blobs(suppl_blobs)
+    return suppl_blobs
+def _update_suppl_blobs(suppl_blobs: Dict):
+    for suppl_blob in suppl_blobs.values():
+        if suppl_blob['caption'] is None:
+            continue
+        par_blobs = parse_all_paragraphs_in_section(sec_tag=suppl_blob['caption'])
+        for par_blob in par_blobs:
+            del par_blob['section']
+        suppl_blob['caption'] = par_blobs

s2orc-doc2json/doc2json/jats2json/pmc_utils/front_tag_utils.py ADDED Viewed

	@@ -0,0 +1,381 @@

+"""
+Functions for parsing specific `front_tag` soup tags
+"""
+from typing import Dict, List, Optional
+from collections import Counter
+import re
+from doc2json.jats2json.pmc_utils.all_tag_utils import recurse_parse_section, parse_all_paragraphs_in_section, \
+    replace_sup_sub_tags_with_string_placeholders, replace_xref_with_string_placeholders
+class NoAuthorNamesError(Exception):
+    """Known papers that trigger:
+        - PMC3462967
+    """
+    pass
+def parse_journal_id_tag(front_tag) -> str:
+    """
+    front_tag.find_all('journal-id') returns:
+        [
+            <journal-id journal-id-type="nlm-ta">Neurosci J</journal-id>,
+            <journal-id journal-id-type="iso-abbrev">Neurosci J</journal-id>,
+            <journal-id journal-id-type="publisher-id">NEUROSCIENCE</journal-id>
+        ]
+        [
+            <journal-id journal-id-type="nlm-ta">BMC Biochem</journal-id>
+            <journal-id journal-id-type="iso-abbrev">BMC Biochem</journal-id>
+        ]
+    """
+    c = Counter()
+    for tag in front_tag.find_all('journal-id'):
+        c[tag.text] += 1
+        tag.decompose()
+    journal_id, n = c.most_common(1)[0]
+    return journal_id
+def parse_journal_name_tag(front_tag) -> str:
+    """
+    Examples:
+        # Paper 1
+        <journal-title-group>
+            <journal-title>BMC Biochemistry</journal-title>
+        </journal-title-group>
+        # Paper 2
+        <journal-title-group>
+            <journal-title>Neuroscience Journal</journal-title>
+        </journal-title-group>
+    But not all titles are contained within a `journal-title-group`.  See PMC1079901
+        <journal-meta>
+            <journal-id journal-id-type="nlm-ta">
+                Biomed Eng Online
+            </journal-id>
+            <journal-title>
+                BioMedical Engineering OnLine
+            </journal-title>
+        ...
+    """
+    if len(front_tag.find_all('journal-title')) > 1:
+        raise Exception('Multiple journal titles?!')
+    return front_tag.find('journal-title').extract().text
+def parse_pubmed_id_tag(front_tag) -> Optional[str]:
+    """Not every PMC paper has a PMID """
+    pmid_tag = front_tag.find('article-id', {'pub-id-type': 'pmid'})
+    if pmid_tag is None:
+        return None
+    else:
+        return pmid_tag.extract().text
+def parse_pmc_id_tag(front_tag) -> str:
+    return f"PMC{front_tag.find('article-id', {'pub-id-type': 'pmc'}).extract().text}"
+def parse_doi_tag(front_tag) -> Optional[str]:
+    """Not all papers have a DOI"""
+    doi_tag = front_tag.find('article-id', {'pub-id-type': 'doi'})
+    if doi_tag is not None:
+        return doi_tag.extract().text
+    else:
+        return None
+def parse_title_tag(front_tag) -> str:
+    """
+    Examples:
+        # Paper 1
+        <title-group>
+            <article-title>Role of the highly conserved G68 residue in the yeast phosphorelay protein Ypd1: implications for interactions between histidine phosphotransfer (HPt) and response regulator proteins</article-title>
+        </title-group>
+        # Paper 2
+        <title-group>
+            <article-title>Association of Strength and Physical Functions in People with Parkinson's Disease</article-title>
+        </title-group>
+    Want to restrict to `title-group` because sometimes title shows up in <notes> under self-citation
+    """
+    title_group = front_tag.find('title-group').extract()
+    if len(title_group.find_all('article-title')) > 1:
+        raise Exception('Multiple article titles?!')
+    return title_group.find('article-title').text
+def parse_category_tag(front_tag) -> List[str]:
+    """
+    Examples:
+        # Paper 1
+        <article-categories>
+            <subj-group subj-group-type="heading">
+                <subject>Research Article</subject>
+            </subj-group>
+        </article-categories>
+        # Paper 2
+        <article-categories>
+            <subj-group subj-group-type="heading">
+                <subject>Research Article</subject>
+            </subj-group>
+        </article-categories>
+    """
+    if len(front_tag.find_all('subj-group')) > 1 or len(front_tag.find_all('subject')) > 1:
+        raise Exception('Multiple categories?!')
+    article_categories = front_tag.find('article-categories').extract()
+    return article_categories.find('subject').text
+def parse_date_tag(front_tag) -> Dict:
+    """
+    Two sets of tags contain dates:
+        <pub-date pub-type="collection">
+            <year>2018</year>
+        </pub-date>
+        <pub-date pub-type="epub">
+            <day>12</day>
+            <month>12</month>
+            <year>2018</year>
+        </pub-date>
+    And:
+        <history>
+            <date date-type="received">
+                <day>15</day>
+                <month>10</month>
+                <year>2018</year>
+            </date>
+            <date date-type="rev-recd">
+                <day>20</day>
+                <month>11</month>
+                <year>2018</year>
+            </date>
+            <date date-type="accepted">
+                <day>26</day>
+                <month>11</month>
+                <year>2018</year>
+            </date>
+        </history>
+    PMC2557072 has `date` tag with no `day`, only `year` and `month`
+    """
+    out = {}
+    for pub_date in front_tag.find_all('pub-date'):
+        year = pub_date.find('year')
+        month = pub_date.find('month')
+        day = pub_date.find('day')
+        out[pub_date.get('pub-type', 'MISSING_PUB_TYPE')] = '-'.join([tag.text for tag in [year, month, day] if tag is not None])
+        pub_date.decompose()
+    for date in front_tag.find_all('date'):
+        year = date.find('year')
+        month = date.find('month')
+        day = date.find('day')
+        out[date.get('date-type', 'MISSING_DATE_TYPE')] = '-'.join([tag.text for tag in [year, month, day] if tag is not None])
+        date.decompose()
+    return out
+def parse_funding_groups(front_tag) -> List[str]:
+    outs = []
+    for tag in front_tag.find_all():
+        # AND statement skips cases where the two tag types nest within each other; we only process the inner one
+        if (tag.name == 'funding-source' or tag.name == 'funding-statement') and tag.find('funding-source') is None and tag.find('funding-statement') is None:
+            out = {
+                'name': None,
+                'doi': None,
+                'notes': None,
+                # 'raw': str(tag)       # for debugging
+            }
+            # handle institution
+            institution_id_tag = tag.find('institution-id')
+            if institution_id_tag:
+                out['doi'] = institution_id_tag.extract().text.replace('http://dx.doi.org/', '')
+            institution_tag = tag.find('institution')
+            if institution_tag:
+                out['name'] = tag.find('institution').extract().text
+            # handle named content
+            funder_name_tag = tag.find('named-content', {'content-type': 'funder-name'})
+            if funder_name_tag:
+                out['name'] = funder_name_tag.extract().text
+            funder_id_tag = tag.find('named-content', {'content-type': 'funder-identifier'})
+            if funder_id_tag:
+                out['doi'] = funder_id_tag.extract().text.replace('http://dx.doi.org/', '')
+            # handle urls
+            if tag.get('xlink:href'):
+                out['doi'] = tag['xlink:href']
+            # fix DOIs with URLs in them
+            if out['doi']:
+                match = re.search(r'http(s?)://dx.doi.org/(.+)', out['doi'])
+                if match:
+                    out['doi'] = match.group(2)
+            # remainder text is either a name or a full statement
+            text = tag.text
+            if tag.name == 'funding-statement' or ('fund' in text or 'support' in text or 'provide' in text):
+                out['notes'] = text
+            else:
+                # what if something already in 'name'?  observed it's typically empty string; so ignore.
+                if not out['name']:
+                    out['name'] = text
+            # if DOI link is in the name, remove it and parse (PMC5407128)
+            if out['name'] and not out['doi']:
+                pattern = r'\s*http(s?)://dx.doi.org/(.+)$'
+                match = re.search(pattern, out['name'])
+                if match:
+                    out['doi'] = match.group(2)
+                    out['name'] = re.sub(pattern, r'', out['name'])
+            outs.append(out)
+    return outs
+# TODO: didnt want to handle <collab> group names; seemed rare and inconsistent; focus on <contrib> with <name> and <aff>
+def parse_authors(front_tag) -> List[Dict]:
+    authors = []
+    for contrib_tag in front_tag.find_all('contrib'):
+        # skip nesting; just process children (individual authors)
+        if contrib_tag.find_all('contrib'):
+            continue
+        # skip contribs without a name; these should be ones that consist of <collab> tag
+        if contrib_tag.find('name') is None:
+            continue
+        # corresponding tag
+        if (contrib_tag.get('corresp') == 'yes') or (contrib_tag.find('xref', {'ref-type': 'corresp'})):
+            is_corresp = True
+        else:
+            is_corresp = False
+        # orcid ID is sometimes a URL or just a number.  standardize as hyphenized number.
+        if contrib_tag.find('contrib-id'):
+            orcid_id = contrib_tag.find('contrib-id').text
+            match = re.search(r'http(s?)://orcid.org/(.+)', orcid_id)
+            if match:
+                orcid_id = match.group(2)
+            # A very small number of articles have ID type CATS, which we don't handle. For example:
+            #   /disk2/gorpus/20200101/pmc/Change/PMC6176774.nxml
+            if len(orcid_id) != 19:
+                orcid_id = None
+        else:
+            orcid_id = None
+        # Email may or may not be present.
+        email = contrib_tag.find('email')
+        email = email.text if email else None
+        # Get the name info for the author.
+        name_info = {name_tag.name: name_tag.text for name_tag in contrib_tag.find('name').find_all()}
+        # TODO: PMC3462967 is an Erratum. It does not have ['given-names'].  not sure we care about those, so try-catch for now
+        try:
+            given_names = name_info['given-names'].split(' ')
+        except KeyError as e:
+            raise NoAuthorNamesError
+        authors.append({
+            'first': given_names[0] if given_names else None,
+            'middle': given_names[1:] if given_names else None,
+            'last': name_info['surname'],
+            'suffix': name_info.get('suffix', ''),
+            'email': email,
+            'affiliation_ids': [xref_tag.get('rid') for xref_tag in contrib_tag.find_all('xref', {'ref-type': 'aff'})],
+            'corresponding': is_corresp,
+            'orcid': orcid_id
+        })
+        # authors.append(str(contrib_tag.extract()))
+    return authors
+def parse_affiliations(front_tag) -> List[Dict]:
+    """
+    Sometimes affiliations is nested within '<contrib-group>' along with
+    authors.  Sometimes, they're not and listed outside as multiple tags.
+    Not all <aff> have IDs.  For example:
+        <aff>St. Paul, Minnesota</aff>
+    """
+    outs = []
+    for aff_tag in front_tag.find_all('aff'):
+        if aff_tag.find('label'):                   # get rid of unused markers so `.text` is cleaner
+            aff_tag.find('label').decompose()
+        if aff_tag.find('sup'):
+            aff_tag.find('sup').decompose()         # same treatment as label
+        aff_id = aff_tag.get('id')
+        # it looks like we want to go to the full affiliation surface form without worrying about all possible handlings of <named-content> and other fields
+        # BUT, we do want to keep ISNI and GRID IDs when they occur.  They seem to occur typically within <institution-wrap>
+        # so let's handle those if they exist; safely decompose the tags (because they dont contribute to surface form); then grab remaining affiliation surface form
+        # implicit in this approach is that we dont need to actually handle <institution-wrap> tags because only one per affiliation
+        if len(aff_tag.find_all('institution-wrap')) > 1:
+            import pdb; pdb.set_trace()
+        id_type_to_id = {}
+        for institution_id_tag in aff_tag.find_all('institution-id'):
+            id_type_to_id[institution_id_tag['institution-id-type']] = institution_id_tag.text
+            institution_id_tag.decompose()
+        # TODO: processing of text:  there are a lot of random newline chars (cuz XML preserves page layout)
+        # --> replace them with whitespace if there's preceding punctuation char
+        # --> otherwise, replace them with comma
+        text = aff_tag.text
+        outs.append({
+            'id': aff_id,
+            'other_ids': id_type_to_id,
+            'text': text
+        })
+    return outs
+def parse_abstract_tag(front_tag, soup) -> List[Dict]:
+    """Not every paper has an abstract
+    Furthermore, note very abstract is structured into sections.
+    Some abstracts (see PMC1914226) look like:
+        <abstract>
+            <p> ... </p>
+            <p> ... </p>
+        </abstract>
+    """
+    # TODO: are there cases where <abstract> text <p> text </> </abstract> ?
+    abstract: List[Dict] = []
+    if front_tag.find('abstract'):
+        abstract_tag = front_tag.find('abstract').extract()
+        # replace all xref tags with string placeholders
+        replace_xref_with_string_placeholders(soup_tag=abstract_tag, soup=soup)
+        # replace all sup/sub tags with string placeholders
+        replace_sup_sub_tags_with_string_placeholders(soup_tag=abstract_tag, soup=soup)
+        if abstract_tag.find('sec'):
+            all_par_blobs = []
+            for sec_tag in abstract_tag.find_all('sec', recursive=False):
+                par_blobs = recurse_parse_section(sec_tag=sec_tag)
+                all_par_blobs.extend(par_blobs)
+        else:
+            all_par_blobs = parse_all_paragraphs_in_section(sec_tag=abstract_tag)
+            for par_blob in all_par_blobs:
+                # these 'sections' typically show up as empty string
+                par_blob['section'] = 'Abstract'
+                abstract.append(par_blob)
+    return abstract

s2orc-doc2json/doc2json/jats2json/pmc_utils/tests.py ADDED Viewed

	@@ -0,0 +1,347 @@

+# Fixture data for funding parsing: a list of (raw JATS <funding-group> XML snippet, expected parse) tuples.
+# Every expected value below is None -- presumably the expected dicts have not been filled in yet (TODO confirm against the code that consumes this list).
+funding_tags_and_parsed_dicts = [
+    # <funding-group> is typically the top-level tag
+    #
+    # within, we see <funding-source> and <funding-statement> as containing the main information we want
+    #
+    # here, we see <funding-source> with an 'id' attribute.  we can ignore these.
+    ("""<funding-group>
+            <award-group>
+                <funding-source id=\"CS200\">Wellcome Trust</funding-source>
+            </award-group>
+        </funding-group>""", None),
+    # sometimes, there are also <award-id> tags, but we can ignore these.  they're funding-group specific.
+    ("""<funding-group>
+            <award-group>
+                <funding-source id=\"sp1\">US Department of Energy's Office of Science, Biological and Environmental Research Program</funding-source>
+                <award-id rid=\"sp1\">DE-AC02-05CH11231</award-id>
+                <award-id rid=\"sp1\">DE-AC52-07NA27344</award-id>
+                <award-id rid=\"sp1\">DE-AC02-06NA25396</award-id>
+                <award-id rid=\"sp1\">DE-AC05-00OR22725</award-id>
+            </award-group>
+            <award-group>
+                <funding-source id=\"sp2\">German Research Foundation</funding-source>
+                <award-id rid=\"sp2\">INST 599/1-2</award-id>
+            </award-group>
+        </funding-group>""", None),
+    # <funding-statement> is a less structured alternative to <funding-source>
+    ("""<funding-group>
+            <funding-statement>No sources of funding were used to assist in the preparation of this study.</funding-statement>
+        </funding-group>""", None),
+    # Rarely, there is nesting!  ignore parents.
+    ("""<funding-group>
+            <funding-statement>
+                <funding-source>This work was supported by the Swedish Association for Sexuality Education (RFSU).</funding-source>
+            </funding-statement>
+        </funding-group>""", None),
+    # Sometimes both can occur, sort of duplicating the same information.
+    # For example "Cornell" is mentioned as both a <funding-source> and a <funding-statement>
+    ("""<funding-group>
+            <award-group>
+                <funding-source>
+                    <named-content content-type=\"funder-name\">Cornell University Institute for the Social Sciences</named-content>
+                </funding-source>
+            </award-group>
+            <funding-statement>The research was supported by a grant from the Cornell University Institute for the Social Sciences.</funding-statement>
+        </funding-group>""", None),
+    # many <funding-source>
+    ("""<funding-group>
+            <award-group id=\"sp1\">
+                <funding-source>Brien Holden Vision Institute</funding-source>
+            </award-group>
+            <award-group id=\"sp2\">
+                <funding-source>Australian Federal Government</funding-source>
+            </award-group>
+            <award-group id=\"sp3\">
+                <funding-source>International Postgraduate Research Scholarship (Cathleen Fedtke)</funding-source>
+            </award-group>
+            <award-group id=\"sp4\">
+                <funding-source>University of New South Wales, Australia</funding-source>
+            </award-group>
+            <award-group id=\"sp5\">
+                <funding-source>National Institutes of Health</funding-source>
+                <award-id>P30EY14801</award-id>
+            </award-group>
+            <award-group id=\"sp6\">
+                <funding-source>Florida Lions Eye Bank</funding-source>
+            </award-group>
+            <award-group id=\"sp7\">
+                <funding-source>Bascom Palmer Eye Institute</funding-source>
+            </award-group>
+        </funding-group>""", None),
+    # institutions can optionally occur within <funding-source>
+    # 'institution-id-type' is common, but also optional
+    # regardless of the institution ID type, it looks like the ID is always a DOI (or URL to a DOI)
+    ("""<funding-group>
+            <award-group>
+                <funding-source>
+                    <institution-wrap>
+                        <institution-id institution-id-type=\"FundRef\">http://dx.doi.org/10.13039/100000025</institution-id>
+                        <institution>National Institute of Mental Health</institution>
+                    </institution-wrap>
+                </funding-source>
+            <award-id>R01MH107333</award-id>
+            <principal-award-recipient>
+                <name><surname>Kim</surname><given-names>Woong-Ki</given-names></name>
+            </principal-award-recipient>
+        </award-group>
+    </funding-group>""", None),
+    # same idea, but <institution-id> has no 'institution-id-type' attribute and follows <institution>
+    ("""<funding-group specific-use=\"FundRef\">
+            <award-group>
+                <funding-source>
+                    <institution-wrap>
+                        <institution>Deutsche Forschungsgemeinschaft</institution>
+                        <institution-id>http://search.crossref.org/fundref?q=501100001659</institution-id>
+                    </institution-wrap>
+                </funding-source>
+                <award-id>Re 628/16-1</award-id>
+                <award-id>GRK 1216</award-id>
+            </award-group>
+        </funding-group>""", None),
+    # here the institution ID is a bare DOI rather than a URL
+    ("""<funding-group>
+            <award-group id=\"funding-1\">
+                <funding-source>
+                    <institution-wrap>
+                        <institution>National Institutes of Health </institution>
+                        <institution-id institution-id-type=\"open-funder-registry\">10.13039/100000002</institution-id>
+                    </institution-wrap>
+                </funding-source>
+            </award-group>
+        </funding-group>""", None),
+    # handling <named-content>
+    ("""<funding-group>
+            <award-group>
+                <funding-source>
+                    <named-content content-type=\"funder-name\">Austrian Science Fund</named-content>
+                    <named-content content-type=\"funder-identifier\">10.13039/501100002428</named-content>
+                </funding-source>
+                <award-id>P 27625</award-id>
+            </award-group>
+            <funding-statement>This work was supported by Austrian Science Fund [grant number P 27625].</funding-statement>
+        </funding-group>""", None),
+    # handling xlink:href attributes
+    ("""<funding-group>
+            <award-group>
+                <funding-source xlink:href=\"http://dx.doi.org/10.13039/501100000269\">Economic and Social Research Council</funding-source>
+                <award-id>RES-360-25-0032</award-id>
+            </award-group>
+            <award-group>
+                <funding-source xlink:href=\"http://dx.doi.org/10.13039/100004440\">Wellcome Trust</funding-source>
+                <award-id>106542/Z/14/Z</award-id>
+            </award-group>
+        </funding-group>""", None)
+]
+# Fixture data for acknowledgement parsing: a list of (raw JATS <ack> XML snippet, expected parse) tuples.
+# Each expected dict has the keys 'text' (paragraph text with tags stripped), 'funding' (one {'text', 'id'} dict per
+# inline <funding-source>, else an empty list) and 'url' (the <ext-link> target, else None).
+acknowledgement_tags_and_parsed_dicts = [
+    # variants with <ack id> may/may not have a <title>. always have <p> but may/may not have <p id>.  <title> never has attributes.
+    # the <p> text might contain <funding-source> or <ext-link> tags.
+    #   the <ext-link> tags have required attributes 'ext-link-type' and 'xlink:href', and optional attribute 'id'.  all the <ext-links> are URLs.
+    ("""<ack id=\"ack0005\">
+            <title>Acknowledgements</title>
+            <p>The authors thank the <funding-source id=\"gs0005\">BBSRC</funding-source> (Project Grants BB/M025349/1 and BB/P011969/1) for its continued support, and appreciate the helpful comments of Dr Rob Young, Cardiff University School of Optometry and Vision Sciences.</p>
+        </ack>""", {
+            'text': 'The authors thank the BBSRC (Project Grants BB/M025349/1 and BB/P011969/1) for its continued support, and appreciate the helpful comments of Dr Rob Young, Cardiff University School of Optometry and Vision Sciences.',
+            'funding': [{'text': 'BBSRC', 'id': 'gs0005'}],
+            'url': None}),
+    ("""<ack id=\"S27\">
+            <p>Supported by AA-11431 and AA-12908 from the National Institutes of Health and the Tobacco-Related Disease Research Program Grant 17RT-0171.</p>
+        </ack>""", {
+            'text': 'Supported by AA-11431 and AA-12908 from the National Institutes of Health and the Tobacco-Related Disease Research Program Grant 17RT-0171.',
+            'funding': [],
+            'url': None}),
+    ("""<ack id=\"S11\">
+            <title>Acknowledgements</title>
+            <p id=\"P33\">This work was supported by the National Institutes of Health,National Cancer Institute grants R01CA196967 and R01CA209886.</p>
+        </ack>""", {
+            'text': 'This work was supported by the National Institutes of Health,National Cancer Institute grants R01CA196967 and R01CA209886.',
+            'funding': [],
+            'url': None}),
+    ("""<ack id=\"mee312535-sec-0015\">
+            <title>Data accessibility</title>
+            <p>The data used is included in the RepeatABEL package available at <ext-link ext-link-type=\"uri\" xlink:href=\"https://cran.r-project.org/web/packages/RepeatABEL\">https://cran.r-project.org/web/packages/RepeatABEL</ext-link>.</p>
+        </ack>""", {
+            'text': 'The data used is included in the RepeatABEL package available at https://cran.r-project.org/web/packages/RepeatABEL.',
+            'funding': [],
+            'url': 'https://cran.r-project.org/web/packages/RepeatABEL'}),
+    # variants with <ack> are similar to the above.
+    ("""<ack>
+            <title>Acknowledgments</title>
+            <p>D.B.K. thanks Prof. Nigel Harper for a very useful discussion. We also thank the referees and the journal editors for exceptionally careful and thoughtful reviews that helped improve the manuscript considerably.</p>
+        </ack>""", {
+            'text': 'D.B.K. thanks Prof. Nigel Harper for a very useful discussion. We also thank the referees and the journal editors for exceptionally careful and thoughtful reviews that helped improve the manuscript considerably.',
+            'funding': [],
+            'url': None}),
+    ("""<ack>
+            <title>Conflict of interest</title>
+            <p>The authors declare there is no conflict of interest associated with this manuscript.</p>
+        </ack>""", {
+            'text': 'The authors declare there is no conflict of interest associated with this manuscript.',
+            'funding': [],
+            'url': None})
+]
+# Fixture data for affiliation parsing: a list of (raw JATS <aff> XML snippet, expected parse) tuples.
+# Every expected value below is None -- presumably the expected dicts have not been filled in yet (TODO confirm against the code that consumes this list).
+affiliation_tags_and_parsed_dicts = [
+    # mix of <aff> tags with and without IDs
+    ("""<aff>Department of Internal Medicine, Division of Cardiology, Inha University Hospital, Incheon, South Korea</aff>""", None),
+    ("""<aff id=\"aff1\"><label>1</label>Department of Cardiology, Atatürk Chest Diseases and Chest Surgery Training and Research Hospital; Ankara-Turkey</aff>""", None),
+    # there can exist a <label> tag with/without IDs
+    ("""<aff><label>3</label>Center for Medical Education, Sapporo Medical University, <addr-line>Sapporo, Japan</addr-line></aff>""", None),
+    # sometimes, the marker used in paper is kept also.  for example, `1` in superscript.
+    # this can exist with/without the <label> tag.  as in, it's inconsistent whether the marker is encapsulated in <label> or kept as string
+    ("""<aff id=\"I1\">\n<sup>1</sup>Department of Orthodontics, College of Dentistry, King Khalid University, Abha, Saudi Arabia</aff>""", None),
+    ("""<aff id=\"hic312304-aff-0001\"><label><sup>1</sup></label><institution>University of Dundee</institution></aff>""", None),
+    # <institution> tags can be straightforward; just ignore and grab text
+    # NOTE(review): the snippet below has no closing </aff> tag -- unclear whether that is deliberate; confirm before relying on it
+    ("""<aff id=\"AF02477-1\"><label>1</label><institution>School of Chemistry, The University of Manchester, Manchester, United Kingdom</institution>""", None),
+    # sometimes <institution> tags can have SIBLING tags, like <addr-line> or <country>
+    ("""<aff id=\"aff002\"><label>2</label>Sr. Consultant &amp; Head, Dept. of Neurology, <institution>National Neurosciences Centre, Peerless Hospital</institution>, <addr-line>Kolkata, India</addr-line></aff>""", None),
+    ("""<aff id=\"aff2\"><label><sup>2</sup></label>Institute for Transplantation Diagnostics and Cell Therapeutics, <institution>Heinrich Heine University Düsseldorf</institution>, Düsseldorf, <country>Germany</country>.</aff>""", None),
+    # <named-content> is also a common CHILD tag; these can be either entirely structured affiliation entries (not intended for tag.text)
+    ("""<aff id=\"embr201642857-aff-0007\">
+            <label><sup>7</sup></label>
+            <institution>VIB</institution>
+            <named-content content-type=\"city\">Zwijnaarde</named-content>
+            <country country=\"BE\">Belgium</country>
+        </aff>""", None),
+    # or overlaid on a single affiliation string (comma-separated if you call tag.text)
+    ("""<aff id=\"AFF0005\">
+            <label><sup>e</sup></label>
+            <institution>
+                <named-content content-type=\"department\">School of Public Health &amp; Health Systems</named-content>, <named-content content-type=\"institution-name\">University of Waterloo</named-content>
+            </institution>
+        </aff>""", None),
+    # example of a nonsense one that has TWO <named-content> tags, whitespaces, the <sup> tag WITHIN <label>
+    ("""<aff id=\"ejn14074-aff-0007\">\n
+        <label><sup>7</sup></label>\n
+        <named-content content-type=\"organisation-division\">Brain Research Institute</named-content>\n
+        <institution>University of Zürich</institution>\n
+        <named-content content-type=\"city\">Zürich</named-content>\n
+        <country country=\"CH\">Switzerland</country>\n</aff>""", None),
+    # most common content-type within <named-content> are: 'department', 'organisation-division', 'city', 'institution-name', 'postal-code', 'country-part', etc.
+    # <institution-wrap> is the other popular way to surface <institution> tags.
+    # They seem to always come with 1+ <institution-id> as children.
+    # finally, these wrappers can wrap multiple <institution> tags.
+    # in this example, see how the COMMA is awkwardly encapsulated within <institution> tags?  Also, notice how the country is untagged outside of <institution-wrap>
+    # basically, everything is weird.
+    ("""<aff id=\"Aff10\">
+        <label>10</label>
+        <institution-wrap>
+            <institution-id institution-id-type=\"ISNI\">0000000123222966</institution-id>
+            <institution-id institution-id-type=\"GRID\">grid.6936.a</institution-id>
+            <institution>Institute of Experimental Genetics, Life and Food Science Center Weihenstephan, </institution>
+            <institution>Technische Universität München, </institution>
+        </institution-wrap>Freising-Weihenstephan, Germany </aff>""", None)
+]
+# Fixture data for author parsing: a list of (raw JATS <contrib> XML snippet, expected parse) tuples.
+# Every expected value below is None -- presumably the expected dicts have not been filled in yet (TODO confirm against the code that consumes this list).
+author_tags_and_parsed_dicts = [
+    # every author seems to be in a <contrib> tag.
+    # all <contrib> tags seem to have a 'contrib-type' attribute, which often equals 'author' and sometimes equals 'collab'
+    # below is an 'author' that has <name>, <address>, and <bio> child tags.  Also XREF to affiliation (can have multiple).
+    ("""<contrib contrib-type=\"author\">
+            <name><surname>Sandström</surname><given-names>Annica</given-names></name>
+            <address><email>annica.sandstrom@ltu.se</email></address>
+            <xref ref-type=\"aff\" rid=\"Aff2\"/>
+            <bio><sec id=\"d30e226\"><title>Annica Sandström</title><p>is an Associate Professor in Political Science at Luleå University of Technology. Working foremost within the field of environmental policy and management, her publications include empirical studies on the socio-political complexities of natural resource governance as well as theory-driven pieces on collaborative management, adaptive management, and policy networks.</p></sec></bio>
+        </contrib>""", None),
+    ("""<contrib contrib-type="author">
+            <name><surname>Cassidy</surname><given-names>John W.</given-names></name>
+            <xref ref-type="aff" rid="A1">1</xref>
+            <xref ref-type="aff" rid="A2">2</xref>
+        </contrib>""", None),
+    # below is an 'author' that contains a <collab> child tag.  We can see sometimes there's other tags like an XREF to affiliation which can probably be .decomposed()
+    ("""<contrib contrib-type=\"author\">
+            <collab>The HIV Neurobehavioral Research Programs (HNRP) Group</collab>
+        </contrib>""", None),
+    ("""<contrib contrib-type=\"author\">
+            <collab>JET EFDA contributors</collab>
+            <xref ref-type=\"aff\" rid=\"aff1\">a</xref><xref ref-type=\"fn\" rid=\"fn3\">3</xref>
+        </contrib>""", None),
+    # below is a 'collab' that also contains nested <contrib> tags wrapped by <contrib-group>.  Yikes!
+    # luckily, it seems <contrib-group> is rare and always nested within an ultimate parent <contrib>
+    # --> these are more like affiliations
+    ("""<contrib contrib-type=\"collab\">
+            <collab>UK Biobank Eye and Vision Consortium\n
+                <contrib-group>
+                    <contrib contrib-type=\"collab\">
+                        <name><surname>Aslam</surname><given-names>Tariq</given-names></name>
+                    </contrib>
+                    <contrib contrib-type=\"collab\">
+                        <name><surname>Bishop</surname><given-names>Paul</given-names></name>
+                    </contrib>
+                    <contrib contrib-type=\"collab\">
+                        <name><surname>Barman</surname><given-names>Sarah</given-names></name>
+                    </contrib>
+                </contrib-group>
+            </collab>
+        </contrib>
+    """, None),
+    ("""<contrib contrib-type="author">
+            <collab>WERF EPHect Working Group
+                <contrib-group>
+                    <contrib contrib-type="author"><name><surname>Adamson</surname><given-names>G.D.</given-names></name></contrib>
+                    <contrib contrib-type="author"><name><surname>Allaire</surname><given-names>C.</given-names></name></contrib>
+                </contrib-group>
+            </collab>
+        </contrib>""", None),
+    # there are optional <aff> tags instead of an <xref ref-type=\"aff\">
+    ("""<contrib contrib-type=\"author\">
+            <name><surname>Beedle</surname><given-names>Aaron M</given-names></name>
+            <aff id=\"A1\">Department of Pharmaceutical and Biomedical Sciences, University of Georgia College of Pharmacy, Athens, GA 30602 USA</aff>
+        </contrib>""", None),
+    # corresponding authors are indicated in two ways: (i) within <contrib> as a 'corresp=yes' attribute, (ii) within <xref> as a 'ref-type=corresp' attribute
+    ("""<contrib contrib-type=\"author\" corresp=\"yes\">
+            <name><surname>Kim</surname><given-names>Woong-Ki</given-names></name>
+            <address><email>kimw@evms.edu</email></address>
+            <xref ref-type=\"aff\" rid=\"Aff1\">1</xref>
+        </contrib>""", None),
+    # note the <email> below sits inside an XML comment, so it is not a real child tag
+    ("""<contrib contrib-type=\"author\">
+            <name><surname>Suero Molina</surname><given-names>Eric</given-names></name>
+            <degrees>MD, MBA</degrees>
+            <!--<email>eric.suero@ukmuenster.de</email>-->
+            <xref ref-type=\"aff\" rid=\"aff1\"/>
+            <xref ref-type=\"corresp\" rid=\"cor1\"/>
+        </contrib>""", None),
+    # note that contrib-type 'editor' is also present, and seems to accompany <role> tag and 'corresp=no' attribute
+    ("""<contrib contrib-type=\"editor\" corresp=\"no\">
+            <name><surname>Greene</surname><given-names>Robert L.</given-names></name>
+            <role>Editor</role>
+        </contrib>""", None),
+    # within <contrib> are optional child tags <contrib-id>
+    # the 'contrib-id-type' seems to always be 'orcid'
+    # authentication seems optional
+    ("""<contrib contrib-type=\"author\" corresp=\"yes\">
+            <contrib-id authenticated=\"false\" contrib-id-type=\"orcid\">https://orcid.org/0000-0002-9987-6824</contrib-id>
+            <name><surname>Sandeepa</surname><given-names>N. C.</given-names></name>
+            <email>drsandeepanc@gmail.com</email>
+            <xref ref-type=\"aff\" rid=\"I2\">\n<sup>2</sup>\n</xref>
+        </contrib>""", None),
+    ("""<contrib contrib-type=\"author\" corresp=\"yes\">
+            <contrib-id contrib-id-type=\"orcid\">http://orcid.org/0000-0003-1079-4775</contrib-id>
+            <name><surname>West</surname><given-names>Ann H.</given-names></name>
+            <address><email>awest@ou.edu</email></address>
+            <xref ref-type=\"aff\" rid=\"Aff1\">1</xref>
+        </contrib>""", None),
+    # more edge cases; a <contrib> tag with no <name> --> probably just remove
+    ("""<contrib contrib-type="author">
+            <collab>on behalf of the National Advisory Committee on Blood and Blood Products
+                <xref ref-type="author-notes" rid="fn1">*</xref>
+            </collab>
+        </contrib>""", None),
+]

s2orc-doc2json/doc2json/jats2json/process_jats.py ADDED Viewed

	@@ -0,0 +1,104 @@

+import os
+import json
+import argparse
+import time
+from typing import Optional
+from doc2json.jats2json.jats_to_json import convert_jats_xml_to_s2orc_json
+# default directories, relative to the current working directory
+# default `temp_dir` of process_jats_stream
+BASE_TEMP_DIR = 'temp'
+# default `output_dir` of process_jats_file
+BASE_OUTPUT_DIR = 'output'
+# default `log_dir` of process_jats_file
+BASE_LOG_DIR = 'log'
+def process_jats_stream(
+        fname: str,
+        stream: bytes,
+        temp_dir: str=BASE_TEMP_DIR
+):
+    """
+    Process a jats file stream
+    :param fname:
+    :param stream:
+    :param temp_dir:
+    :return:
+    """
+    temp_input_dir = os.path.join(temp_dir, 'input')
+    temp_input_file = os.path.join(temp_input_dir, fname)
+    os.makedirs(temp_dir, exist_ok=True)
+    os.makedirs(temp_input_dir, exist_ok=True)
+    with open(temp_input_file, 'wb') as outf:
+        outf.write(stream)
+    output_file = process_jats_file(temp_input_file)
+    if os.path.exists(output_file):
+        with open(output_file, 'r') as f:
+            contents = json.load(f)
+            return contents
+    else:
+        return []
+def process_jats_file(
+        jats_file: str,
+        output_dir: str=BASE_OUTPUT_DIR,
+        log_dir: str=BASE_LOG_DIR,
+) -> Optional[str]:
+    """
+    Process files in a JATS XML file and get JSON representation
+    :param jats_file:
+    :param output_dir:
+    :param log_dir:
+    :return:
+    """
+    # create directories
+    os.makedirs(output_dir, exist_ok=True)
+    os.makedirs(log_dir, exist_ok=True)
+    # get paper id as the name of the file
+    paper_id = os.path.splitext(jats_file)[0].split('/')[-1]
+    output_file = os.path.join(output_dir, f'{paper_id}.json')
+    # check if input file exists and output file doesn't
+    if not os.path.exists(jats_file):
+        raise FileNotFoundError(f"{jats_file} doesn't exist")
+    if os.path.exists(output_file):
+        print(f'{output_file} already exists!')
+    # convert to S2ORC
+    paper = convert_jats_xml_to_s2orc_json(jats_file, log_dir)
+    # write to file
+    with open(output_file, 'w') as outf:
+        json.dump(paper.release_json("jats"), outf, indent=4, sort_keys=False)
+    return output_file
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description="Run S2ORC JATS2JSON")
+    parser.add_argument("-i", "--input", default=None, help="path to the input JATS XML file")
+    parser.add_argument("-o", "--output", default='output', help="path to the output dir for putting json files")
+    parser.add_argument("-l", "--log", default='log', help="path to the log dir")
+    parser.add_argument("-k", "--keep", default=False, help="keep temporary files")
+    args = parser.parse_args()
+    input_path = args.input
+    output_path = args.output
+    log_path = args.log
+    keep_temp = args.keep
+    start_time = time.time()
+    os.makedirs(output_path, exist_ok=True)
+    process_jats_file(input_path, output_path, log_path, keep_temp)
+    runtime = round(time.time() - start_time, 3)
+    print("runtime: %s seconds " % (runtime))
+    print('done.')

s2orc-doc2json/doc2json/s2orc.py ADDED Viewed

	@@ -0,0 +1,527 @@

+"""
+S2ORC classes
+"""
+from datetime import datetime
+from typing import Dict, List, Optional
+from doc2json.config import *
+# key renames (old name -> new name); NOTE(review): presumably applied to incoming dicts before
+# they are passed to the classes below -- the code that uses this mapping is not in this part of the file
+CORRECT_KEYS = {
+    "issn": "issue",
+    "type": "type_str"
+}
+# keys to leave out; NOTE(review): where these are dropped is not visible here -- confirm against the caller
+SKIP_KEYS = {
+    'link',
+    'bib_id'
+}
+# for each known ReferenceEntry.type_str, the attribute names that ReferenceEntry.as_json emits
+REFERENCE_OUTPUT_KEYS = {
+    'figure': {'text', 'type_str', 'uris', 'num'},
+    'table': {'text', 'type_str', 'content', 'num', 'html'},
+    'footnote': {'text', 'type_str', 'num'},
+    'section': {'text', 'type_str', 'num', 'parent'},
+    'equation': {'text', 'type_str', 'latex', 'mathml', 'num'}
+}
+# these names match the constructor parameters of Metadata;
+# NOTE(review): presumably used to filter dicts before building a Metadata -- confirm against the caller
+METADATA_KEYS = {
+    "title", "authors", "year", "venue", "identifiers"
+}
+class ReferenceEntry:
+    """
+    Class for representing S2ORC figure and table references
+    An example json representation (values are examples, not accurate):
+    {
+      "FIGREF0": {
+        "text": "FIG. 2. Depth profiles of...",
+        "latex": null,
+        "type": "figure"
+      },
+      "TABREF2": {
+        "text": "Diversity indices of...",
+        "latex": null,
+        "type": "table",
+        "content": "",
+        "html": ""
+      }
+    }
+    """
+    def __init__(
+            self,
+            ref_id: str,
+            text: str,
+            type_str: str,
+            latex: Optional[str] = None,
+            mathml: Optional[str] = None,
+            content: Optional[str] = None,
+            html: Optional[str] = None,
+            uris: Optional[List[str]] = None,
+            num: Optional[str] = None,
+            parent: Optional[str] = None
+    ):
+        self.ref_id = ref_id
+        self.text = text
+        self.type_str = type_str
+        self.latex = latex
+        self.mathml = mathml
+        self.content = content
+        self.html = html
+        self.uris = uris
+        self.num = num
+        self.parent = parent
+    def as_json(self):
+        keep_keys = REFERENCE_OUTPUT_KEYS.get(self.type_str, None)
+        if keep_keys:
+            return {
+                k: self.__getattribute__(k) for k in keep_keys
+            }
+        else:
+            return {
+                "text": self.text,
+                "type": self.type_str,
+                "latex": self.latex,
+                "mathml": self.mathml,
+                "content": self.content,
+                "html": self.html,
+                "uris": self.uris,
+                "num": self.num,
+                "parent": self.parent
+            }
+class BibliographyEntry:
+    """
+    Class for representing S2ORC parsed bibliography entries
+    An example json representation (values are examples, not accurate):
+    {
+        "title": "Mobility Reports...",
+        "authors": [
+            {
+                "first": "A",
+                "middle": ["A"],
+                "last": "Haija",
+                "suffix": ""
+            }
+        ],
+        "year": 2015,
+        "venue": "IEEE Wireless Commun. Mag",
+        "volume": "42",
+        "issn": "9",
+        "pages": "80--92",
+        "other_ids": {
+            "doi": [
+                "10.1109/TWC.2014.2360196"
+            ],
+        }
+    }
+    """
+    def __init__(
+            self,
+            bib_id: str,
+            title: str,
+            authors: List[Dict[str, str]],
+            ref_id: Optional[str] = None,
+            year: Optional[int] = None,
+            venue: Optional[str] = None,
+            volume: Optional[str] = None,
+            issue: Optional[str] = None,
+            pages: Optional[str] = None,
+            other_ids: Dict[str, List] = None,
+            num: Optional[int] = None,
+            urls: Optional[List] = None,
+            raw_text: Optional[str] = None,
+            links: Optional[List] = None
+    ):
+        self.bib_id = bib_id
+        self.ref_id = ref_id
+        self.title = title
+        self.authors = authors
+        self.year = year
+        self.venue = venue
+        self.volume = volume
+        self.issue = issue
+        self.pages = pages
+        self.other_ids = other_ids
+        self.num = num
+        self.urls = urls
+        self.raw_text = raw_text
+        self.links = links
+    def as_json(self):
+        return {
+            "ref_id": self.ref_id,
+            "title": self.title,
+            "authors": self.authors,
+            "year": self.year,
+            "venue": self.venue,
+            "volume": self.volume,
+            "issue": self.issue,
+            "pages": self.pages,
+            "other_ids": self.other_ids,
+            "num": self.num,
+            "urls": self.urls,
+            "raw_text": self.raw_text,
+            "links": self.links
+        }
+class Affiliation:
+    """
+    Class for representing affiliation info
+    Example:
+        {
+            "laboratory": "Key Laboratory of Urban Environment and Health",
+            "institution": "Chinese Academy of Sciences",
+            "location": {
+              "postCode": "361021",
+              "settlement": "Xiamen",
+              "country": "People's Republic of China"
+        }
+    """
+    def __init__(
+            self,
+            laboratory: str,
+            institution: str,
+            location: Dict
+    ):
+        self.laboratory = laboratory
+        self.institution = institution
+        self.location = location
+    def as_json(self):
+        return {
+            "laboratory": self.laboratory,
+            "institution": self.institution,
+            "location": self.location
+        }
+class Author:
+    """
+    Class for representing paper authors
+    Example:
+        {
+          "first": "Anyi",
+          "middle": [],
+          "last": "Hu",
+          "suffix": "",
+          "affiliation": {
+            "laboratory": "Key Laboratory of Urban Environment and Health",
+            "institution": "Chinese Academy of Sciences",
+            "location": {
+              "postCode": "361021",
+              "settlement": "Xiamen",
+              "country": "People's Republic of China"
+            }
+          },
+          "email": ""
+        }
+    """
+    def __init__(
+            self,
+            first: str,
+            middle: List[str],
+            last: str,
+            suffix: str,
+            affiliation: Optional[Dict] = None,
+            email: Optional[str] = None
+    ):
+        self.first = first
+        self.middle = middle
+        self.last = last
+        self.suffix = suffix
+        self.affiliation = Affiliation(**affiliation) if affiliation else {}
+        self.email = email
+    def as_json(self):
+        return {
+            "first": self.first,
+            "middle": self.middle,
+            "last": self.last,
+            "suffix": self.suffix,
+            "affiliation": self.affiliation.as_json() if self.affiliation else {},
+            "email": self.email
+        }
+class Metadata:
+    """
+    Class for representing paper metadata
+    Example:
+    {
+      "title": "Niche Partitioning...",
+      "authors": [
+        {
+          "first": "Anyi",
+          "middle": [],
+          "last": "Hu",
+          "suffix": "",
+          "affiliation": {
+            "laboratory": "Key Laboratory of Urban Environment and Health",
+            "institution": "Chinese Academy of Sciences",
+            "location": {
+              "postCode": "361021",
+              "settlement": "Xiamen",
+              "country": "People's Republic of China"
+            }
+          },
+          "email": ""
+        }
+      ],
+      "year": "2011-11"
+    }
+    """
+    def __init__(
+            self,
+            title: str,
+            authors: List[Dict],
+            year: Optional[str] = None,
+            venue: Optional[str] = None,
+            identifiers: Optional[Dict] = {}
+    ):
+        self.title = title
+        self.authors = [Author(**author) for author in authors]
+        self.year = year
+        self.venue = venue
+        self.identifiers = identifiers
+    def as_json(self):
+        return {
+            "title": self.title,
+            "authors": [author.as_json() for author in self.authors],
+            "year": self.year,
+            "venue": self.venue,
+            "identifiers": self.identifiers
+        }
+class Paragraph:
+    """
+    Class for representing a parsed paragraph from Grobid xml
+    All xml tags are removed from the paragraph text, all figures, equations, and tables are replaced
+    with a special token that maps to a reference identifier
+    Citation mention spans and section header are extracted
+    An example json representation (values are examples, not accurate):
+    {
+        "text": "Formal language techniques BID1 may be used to study FORMULA0 (see REF0)...",
+        "mention_spans": [
+            {
+                "start": 27,
+                "end": 31,
+                "text": "[1]")
+        ],
+        "ref_spans": [
+            {
+                "start": ,
+                "end": ,
+                "text": "Fig. 1"
+            }
+        ],
+        "eq_spans": [
+            {
+                "start": 53,
+                "end": 61,
+                "text": "α = 1",
+                "latex": "\\alpha = 1",
+                "ref_id": null
+            }
+        ],
+        "section": "Abstract"
+    }
+    """
+    def __init__(
+            self,
+            text: str,
+            cite_spans: List[Dict],
+            ref_spans: List[Dict],
+            eq_spans: Optional[List[Dict]] = [],
+            section: Optional = None,
+            sec_num: Optional = None
+    ):
+        self.text = text
+        self.cite_spans = cite_spans
+        self.ref_spans = ref_spans
+        self.eq_spans = eq_spans
+        if type(section) == str:
+            if section:
+                sec_parts = section.split('::')
+                section_list = [[None, sec_name] for sec_name in sec_parts]
+            else:
+                section_list = None
+            if section_list and sec_num:
+                section_list[-1][0] = sec_num
+        else:
+            section_list = section
+        self.section = section_list
+    def as_json(self):
+        return {
+            "text": self.text,
+            "cite_spans": self.cite_spans,
+            "ref_spans": self.ref_spans,
+            "eq_spans": self.eq_spans,
+            "section": '::'.join([sec[1] for sec in self.section]) if self.section else "",
+            "sec_num": self.section[-1][0] if self.section else None
+        }
+class Paper:
+    """
+    Class for representing a parsed S2ORC paper
+    """
+    def __init__(
+            self,
+            paper_id: str,
+            pdf_hash: str,
+            metadata: Dict,
+            abstract: List[Dict],
+            body_text: List[Dict],
+            back_matter: List[Dict],
+            bib_entries: Dict,
+            ref_entries: Dict
+        ):
+        self.paper_id = paper_id
+        self.pdf_hash = pdf_hash
+        self.metadata = Metadata(**metadata)
+        self.abstract = [Paragraph(**para) for para in abstract]
+        self.body_text = [Paragraph(**para) for para in body_text]
+        self.back_matter = [Paragraph(**para) for para in back_matter]
+        self.bib_entries = [
+            BibliographyEntry(
+                bib_id=key,
+                **{CORRECT_KEYS[k] if k in CORRECT_KEYS else k: v for k, v in bib.items() if k not in SKIP_KEYS}
+            ) for key, bib in bib_entries.items()
+        ]
+        self.ref_entries = [
+            ReferenceEntry(
+                ref_id=key,
+                **{CORRECT_KEYS[k] if k in CORRECT_KEYS else k: v for k, v in ref.items() if k != 'ref_id'}
+            ) for key, ref in ref_entries.items()
+        ]
+    def as_json(self):
+        return {
+            "paper_id": self.paper_id,
+            "pdf_hash": self.pdf_hash,
+            "metadata": self.metadata.as_json(),
+            "abstract": [para.as_json() for para in self.abstract],
+            "body_text": [para.as_json() for para in self.body_text],
+            "back_matter": [para.as_json() for para in self.back_matter],
+            "bib_entries": {bib.bib_id: bib.as_json() for bib in self.bib_entries},
+            "ref_entries": {ref.ref_id: ref.as_json() for ref in self.ref_entries}
+        }
+    @property
+    def raw_abstract_text(self) -> str:
+        """
+        Get all the body text joined by a newline
+        :return:
+        """
+        return '\n'.join([para.text for para in self.abstract])
+    @property
+    def raw_body_text(self) -> str:
+        """
+        Get all the body text joined by a newline
+        :return:
+        """
+        return '\n'.join([para.text for para in self.body_text])
+    def release_json(self, doc_type: str="pdf"):
+        """
+        Return in release JSON format
+        :return:
+        """
+        # TODO: not fully implemented; metadata format is not right; extra keys in some places
+        release_dict = {"paper_id": self.paper_id}
+        release_dict.update({"header": {
+            "generated_with": f'{S2ORC_NAME_STRING} {S2ORC_VERSION_STRING}',
+            "date_generated": datetime.now().strftime('%Y-%m-%dT%H:%M:%S.%fZ')
+        }})
+        release_dict.update(self.metadata.as_json())
+        release_dict.update({"abstract": self.raw_abstract_text})
+        release_dict.update({
+            f"{doc_type}_parse": {
+                "paper_id": self.paper_id,
+                "_pdf_hash": self.pdf_hash,
+                "abstract": [para.as_json() for para in self.abstract],
+                "body_text": [para.as_json() for para in self.body_text],
+                "back_matter": [para.as_json() for para in self.back_matter],
+                "bib_entries": {bib.bib_id: bib.as_json() for bib in self.bib_entries},
+                "ref_entries": {ref.ref_id: ref.as_json() for ref in self.ref_entries}
+            }
+        })
+        return release_dict
+def load_s2orc(paper_dict: Dict) -> Paper:
+    """
+    Load release S2ORC into Paper class
+    :param paper_dict:
+    :return:
+    """
+    paper_id = paper_dict['paper_id']
+    pdf_hash = paper_dict.get('_pdf_hash', paper_dict.get('s2_pdf_hash', None))
+    # 2019 gorc parses
+    if "grobid_parse" in paper_dict and paper_dict.get("grobid_parse"):
+        metadata = {k: v for k, v in paper_dict["metadata"].items() if k in METADATA_KEYS}
+        abstract = paper_dict.get("grobid_parse").get("abstract", [])
+        body_text = paper_dict.get("grobid_parse").get("body_text", [])
+        back_matter = paper_dict.get("grobid_parse").get("back_matter", [])
+        bib_entries = paper_dict.get("grobid_parse").get("bib_entries", {})
+        for k, v in bib_entries.items():
+            if 'link' in v:
+                v['links'] = [v['link']]
+        ref_entries = paper_dict.get("grobid_parse").get("ref_entries", {})
+    # current and 2020 s2orc release_json
+    elif ("pdf_parse" in paper_dict and paper_dict.get("pdf_parse")) or ("body_text" in paper_dict and paper_dict.get("body_text")):
+        if "pdf_parse" in paper_dict:
+            paper_dict = paper_dict["pdf_parse"]
+        if paper_dict.get("metadata"):
+            metadata = {k: v for k, v in paper_dict.get("metadata").items() if k in METADATA_KEYS}
+        # 2020 s2orc releases (metadata is separate)
+        else:
+            metadata = {
+                "title": None,
+                "authors": [],
+                "year": None
+            }
+        abstract = paper_dict.get("abstract", [])
+        body_text = paper_dict.get("body_text", [])
+        back_matter = paper_dict.get("back_matter", [])
+        bib_entries = paper_dict.get("bib_entries", {})
+        for k, v in bib_entries.items():
+            if 'link' in v:
+                v['links'] = [v['link']]
+        ref_entries = paper_dict.get("ref_entries", {})
+    else:
+        print(paper_id)
+        raise NotImplementedError("Unknown S2ORC file type!")
+    return Paper(
+        paper_id=paper_id,
+        pdf_hash=pdf_hash,
+        metadata=metadata,
+        abstract=abstract,
+        body_text=body_text,
+        back_matter=back_matter,
+        bib_entries=bib_entries,
+        ref_entries=ref_entries
+    )

s2orc-doc2json/doc2json/spp2json/__init__.py ADDED Viewed

File without changes

s2orc-doc2json/doc2json/spp2json/process_pdf.py ADDED Viewed

	@@ -0,0 +1,72 @@

+import os
+import json
+import argparse
+import time
+from typing import Dict
+from doc2json.spp2json.spp.spp_client import SppClient
+from doc2json.spp2json.spp.spp_json_to_s2orc_json import convert_spp_json_to_s2orc_json
+def process_pdf_file(input_file: str, temp_dir: str, output_dir: str) -> str:
+    """
+    Process a PDF file and get JSON representation
+    :param input_file:
+    :param temp_dir:
+    :param output_dir:
+    :return:
+    """
+    # get paper id as the name of the file
+    paper_id = '.'.join(input_file.split('/')[-1].split('.')[:-1])
+    spp_json_file = os.path.join(temp_dir, f'{paper_id}.json')
+    output_file = os.path.join(output_dir, f'{paper_id}.json')
+    # check if input file exists and output file doesn't
+    if not os.path.exists(input_file):
+        raise FileNotFoundError(f"{input_file} doesn't exist")
+    if os.path.exists(output_file):
+        raise Warning(f'{output_file} already exists!')
+    # process PDF through SPP -> SPP JSON
+    client = SppClient()
+    # TODO: compute PDF hash
+    client.process(input_file, temp_dir)
+    # process SPP JSON -> S2ORC JSON
+    assert os.path.exists(spp_json_file)
+    with open(spp_json_file, 'r') as f_in:
+        spp_json = json.load(f_in)
+    paper = convert_spp_json_to_s2orc_json(spp_json=spp_json)
+    # write to file
+    with open(output_file, 'w') as outf:
+        json.dump(paper.release_json(), outf, indent=4, sort_keys=False)
+    return output_file
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description="Run S2ORC PDF2JSON")
+    parser.add_argument("-i", "--input", default=None, help="path to the input PDF file")
+    parser.add_argument("-t", "--temp", default='temp/', help="path to the temp dir for putting tei xml files")
+    parser.add_argument("-o", "--output", default='output/', help="path to the output dir for putting json files")
+    parser.add_argument("-k", "--keep", action='store_true')
+    args = parser.parse_args()
+    input_path = args.input
+    temp_path = args.temp
+    output_path = args.output
+    keep_temp = args.keep
+    start_time = time.time()
+    os.makedirs(temp_path, exist_ok=True)
+    os.makedirs(output_path, exist_ok=True)
+    process_pdf_file(input_path, temp_path, output_path)
+    runtime = round(time.time() - start_time, 3)
+    print("runtime: %s seconds " % (runtime))
+    print('done.')

s2orc-doc2json/doc2json/spp2json/spp/__init__.py ADDED Viewed

File without changes

s2orc-doc2json/doc2json/spp2json/spp/spp_client.py ADDED Viewed

	@@ -0,0 +1,32 @@

+import os
+import io
+import json
+import argparse
+import time
+import glob
+import ntpath
+from typing import List
+class SppClient:
+    # Placeholder client for ScienceParsePlus (SPP); no processing is implemented yet
+    def process(self, input: str, output: str):
+        # always raises: callers cannot obtain any SPP output from this stub
+        raise NotImplementedError
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Client for ScienceParsePlus (SPP) services")
+    parser.add_argument("--input", default=None, help="path to the directory containing PDF to process")
+    parser.add_argument("--output", default=None, help="path to the directory where to put the results")
+    args = parser.parse_args()
+    input_path = args.input
+    output_path = args.output
+    client = SppClient()
+    start_time = time.time()
+    client.process(input_path, output_path)
+    runtime = round(time.time() - start_time, 3)
+    print("runtime: %s seconds " % (runtime))

s2orc-doc2json/doc2json/spp2json/spp/spp_json_to_s2orc_json.py ADDED Viewed

	@@ -0,0 +1,7 @@

+from typing import *
+from doc2json.s2orc import Paper
+def convert_spp_json_to_s2orc_json(spp_json: Dict) -> Paper:
+    # Placeholder: the SPP JSON -> Paper conversion is not implemented and always raises
+    raise NotImplementedError

s2orc-doc2json/doc2json/tex2json/__init__.py ADDED Viewed

File without changes

s2orc-doc2json/doc2json/tex2json/process_tex.py ADDED Viewed

	@@ -0,0 +1,127 @@

+import os
+import json
+import argparse
+import time
+from typing import Optional, Dict
+from doc2json.tex2json.tex_to_xml import convert_latex_to_s2orc_json
+from doc2json.tex2json.xml_to_json import convert_latex_xml_to_s2orc_json
+# default working directories (relative paths), used as parameter defaults below
+BASE_TEMP_DIR = 'temp'
+BASE_OUTPUT_DIR = 'output'
+BASE_LOG_DIR = 'log'
+def process_tex_stream(
+        fname: str,
+        stream: bytes,
+        temp_dir: str=BASE_TEMP_DIR,
+        keep_flag: bool=False,
+        grobid_config: Optional[Dict] = None
+):
+    """
+    Process a gz file stream
+    :param fname:
+    :param stream:
+    :param temp_dir:
+    :param keep_flag:
+    :param grobid_config:
+    :return:
+    """
+    temp_input_dir = os.path.join(temp_dir, 'input')
+    temp_input_file = os.path.join(temp_input_dir, fname)
+    os.makedirs(temp_dir, exist_ok=True)
+    os.makedirs(temp_input_dir, exist_ok=True)
+    with open(temp_input_file, 'wb') as outf:
+        outf.write(stream)
+    output_file = process_tex_file(
+        temp_input_file, temp_dir=temp_dir, keep_flag=keep_flag, grobid_config=grobid_config
+    )
+    if os.path.exists(output_file):
+        with open(output_file, 'r') as f:
+            contents = json.load(f)
+            return contents
+    else:
+        return []
+def process_tex_file(
+        input_file: str,
+        temp_dir: str=BASE_TEMP_DIR,
+        output_dir: str=BASE_OUTPUT_DIR,
+        log_dir: str=BASE_LOG_DIR,
+        keep_flag: bool=False,
+        grobid_config: Optional[Dict]=None
+) -> Optional[str]:
+    """
+    Process files in a TEX zip and get JSON representation
+    :param input_file:
+    :param temp_dir:
+    :param output_dir:
+    :param log_dir:
+    :param keep_flag:
+    :param grobid_config:
+    :return:
+    """
+    # create directories
+    os.makedirs(temp_dir, exist_ok=True)
+    os.makedirs(output_dir, exist_ok=True)
+    os.makedirs(log_dir, exist_ok=True)
+    # get paper id as the name of the file
+    paper_id = os.path.splitext(input_file)[0].split('/')[-1]
+    output_file = os.path.join(output_dir, f'{paper_id}.json')
+    cleanup_flag = not keep_flag
+    # check if input file exists and output file doesn't
+    if not os.path.exists(input_file):
+        raise FileNotFoundError(f"{input_file} doesn't exist")
+    if os.path.exists(output_file):
+        print(f'{output_file} already exists!')
+    # process LaTeX
+    xml_file = convert_latex_to_s2orc_json(input_file, temp_dir, cleanup_flag)
+    if not xml_file:
+        return None
+    # convert to S2ORC
+    paper = convert_latex_xml_to_s2orc_json(xml_file, log_dir, grobid_config=grobid_config)
+    # write to file
+    with open(output_file, 'w') as outf:
+        json.dump(paper.release_json("latex"), outf, indent=4, sort_keys=False)
+    return output_file
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description="Run S2ORC TEX2JSON")
+    parser.add_argument("-i", "--input", default=None, help="path to the input TEX zip file")
+    parser.add_argument("-t", "--temp", default='temp', help="path to a temp dir for partial files")
+    parser.add_argument("-o", "--output", default='output', help="path to the output dir for putting json files")
+    parser.add_argument("-l", "--log", default='log', help="path to the log dir")
+    parser.add_argument("-k", "--keep", default=False, help="keep temporary files")
+    args = parser.parse_args()
+    input_path = args.input
+    temp_path = args.temp
+    output_path = args.output
+    log_path = args.log
+    keep_temp = args.keep
+    start_time = time.time()
+    os.makedirs(temp_path, exist_ok=True)
+    os.makedirs(output_path, exist_ok=True)
+    process_tex_file(input_path, temp_path, output_path, log_path, keep_temp)
+    runtime = round(time.time() - start_time, 3)
+    print("runtime: %s seconds " % (runtime))
+    print('done.')

s2orc-doc2json/doc2json/tex2json/tex_to_xml.py ADDED Viewed

	@@ -0,0 +1,201 @@

+"""
+Process all the files in a LaTeX zip file to extract paper content
+1. Unzips LaTeX ZIP file
+2. Identifies primary TEX file
+3. Expands other TEX files into main TEX file using latexpand
+4. Expands BBL file into main TEX file
+5. Convert TEX file into XML using tralics
+6. Extract content of XML into S2ORC JSON
+"""
+import os
+import gzip
+import tarfile
+import zipfile
+import shutil
+from typing import Optional
+from doc2json.utils.latex_util import normalize, latex_to_xml
+def _is_gzip_file(fpath):
+    with open(fpath, 'rb') as test_f:
+        return test_f.read(2) == b'\x1f\x8b'
+def extract_latex(zip_file: str, latex_dir: str, cleanup=True):
+    """
+    Unzip latex zip into temp directory
+    :param zip_file:
+    :param latex_dir:
+    :param cleanup:
+    :return:
+    """
+    assert os.path.exists(zip_file)
+    assert zip_file.endswith('.gz') or zip_file.endswith('.zip') or zip_file.endswith('.tar')
+    # get name of zip file
+    file_id = os.path.splitext(zip_file)[0].split('/')[-1]
+    # check if tar file -> untar
+    tar_dir = os.path.join(latex_dir, file_id)
+    os.makedirs(tar_dir, exist_ok=True)
+    if tarfile.is_tarfile(zip_file):
+        with tarfile.open(zip_file) as tar:
+            tar.extractall(tar_dir)
+    # check if gzip file -> un-gz and/or untar
+    elif _is_gzip_file(zip_file):
+        tar_file = os.path.join(latex_dir, f'{file_id}.tar')
+        with gzip.open(zip_file, 'rb') as in_f, open(tar_file, 'wb') as out_f:
+            s = in_f.read()
+            out_f.write(s)
+        if os.path.exists(tar_file):
+            # check if tarfile
+            if tarfile.is_tarfile(tar_file):
+                with tarfile.open(tar_file) as tar:
+                    tar.extractall(tar_dir)
+                os.remove(tar_file)
+            # else, copy to tex file
+            else:
+                tex_file = os.path.join(latex_dir, file_id, f'{file_id}.tex')
+                os.makedirs(tar_dir, exist_ok=True)
+                os.rename(tar_file, tex_file)
+    # check if zip file -> unzip
+    elif zipfile.is_zipfile(zip_file):
+        with zipfile.ZipFile(zip_file, 'r') as in_f:
+            in_f.extractall(tar_dir)
+    else:
+        return None
+    # clean up if needed
+    if cleanup:
+        os.remove(zip_file)
+    # returns directory
+    if os.path.exists(tar_dir):
+        return tar_dir
+def normalize_latex(latex_dir: str, norm_dir: str, norm_log_file: str, cleanup=True) -> Optional[str]:
+    """
+    Normalize all latex files from arxiv
+    :param latex_dir:
+    :param norm_dir:
+    :param norm_log_file:
+    :param cleanup:
+    :return:
+    """
+    # normalize file
+    file_id = latex_dir.strip('/').split('/')[-1]
+    if file_id == 'skipped':
+        return None
+    norm_output_folder = os.path.join(norm_dir, file_id)
+    os.makedirs(norm_output_folder, exist_ok=True)
+    try:
+        normalize(latex_dir, norm_output_folder)
+    except TypeError:
+        shutil.rmtree(norm_output_folder)
+        with open(norm_log_file, 'a+') as log_f:
+            log_f.write(f'{file_id}\n')
+    # delete latex directory if cleanup
+    if cleanup:
+        shutil.rmtree(latex_dir)
+    return norm_output_folder
+def norm_latex_to_xml(norm_dir: str, xml_dir: str, xml_err_file: str, xml_log_file: str, cleanup=True) -> Optional[str]:
+    """
+    Convert LaTeX to XML using tralics
+    :param norm_dir:
+    :param xml_dir:
+    :param xml_err_file:
+    :param xml_log_file:
+    :param cleanup:
+    :return:
+    """
+    file_id = norm_dir.strip('/').split('/')[-1]
+    norm_tex_file = os.path.join(norm_dir, f'{file_id}.tex')
+    xml_output_dir = os.path.join(xml_dir, file_id)
+    xml_file = os.path.join(xml_output_dir, f'{file_id}.xml')
+    os.makedirs(xml_output_dir, exist_ok=True)
+    latex_to_xml(
+        tex_file=norm_tex_file,
+        out_dir=xml_output_dir,
+        out_file=xml_file,
+        err_file=xml_err_file,
+        log_file=xml_log_file
+    )
+    # delete norm directory if cleanup
+    if cleanup:
+        shutil.rmtree(norm_dir)
+    if os.path.exists(xml_file):
+        return xml_file
+def convert_latex_to_xml(
+        zip_file: str, latex_dir: str, norm_dir: str, xml_dir: str, log_dir: str, cleanup=True
+) -> Optional[str]:
+    """
+    Run expansion, normalization, xml conversion on latex
+    :param zip_file:
+    :param latex_dir:
+    :param norm_dir:
+    :param xml_dir:
+    :param log_dir:
+    :param cleanup:
+    :return:
+    """
+    # extract zip file
+    latex_output_dir = extract_latex(zip_file, latex_dir, cleanup)
+    # normalize latex
+    norm_log_file = os.path.join(log_dir, 'norm_error.log')
+    norm_output_dir = normalize_latex(latex_output_dir, norm_dir, norm_log_file, cleanup)
+    # convert to xml
+    xml_error_file = os.path.join(log_dir, 'xml_error.log')
+    xml_log_file = os.path.join(log_dir, 'xml_skip.log')
+    xml_output_file = norm_latex_to_xml(norm_output_dir, xml_dir, xml_error_file, xml_log_file, cleanup)
+    return xml_output_file
+def convert_latex_to_s2orc_json(
+        latex_zip: str,
+        base_temp_dir: str,
+        cleanup_after: bool=True
+) -> str:
+    """
+    Convert a LaTeX zip file to S2ORC JSON
+    :param latex_zip:
+    :param base_temp_dir:
+    :param cleanup_after:
+    :return:
+    """
+    if not os.path.exists(latex_zip):
+        raise FileNotFoundError("Input LaTeX ZIP file doesn't exist")
+    # temp directories
+    latex_expand_dir = os.path.join(base_temp_dir, 'latex')
+    latex_norm_dir = os.path.join(base_temp_dir, 'norm')
+    latex_xml_dir = os.path.join(base_temp_dir, 'xml')
+    latex_log_dir = os.path.join(base_temp_dir, 'log')
+    os.makedirs(base_temp_dir, exist_ok=True)
+    os.makedirs(latex_expand_dir, exist_ok=True)
+    os.makedirs(latex_norm_dir, exist_ok=True)
+    os.makedirs(latex_xml_dir, exist_ok=True)
+    os.makedirs(latex_log_dir, exist_ok=True)
+    # convert to XML
+    xml_file = convert_latex_to_xml(
+        latex_zip, latex_expand_dir, latex_norm_dir, latex_xml_dir, latex_log_dir, cleanup_after
+    )
+    return xml_file

s2orc-doc2json/doc2json/tex2json/xml_to_json.py ADDED Viewed

	@@ -0,0 +1,1396 @@

+import os
+import re
+import itertools
+import bs4
+from bs4 import BeautifulSoup, NavigableString
+from typing import List, Dict, Tuple, Optional
+import copy
+import latex2mathml.converter
+from doc2json.grobid2json.grobid.grobid_client import GrobidClient
+from doc2json.utils.grobid_util import parse_bib_entry, get_author_data_from_grobid_xml
+from doc2json.s2orc import Paper, Paragraph
+# XML tag names; presumably elements with these names are ignored during conversion
+# (usage is outside this section - TODO confirm)
+SKIP_TAGS = {
+    'clearpage',
+    'colorpool',
+    'newpage',
+    'tableofcontents'
+}
+# XML tag names; presumably elements with these names are treated as paragraph-like text
+# (usage is outside this section - TODO confirm)
+TEXT_TAGS = {
+    'p',
+    'proof',
+    'caption'
+}
+def normalize_latex_id(latex_id: str):
+    str_norm = latex_id.upper().replace('_', '')
+    if str_norm.startswith('BID'):
+        return str_norm.replace('BID', 'BIBREF')
+    if str_norm.startswith('CID'):
+        return str_norm.replace('CID', 'SECREF')
+    if str_norm.startswith('FORMULA'):
+        return str_norm.replace('FORMULA', 'EQREF')
+    return str_norm
+def process_author(
+        author_text: str,
+        grobid_client: GrobidClient,
+        logfile: str
+) -> List[Dict]:
+    """
+    Process authors
+    :param author_text:
+    :param grobid_client:
+    :param logfile:
+    :return:
+    """
+    if author_text:
+        author_xml_str = grobid_client.process_header_names(author_text, logfile)
+        if author_xml_str:
+            author_soup = BeautifulSoup(author_xml_str, 'xml')
+            author_entry = get_author_data_from_grobid_xml(author_soup)
+            return author_entry
+    return [{
+        "first": "",
+        "middle": [],
+        "last": author_text,
+        "suffix": "",
+        "affiliation": {},
+        "email": ""
+    }]
+def process_bibentry(bib_text: str, grobid_client: GrobidClient, logfile: str):
+    """
+    Process one bib entry text into title, authors, etc
+    :param bib_text:
+    :param grobid_client:
+    :param logfile:
+    :return:
+    """
+    if not bib_text:
+        return None
+    bib_lines = bib_text.split('\n')
+    bib_lines = [re.sub(r'\s+', ' ', line) for line in bib_lines]
+    bib_lines = [re.sub(r'\s', ' ', line).strip() for line in bib_lines]
+    bib_string = ' '.join(bib_lines)
+    xml_str = grobid_client.process_citation(bib_string, logfile)
+    if xml_str:
+        soup = BeautifulSoup(xml_str, 'lxml')
+        bib_entry = parse_bib_entry(soup)
+        if not bib_entry['raw_text']:
+            bib_entry['raw_text'] = bib_string
+        return bib_entry
+    return None
+def replace_ref_tokens(sp: BeautifulSoup, el: bs4.element.Tag, ref_map: Dict):
+    """
+    Replace all references in element with special tokens
+    :param sp:
+    :param el:
+    :param ref_map:
+    :return:
+    """
+    # replace all citations with cite keyword
+    for cite in el.find_all('cit'):
+        try:
+            target = cite.ref.get('target').replace('bid', 'BIBREF')
+            cite.replace_with(sp.new_string(f" {target} "))
+        except AttributeError:
+            print('Attribute error: ', cite)
+            continue
+    # replace all non citation references
+    for rtag in el.find_all('ref'):
+        try:
+            if rtag.get('target') and not rtag.get('target').startswith('bid'):
+                if rtag.get('target').startswith('cid'):
+                    target = rtag.get('target').replace('cid', 'SECREF')
+                elif rtag.get('target').startswith('uid'):
+                    if rtag.get('target').replace('uid', 'FIGREF') in ref_map:
+                        target = rtag.get('target').replace('uid', 'FIGREF')
+                    elif rtag.get('target').replace('uid', 'TABREF') in ref_map:
+                        target = rtag.get('target').replace('uid', 'TABREF')
+                    elif rtag.get('target').replace('uid', 'EQREF') in ref_map:
+                        target = rtag.get('target').replace('uid', 'EQREF')
+                    elif rtag.get('target').replace('uid', 'FOOTREF') in ref_map:
+                        target = rtag.get('target').replace('uid', 'FOOTREF')
+                    elif rtag.get('target').replace('uid', 'SECREFU') in ref_map:
+                        target = rtag.get('target').replace('uid', 'SECREFU')
+                    else:
+                        target = rtag.get('target').upper()
+                else:
+                    print('Weird ID!')
+                    target = rtag.get('target').upper()
+                rtag.replace_with(sp.new_string(f" {target} "))
+        except AttributeError:
+            print('Attribute error: ', rtag)
+            continue
+    return el
+def process_list_el(sp: BeautifulSoup, list_el: bs4.element.Tag, section_info: List, bib_map: Dict, ref_map: Dict):
+    """
+    Process list element
+    :param sp:
+    :param list_el:
+    :param section_info:
+    :param bib_map:
+    :param ref_map:
+    :return:
+    """
+    # TODO: currently parsing list as a list of paragraphs (append numbers to start of each entry in ordered lists)
+    list_items = []
+    for item in list_el.find_all('item'):
+        # skip itemize settings
+        if item.text.strip().startswith('[') and item.text.strip().endswith(']'):
+            continue
+        # try processing as paragraph
+        list_num = item.get('id-text', None)
+        item_as_para = process_paragraph(sp, item, section_info, bib_map, ref_map)
+        # append list number if ordered
+        if list_num:
+            list_num_str = f'{list_num}. '
+            # iterate cite spans
+            new_cite_spans = []
+            for span in item_as_para.cite_spans:
+                new_cite_spans.append({
+                    "start": span['start'] + len(list_num_str),
+                    "end": span['end'] + len(list_num_str),
+                    "text": span['text']
+                })
+            # iterate ref spans
+            new_ref_spans = []
+            for span in item_as_para.ref_spans:
+                new_ref_spans.append({
+                    "start": span['start'] + len(list_num_str),
+                    "end": span['end'] + len(list_num_str),
+                    "text": span['text']
+                })
+            # iterate equation spans
+            new_eq_spans = []
+            for span in item_as_para.eq_spans:
+                new_eq_spans.append({
+                    "start": span['start'] + len(list_num_str),
+                    "end": span['end'] + len(list_num_str),
+                    "text": span['text'],
+                    "latex": span['latex'],
+                    "ref_id": span['ref_id']
+                })
+            new_para = Paragraph(
+                text=list_num_str + item_as_para.text,
+                cite_spans=new_cite_spans,
+                ref_spans=new_ref_spans,
+                eq_spans=new_eq_spans,
+                section=item_as_para.section
+            )
+        else:
+            new_para = item_as_para
+        list_items.append(new_para)
+    return list_items
+def process_navstring(str_el: NavigableString, section_info: List):
+    """
+    Process one NavigableString
+    :param sp:
+    :param str_el:
+    :param section_info:
+    :param bib_map:
+    :param ref_map:
+    :return:
+    """
+    # substitute space characters
+    text = re.sub(r'\s+', ' ', str_el)
+    text = re.sub(r'\s', ' ', text)
+    # get all cite spans
+    all_cite_spans = []
+    for span in re.finditer(r'(BIBREF\d+)', text):
+        all_cite_spans.append({
+            "start": span.start(),
+            "end": span.start() + len(span.group()),
+            "ref_id": span.group()
+        })
+    # get all ref spans
+    all_ref_spans = []
+    for span in itertools.chain(
+        re.finditer(r'(FIGREF\d+)', text),
+        re.finditer(r'(TABREF\d+)', text),
+        re.finditer(r'(EQREF\d+)', text),
+        re.finditer(r'(FOOTREF\d+)', text),
+        re.finditer(r'(SECREF\d+)', text),
+        re.finditer(r'(SECREFU\d+)', text),
+    ):
+        all_ref_spans.append({
+            "start": span.start(),
+            "end": span.start() + len(span.group()),
+            "ref_id": span.group()
+        })
+    # assert all align
+    for cite_span in all_cite_spans:
+        assert text[cite_span['start']:cite_span['end']] == cite_span['ref_id']
+    for ref_span in all_ref_spans:
+        assert text[ref_span['start']:ref_span['end']] == ref_span['ref_id']
+    return Paragraph(
+        text=text,
+        cite_spans=all_cite_spans,
+        ref_spans=all_ref_spans,
+        eq_spans=[],
+        section=section_info
+    )
+def process_paragraph(sp: BeautifulSoup, para_el: bs4.element.Tag, section_info: List, bib_map: Dict, ref_map: Dict):
+    """
+    Process one paragraph
+    Formulas are swapped for INLINEFORM<n> / DISPLAYFORM<n> placeholder tokens, any remaining
+    <float> and <note> tags are removed, whitespace is normalized, and the character offsets of
+    every citation, reference and formula token in the resulting text are recorded.
+    :param sp: soup used to create the replacement strings
+    :param para_el: paragraph tag; it is modified in place
+    :param section_info: stored unchanged as the Paragraph's section
+    :param bib_map: map keyed by BIBREF token; an entry's 'num' becomes the cite span "text"
+    :param ref_map: map keyed by reference token; an entry's 'num' becomes the ref span "text"
+    :return: Paragraph holding the cleaned text and its cite / ref / eq spans
+    """
+    # replace all ref tokens with special tokens
+    # NOTE(review): replace_ref_tokens is defined elsewhere; presumably it returns the rewritten tag - confirm
+    para_el = replace_ref_tokens(sp, para_el, ref_map)
+    # sub and get corresponding spans of inline formulas
+    # formula_dict: placeholder token -> (math text, tex source, mathml, ref_id)
+    formula_dict = dict()
+    inline_key_ind = 0
+    display_key_ind = 0
+    for ftag in para_el.find_all('formula'):
+        try:
+            # if formula has ref id, treat as display formula
+            if ftag.get('id'):
+                formula_key = f'DISPLAYFORM{display_key_ind}'
+                ref_id = ftag.get('id').replace('uid', 'EQREF')
+                display_key_ind += 1
+            # else, treat as inline
+            else:
+                formula_key = f'INLINEFORM{inline_key_ind}'
+                ref_id = None
+                inline_key_ind += 1
+            # a failed conversion is not fatal: the formula is kept with empty mathml
+            try:
+                formula_mathml = latex2mathml.converter.convert(ftag.texmath.text)
+            except Exception:
+                formula_mathml = ""
+            formula_dict[formula_key] = (ftag.math.text, ftag.texmath.text, formula_mathml, ref_id)
+            ftag.replace_with(sp.new_string(f" {formula_key} "))
+        except AttributeError:
+            # a formula without <math> or <texmath> is left in the tree untouched; its counter
+            # was already advanced above, so that placeholder number is simply skipped
+            continue
+    # remove floats
+    for fl in para_el.find_all('float'):
+        print('Warning: still has <float/>!')
+        fl.decompose()
+    # remove notes
+    for note in para_el.find_all('note'):
+        print('Warning: still has <note/>!')
+        note.decompose()
+    # substitute space characters
+    text = re.sub(r'\s+', ' ', para_el.text)
+    text = re.sub(r'\s', ' ', text)
+    # get all cite spans
+    all_cite_spans = []
+    for span in re.finditer(r'(BIBREF\d+)', text):
+        all_cite_spans.append({
+            "start": span.start(),
+            "end": span.start() + len(span.group()),
+            # None when the token has no entry in bib_map
+            "text": bib_map[span.group()]['num'] if span.group() in bib_map else None,
+            "ref_id": span.group()
+        })
+    # get all ref spans
+    # spans are appended pattern by pattern, so the list is grouped by token kind, not sorted by offset
+    all_ref_spans = []
+    for span in itertools.chain(
+        re.finditer(r'(FIGREF\d+)', text),
+        re.finditer(r'(TABREF\d+)', text),
+        re.finditer(r'(EQREF\d+)', text),
+        re.finditer(r'(FOOTREF\d+)', text),
+        re.finditer(r'(SECREF\d+)', text),
+        re.finditer(r'(SECREFU\d+)', text),
+    ):
+        all_ref_spans.append({
+            "start": span.start(),
+            "end": span.start() + len(span.group()),
+            # None when the token has no entry in ref_map
+            "text": ref_map[span.group()]['num'] if span.group() in ref_map else None,
+            "ref_id": span.group()
+        })
+    # get all equation spans
+    all_eq_spans = []
+    for span in itertools.chain(
+            re.finditer(r'(INLINEFORM\d+)', text),
+            re.finditer(r'(DISPLAYFORM\d+)', text)
+    ):
+        try:
+            matching_formula = formula_dict[span.group()]
+            all_eq_spans.append({
+                "start": span.start(),
+                "end": span.start() + len(span.group()),
+                "text": matching_formula[0],
+                "latex": matching_formula[1],
+                "mathml": matching_formula[2],
+                # the placeholder token itself; the EQREF id stored in matching_formula[3] is not used here
+                "ref_id": span.group()
+            })
+        except KeyError:
+            # placeholder-looking text that was not produced by the formula loop above
+            continue
+    # assert all align (equation spans are not checked)
+    for cite_span in all_cite_spans:
+        assert text[cite_span['start']:cite_span['end']] == cite_span['ref_id']
+    for ref_span in all_ref_spans:
+        assert text[ref_span['start']:ref_span['end']] == ref_span['ref_id']
+    return Paragraph(
+        text=text,
+        cite_spans=all_cite_spans,
+        ref_spans=all_ref_spans,
+        eq_spans=all_eq_spans,
+        section=section_info
+    )
+def decompose_tags_before_title(sp: BeautifulSoup):
+    """
+    decompose all tags before title
+    :param sp:
+    :return:
+    """
+    if sp.body.next.name == 'std':
+        cld_tags = sp.std.find_all(recursive=False)
+        if any([tag.name == 'maketitle' or tag.name == 'title' for tag in cld_tags]):
+            for tag in sp.std:
+                if type(tag) == bs4.element.Tag:
+                    if tag.name != 'maketitle' and tag.name != 'title':
+                        tag.decompose()
+                    else:
+                        break
+    elif sp.body.next.name == 'unknown':
+        cld_tags = sp.unknown.find_all(recursive=False)
+        if any([tag.name == 'maketitle' or tag.name == 'title' for tag in cld_tags]):
+            for tag in sp.std:
+                if type(tag) == bs4.element.Tag:
+                    if tag.name != 'maketitle' and tag.name != 'title':
+                        tag.decompose()
+                    else:
+                        break
+    else:
+        print(f"Unknown inner tag: {sp.body.next.name}")
+        return
+def process_metadata(sp: BeautifulSoup, grobid_client: GrobidClient, log_file: str) -> Tuple[str, List]:
+    """
+    Process metadata section in soup
+    Reads the title and authors from <maketitle> if present, otherwise from <metadata>; with
+    neither, only a bare <title> is used. The tag that was read (<maketitle> or <metadata>) is
+    decomposed before returning, including when parsing it fails part-way.
+    :param sp: soup to read from and modify in place
+    :param grobid_client: passed through to process_author
+    :param log_file: passed through to process_author
+    :return: (title, authors); title is "" and authors is [] when nothing was found
+    """
+    title = ""
+    authors = []
+    if not sp.maketitle and not sp.metadata:
+        # no metadata block at all: fall back to a bare <title>, authors stay empty
+        if sp.title:
+            title = sp.title.text
+            return title, authors
+        else:
+            return title, authors
+    elif sp.maketitle:
+        try:
+            # process title
+            title = sp.maketitle.title.text
+            # sp.author is the first <author> tag anywhere in the soup
+            for formula in sp.author.find_all('formula'):
+                formula.decompose()
+            # process authors
+            # one entry per direct child of <author>: text nodes as-is, tags via their text
+            author_parts = []
+            for tag in sp.author:
+                if type(tag) == NavigableString:
+                    author_parts.append(tag.strip())
+                else:
+                    author_parts.append(tag.text.strip())
+            author_parts = [re.sub(r'\s+', ' ', line) for line in author_parts]
+            author_parts = [re.sub(r'\s', ' ', line).strip() for line in author_parts]
+            # drop empty pieces before joining
+            author_parts = [part for part in author_parts if part.strip()]
+            author_string = ', '.join(author_parts)
+            authors = process_author(author_string, grobid_client, log_file)
+            sp.maketitle.decompose()
+        except AttributeError:
+            # a missing <title> or <author> lands here; keep whatever was read so far
+            sp.maketitle.decompose()
+            return title, authors
+    elif sp.metadata:
+        try:
+            # process title and authors from metadata
+            title = sp.metadata.title.text
+            # get authors
+            for author in sp.authors:
+                # NOTE(review): children are decomposed while iterating over them, and a child of
+                # <authors> may itself be a text node - confirm which content is meant to survive
+                for subtag in author:
+                    subtag.decompose()
+                if author.text.strip():
+                    author_parts = author.text.strip().split()
+                    authors.append({
+                        # a single-token name has no first name, only a last name
+                        "first": author_parts[0] if len(author_parts) > 1 else "",
+                        # last token, unless it is a generational suffix, in which case the token before it
+                        "last": author_parts[-1]
+                            if author_parts[-1].lower() not in {"jr", "jr.", "iii", "iv", "v"}
+                            else author_parts[-2] if len(author_parts) > 1 else author_parts[-1],
+                        # everything between the first and last token
+                        "middle": author_parts[1:-1],
+                        # the suffix is detected above but never stored
+                        "suffix": "",
+                        "affiliation": {},
+                        "email": ""
+                    })
+            sp.metadata.decompose()
+        except AttributeError:
+            # keep the title and any authors collected before the failure
+            sp.metadata.decompose()
+            return title, authors
+    return title, authors
+def process_bibliography_from_tex(sp: BeautifulSoup, client, log_file) -> Dict:
+    """
+    Parse bibliography from latex
+    :return:
+    """
+    bibkey_map = dict()
+    # replace Bibliography with bibliography if needed
+    for bibl in sp.find_all("Bibliography"):
+        bibl.name = 'bibliography'
+    # construct bib map
+    for bibliography in sp.find_all('bibliography'):
+        bib_items = bibliography.find_all('bibitem')
+        # map all bib entries
+        if bib_items:
+            for bi_num, bi in enumerate(bib_items):
+                try:
+                    if not bi.get('id'):
+                        continue
+                    # get bib entry text and process it
+                    bib_par = bi.find_parent('p')
+                    if bib_par.text:
+                        bib_entry = process_bibentry(bib_par.text, client, log_file)
+                    else:
+                        next_tag = bib_par.findNext('p')
+                        if not next_tag.find('bibitem') and next_tag.text:
+                            bib_entry = process_bibentry(next_tag.text, client, log_file)
+                        else:
+                            bib_entry = None
+                    # if processed successfully, add to map
+                    if bib_entry:
+                        # get URLs from bib entry
+                        urls = []
+                        for xref in bib_par.find_all('xref'):
+                            urls.append(xref.get('url'))
+                        bib_entry['urls'] = urls
+                        # map to ref id
+                        ref_id = normalize_latex_id(bi.get('id'))
+                        bib_entry['ref_id'] = ref_id
+                        bib_entry['num'] = bi_num
+                        bibkey_map[ref_id] = bib_entry
+                except AttributeError:
+                    print('Attribute error in bib item!', bi)
+                    continue
+                except TypeError:
+                    print('Type error in bib item!', bi)
+                    continue
+        else:
+            for bi_num, p in enumerate(sp.bibliography.find_all('p')):
+                try:
+                    bib_key, bib_entry = None, None
+                    bib_text = p.text
+                    bib_name = re.match(r'\[(.*?)\](.*)', bib_text)
+                    if bib_name:
+                        bib_text = re.sub(r'\s', ' ', bib_text)
+                        bib_name = re.match(r'\[(.*?)\](.*)', bib_text)
+                        if bib_name:
+                            bib_key = bib_name.group(1)
+                            bib_entry = process_bibentry(bib_name.group(2), client, log_file)
+                    else:
+                        bib_lines = bib_text.split('\n')
+                        bib_key = re.sub(r'\s', ' ', bib_lines[0])
+                        bib_text = re.sub(r'\s', ' ', ' '.join(bib_lines[1:]))
+                        bib_entry = process_bibentry(bib_text, client, log_file)
+                    if bib_key and bib_entry:
+                        # get URLs from bib entry
+                        urls = []
+                        for xref in p.find_all('xref'):
+                            urls.append(xref.get('url'))
+                        bib_entry['urls'] = urls
+                        bib_entry['num'] = bi_num
+                        # map to bib id
+                        bibkey_map[bib_key] = bib_entry
+                except AttributeError:
+                    print('Attribute error in bib item!', p)
+                    continue
+                except TypeError:
+                    print('Type error in bib item!', p)
+                    continue
+    for bibliography in sp.find_all('bibliography'):
+        bibliography.decompose()
+    return bibkey_map
+def get_section_name(sec):
+    """
+    Get section name from div tag
+    :param sec:
+    :return:
+    """
+    if sec.head:
+        sec_text = sec.head.text
+    else:
+        sec_str = []
+        for tag in sec:
+            if type(tag) == NavigableString:
+                if len(tag.strip()) < 50:
+                    sec_str.append(tag.strip())
+                else:
+                    break
+            elif tag.name != 'p':
+                if len(tag.text.strip()) < 50:
+                    sec_str.append(tag.text.strip())
+                else:
+                    break
+            else:
+                break
+        sec_text = ' '.join(sec_str).strip()
+    return sec_text
+def get_sections_from_div(el: bs4.element.Tag, sp: BeautifulSoup, parent: Optional[str], faux_max: int) -> Dict:
+    """
+    Process section headers for one div
+    :param el:
+    :param sp:
+    :return:
+    """
+    sec_map_dict = dict()
+    el_ref_id = None
+    # process divs with ids
+    if el.get('id', None):
+        sec_num = el.get('id-text', None)
+        if 'cid' in el.get('id'):
+            el_ref_id = el.get('id').replace('cid', 'SECREF')
+        elif 'uid' in el.get('id'):
+            el_ref_id = el.get('id').replace('uid', 'SECREFU')
+        else:
+            print('Unknown ID type!', el.get('id'))
+            raise NotImplementedError
+        el['s2orc_id'] = el_ref_id
+        sec_map_dict[el_ref_id] = {
+            "num": sec_num,
+            "text": get_section_name(el),
+            "ref_id": el_ref_id,
+            "parent": parent
+        }
+    # process divs without section numbers
+    elif el.get('rend') == "nonumber":
+        el_ref_id = f'SECREF{faux_max}'
+        el['s2orc_id'] = el_ref_id
+        sec_map_dict[el_ref_id] = {
+            "num": None,
+            "text": get_section_name(el),
+            "ref_id": el_ref_id,
+            "parent": parent
+        }
+    # process sub elements
+    for sub_el in el.find_all(recursive=False):
+        if sub_el.name.startswith('div'):
+            # add any unspecified keys
+            sec_keys = [int(k.strip('SECREF')) for k in sec_map_dict.keys() if k and k.strip('SECREF').isdigit()]
+            faux_max = max(sec_keys + [faux_max]) + 1
+            sec_map_dict.update(
+                get_sections_from_div(sub_el, sp, el_ref_id if el_ref_id else parent, faux_max)
+            )
+        elif sub_el.name == 'p' or sub_el.name == 'proof':
+            if sub_el.get('id', None):
+                sec_num = sub_el.get('id-text', sub_el.hi.get('id-text', None))
+                if 'cid' in sub_el.get('id'):
+                    sub_el_ref_id = sub_el.get('id').replace('cid', 'SECREF')
+                elif 'uid' in sub_el.get('id'):
+                    sub_el_ref_id = sub_el.get('id').replace('uid', 'SECREFU')
+                else:
+                    print('Unknown ID type!', sub_el.get('id'))
+                    raise NotImplementedError
+                sub_el['s2orc_id'] = sub_el_ref_id
+                sec_map_dict[el_ref_id] = {
+                    "num": sec_num,
+                    "text": sub_el.head.text if sub_el.head else sub_el.hi.text if sub_el.hi else "",
+                    "ref_id": sub_el_ref_id,
+                    "parent": el_ref_id if el_ref_id else parent
+                }
+    return sec_map_dict
+def process_sections_from_text(sp: BeautifulSoup) -> Dict:
+    """
+    Generate section dict and replace with id tokens
+    :param sp:
+    :return:
+    """
+    # initialize
+    section_map = dict()
+    max_above_1000 = 999
+    for div0 in sp.find_all('div0'):
+        parent = None
+        section_map.update(get_sections_from_div(div0, sp, parent, max_above_1000 + 1))
+        # add any unspecified keys
+        sec_keys = [int(k.strip('SECREF')) for k in section_map.keys() if k and k.strip('SECREF').isdigit()]
+        max_above_1000 = max(sec_keys + [max_above_1000]) + 1
+    return section_map
+def process_equations_from_tex(sp: BeautifulSoup) -> Dict:
+    """
+    Generate equation dict and replace with id tokens
+    :param sp:
+    :return:
+    """
+    equation_map = dict()
+    for eq in sp.find_all('formula'):
+        try:
+            if eq.get('type', None) == 'display':
+                if eq.get('id', None):
+                    ref_id = eq.get('id').replace('uid', 'EQREF')
+                    try:
+                        mathml = latex2mathml.converter.convert(eq.texmath.text.strip())
+                    except Exception:
+                        mathml = ""
+                    equation_map[ref_id] = {
+                        "num": eq.get('id-text', None),
+                        "text": eq.math.text.strip(),
+                        "mathml": mathml,
+                        "latex": eq.texmath.text.strip(),
+                        "ref_id": ref_id
+                    }
+                replace_item = sp.new_tag('p')
+                equation_copy = copy.copy(eq)
+                equation_copy['type'] = 'inline'
+                replace_item.insert(0, equation_copy)
+                # replace with <p> containing equation as inline
+                eq.replace_with(replace_item)
+        except AttributeError:
+            continue
+    return equation_map
+def process_footnotes_from_text(sp: BeautifulSoup) -> Dict:
+    """
+    Process footnote marks
+    :param sp:
+    :return:
+    """
+    footnote_map = dict()
+    for note in sp.find_all('note'):
+        try:
+            if note.name and note.get('id'):
+                # normalize footnote id
+                ref_id = note.get('id').replace('uid', 'FOOTREF')
+                # remove equation tex
+                for eq in note.find_all('texmath'):
+                    eq.decompose()
+                # replace all xrefs with link
+                for xref in note.find_all('xref'):
+                    xref.replace_with(sp.new_string(f" {xref.get('url')} "))
+                # clean footnote text
+                footnote_text = None
+                if note.text:
+                    footnote_text = note.text.strip()
+                    footnote_text = re.sub(r'\s+', ' ', footnote_text)
+                    footnote_text = re.sub(r'\s', ' ', footnote_text)
+                # form footnote entry
+                footnote_map[ref_id] = {
+                    "num": note.get('id-text', None),
+                    "text": footnote_text,
+                    "ref_id": ref_id
+                }
+                note.replace_with(sp.new_string(f" {ref_id} "))
+        except AttributeError:
+            continue
+    return footnote_map
+def get_figure_map_from_tex(sp: BeautifulSoup) -> Dict:
+    """
+    Generate figure dict only
+    :param sp:
+    :return:
+    """
+    figure_map = dict()
+    # get floats first because they are around figures
+    for flt in sp.find_all('float'):
+        try:
+            if flt.name and flt.get('name') == 'figure':
+                # get files
+                fig_files = []
+                for fig in flt.find_all('figure'):
+                    if fig.get('file') and fig.get('extension'):
+                        fname = fig.get('file') + '.' + fig.get('extension')
+                        fig_files.append(fname)
+                    elif fig.get('file'):
+                        fname = fig.get('file')
+                        fig_files.append(fname)
+                    else:
+                        for subfig in fig.find_all('subfigure'):
+                            if subfig.get('file') and subfig.get('extension'):
+                                fig_files.append(subfig.get('file') + '.' + subfig.get('extension'))
+                            elif subfig.get('file'):
+                                fig_files.append(subfig.get('file'))
+                if flt.get('id'):
+                    ref_id = flt.get('id').replace('uid', 'FIGREF')
+                    # form figmap entry
+                    figure_map[ref_id] = {
+                        "num": flt.get('id-text', None),
+                        "text": None,   # placeholder
+                        "uris": fig_files,
+                        "ref_id": ref_id
+                    }
+        except AttributeError:
+            print('Attribute error with figure float: ', flt.name)
+            continue
+    for fig in sp.find_all('figure'):
+        try:
+            if fig.name and fig.get('id'):
+                # normalize figure id
+                ref_id = fig.get('id').replace('uid', 'FIGREF')
+                # try to get filenames of figures
+                fig_files = []
+                if fig.get('file') and fig.get('extension'):
+                    fname = fig.get('file') + '.' + fig.get('extension')
+                    fig_files.append(fname)
+                elif fig.get('file'):
+                    fig_files.append(fig.get('file'))
+                else:
+                    for subfig in fig.find_all('subfigure'):
+                        if subfig.get('file') and subfig.get('extension'):
+                            fig_files.append(subfig.get('file') + '.' + subfig.get('extension'))
+                        elif subfig.get('file'):
+                            fig_files.append(subfig.get('file'))
+                # form figmap entry
+                figure_map[ref_id] = {
+                    "num": fig.get('id-text', None),
+                    "text": None,   # placeholder
+                    "uris": fig_files,
+                    "ref_id": ref_id
+                }
+        except AttributeError:
+            print('Attribute error with figure: ', fig.name)
+            continue
+    return figure_map
+def process_figures_from_tex(sp: BeautifulSoup, ref_map: Dict) -> Dict:
+    """
+    Add figure captions to fig_map and decompose
+    :param sp:
+    :param ref_map:
+    :return:
+    """
+    # process floats first because they are on the outside
+    for flt in sp.find_all('float'):
+        try:
+            if flt.name and flt.get('name') == 'figure':
+                if flt.get('id'):
+                    ref_id = flt.get('id').replace('uid', 'FIGREF')
+                    # remove equation tex
+                    for eq in flt.find_all('texmath'):
+                        eq.decompose()
+                    # clean caption text
+                    caption_text = None
+                    if flt.caption:
+                        flt = replace_ref_tokens(sp, flt, ref_map)
+                        caption_text = flt.caption.text.strip()
+                        caption_text = re.sub(r'\s+', ' ', caption_text)
+                        caption_text = re.sub(r'\s', ' ', caption_text)
+                    # form figmap entry
+                    ref_map[ref_id]['text'] = caption_text
+                flt.decompose()
+        except AttributeError:
+            print('Attribute error with figure float: ', flt.name)
+            continue
+    for fig in sp.find_all('figure'):
+        try:
+            if fig.name and fig.get('id'):
+                # normalize figure id
+                ref_id = fig.get('id').replace('uid', 'FIGREF')
+                # remove equation tex
+                for eq in fig.find_all('texmath'):
+                    eq.decompose()
+                # clean caption text
+                caption_text = None
+                if fig.text:
+                    fig = replace_ref_tokens(sp, fig, ref_map)
+                    caption_text = fig.text.strip()
+                    caption_text = re.sub(r'\s+', ' ', caption_text)
+                    caption_text = re.sub(r'\s', ' ', caption_text)
+                # add text to figmap entry
+                ref_map[ref_id]["text"] = caption_text
+        except AttributeError:
+            print('Attribute error with figure: ', fig.name)
+            continue
+        fig.decompose()
+    return ref_map
+def convert_table_to_html(table_lst: List) -> str:
+    if not table_lst:
+        return ''
+    html_str = '<table>'
+    for i, row in enumerate(table_lst):
+        html_str += '<tr>'
+        bottom_border = row.get('bottom-border')
+        if i == 0 or bottom_border:
+            for cell in row['cells']:
+                html_str += f"<th>{cell['text']}</th>"
+        else:
+            for cell in row['cells']:
+                html_str += f"<td>{cell['text']}</td>"
+        html_str += '</tr>'
+    html_str += '</table>'
+    return html_str
+def extract_table(table: BeautifulSoup) -> List:
+    """
+    Extract table values from table entry
+    :param table:
+    :return:
+    """
+    table_rep = []
+    for row in table.find_all('row'):
+        cells = []
+        for cell in row.find_all('cell'):
+            text_items = []
+            latex_items = []
+            for child in cell:
+                if type(child) == NavigableString:
+                    text_items.append(str(child))
+                    latex_items.append(str(child))
+                elif child.name == 'formula':
+                    text_items.append(child.math.text)
+                    latex_items.append(child.texmath.text)
+                else:
+                    text_items.append(child.text)
+                    latex_items.append(child.text)
+            text = ' '.join(text_items)
+            text = re.sub(r'\s+', ' ', text)
+            text = re.sub(r'\s', ' ', text)
+            latex = ' '.join(latex_items)
+            latex = re.sub(r'\s+', ' ', latex)
+            cells.append({
+                "alignment": cell.get('halign'),
+                "right-border": cell.get('right-border') == 'true',
+                "left-border": cell.get('left-border') == 'true',
+                "text": text.strip(),
+                "latex": latex.strip()
+            })
+        table_rep.append({
+            "top-border": row.get('top-border') == "true",
+            "bottom-border": row.get('bottom-border') == "true",
+            "cells": cells
+        })
+    return table_rep
+def get_table_map_from_text(sp: BeautifulSoup, keep_table_contents=True) -> Dict:
+    """
+    Generate table dict only
+    :param sp:
+    :param keep_table_contents:
+    :return:
+    """
+    table_map = dict()
+    for flt in sp.find_all('float'):
+        try:
+            if flt.name and flt.get('name') == 'table':
+                if flt.get('id'):
+                    # normalize table id
+                    ref_id = flt.get('id').replace('uid', 'TABREF')
+                    # get table content
+                    content = extract_table(flt) if keep_table_contents else None
+                    html = convert_table_to_html(content) if keep_table_contents else None
+                    # form tabmap entry
+                    table_map[ref_id] = {
+                        "num": flt.get('id-text', None),
+                        "text": None,   # placeholder
+                        "content": content,
+                        "html": html,
+                        "ref_id": ref_id
+                    }
+                    for row in flt.find_all('row'):
+                        row.decompose()
+        except AttributeError:
+            print('Attribute error with table float: ', flt.name)
+            continue
+    for tab in sp.find_all('table'):
+        try:
+            # skip inline tables
+            if tab.get('rend') == 'inline':
+                continue
+            # process them
+            if tab.name and tab.get('id'):
+                # normalize table id
+                ref_id = tab.get('id').replace('uid', 'TABREF')
+                # get table content
+                content = extract_table(tab) if keep_table_contents else None
+                html = convert_table_to_html(content) if keep_table_contents else None
+                # form tabmap entry
+                table_map[ref_id] = {
+                    "num": tab.get('id-text', None),
+                    "text": None,   # placeholder
+                    "content": content,
+                    "html": html,
+                    "ref_id": ref_id
+                }
+                for row in tab.find_all('row'):
+                    row.decompose()
+        except AttributeError:
+            print('Attribute error with table: ', tab.name)
+            continue
+    return table_map
+def process_tables_from_tex(sp: BeautifulSoup, ref_map: Dict) -> Dict:
+    """
+    Generate table dict and replace with id tokens
+    :param sp:
+    :param ref_map:
+    :return:
+    """
+    # process floats first because they are on the outside
+    for flt in sp.find_all('float'):
+        try:
+            if flt.name and flt.get('name') == 'table':
+                if flt.get('id'):
+                    # normalize table id
+                    ref_id = flt.get('id').replace('uid', 'TABREF')
+                    # remove equation tex
+                    if flt.caption:
+                        caption_el = replace_ref_tokens(sp, flt.caption, ref_map)
+                        for eq in caption_el.find_all('texmath'):
+                            eq.decompose()
+                        caption_text = caption_el.text.strip()
+                    elif flt.head:
+                        head_el = replace_ref_tokens(sp, flt.head, ref_map)
+                        for eq in head_el.find_all('texmath'):
+                            eq.decompose()
+                        caption_text = head_el.text.strip()
+                    elif flt.p:
+                        caption_parts = []
+                        for tab_p in flt.find_all('p'):
+                            p_el = replace_ref_tokens(sp, tab_p, ref_map)
+                            for eq in p_el.find_all('texmath'):
+                                eq.decompose()
+                            caption_parts.append(p_el.text.strip())
+                        caption_text = ' '.join(caption_parts)
+                    else:
+                        tab_el = replace_ref_tokens(sp, flt, ref_map)
+                        caption_text = tab_el.text.strip()
+                    if caption_text:
+                        caption_text = re.sub(r'\s+', ' ', caption_text)
+                        caption_text = re.sub(r'\s', ' ', caption_text)
+                    # form tabmap entry
+                    ref_map[ref_id]['text'] = caption_text
+                flt.decompose()
+        except AttributeError:
+            print('Attribute error with table float: ', flt.name)
+            continue
+    for tab in sp.find_all('table'):
+        try:
+            # skip inline tables
+            if tab.get('rend') == 'inline':
+                continue
+            # process them
+            if tab.name and tab.get('id'):
+                # normalize table id
+                ref_id = tab.get('id').replace('uid', 'TABREF')
+                # remove equation tex from caption and clean and resolve refs
+                if tab.caption:
+                    caption_el = replace_ref_tokens(sp, tab.caption, ref_map)
+                    for eq in caption_el.find_all('texmath'):
+                        eq.decompose()
+                    caption_text = caption_el.text.strip()
+                elif tab.head:
+                    head_el = replace_ref_tokens(sp, tab.head, ref_map)
+                    for eq in head_el.find_all('texmath'):
+                        eq.decompose()
+                    caption_text = head_el.text.strip()
+                elif tab.p:
+                    caption_parts = []
+                    for tab_p in tab.find_all('p'):
+                        p_el = replace_ref_tokens(sp, tab_p, ref_map)
+                        for eq in p_el.find_all('texmath'):
+                            eq.decompose()
+                        caption_parts.append(p_el.text.strip())
+                    caption_text = ' '.join(caption_parts)
+                else:
+                    tab_el = replace_ref_tokens(sp, tab, ref_map)
+                    caption_text = tab_el.text.strip()
+                if caption_text:
+                    caption_text = re.sub(r'\s+', ' ', caption_text)
+                    caption_text = re.sub(r'\s', ' ', caption_text)
+                # form tabmap entry
+                ref_map[ref_id]['text'] = caption_text
+        except AttributeError:
+            print('Attribute error with table: ', tab.name)
+            continue
+        tab.decompose()
+    return ref_map
+def combine_ref_maps(eq_map: Dict, fig_map: Dict, tab_map: Dict, foot_map: Dict, sec_map: Dict):
+    """
+    Combine all items with ref ids into one map
+    :param eq_map:
+    :param fig_map:
+    :param tab_map:
+    :param sec_map:
+    :return:
+    """
+    ref_map = dict()
+    for k, v in eq_map.items():
+        v['type'] = 'equation'
+        ref_map[k] = v
+    for k, v in fig_map.items():
+        v['type'] = 'figure'
+        ref_map[k] = v
+    for k, v in tab_map.items():
+        v['type'] = 'table'
+        ref_map[k] = v
+    for k, v in foot_map.items():
+        v['type'] = 'footnote'
+        ref_map[k] = v
+    for k, v in sec_map.items():
+        v['type'] = 'section'
+        ref_map[k] = v
+    return ref_map
+def collapse_formatting_tags(sp: BeautifulSoup):
+    """
+    Collapse formatting tags like <hi>
+    :param sp:
+    :return:
+    """
+    for hi in sp.find_all('hi'):
+        hi.replace_with(f' {sp.new_string(hi.text.strip())} ')
+def process_abstract_from_tex(sp: BeautifulSoup, bib_map: Dict, ref_map: Dict) -> List[Dict]:
+    """
+    Parse abstract from soup
+    :param sp:
+    :param bib_map:
+    :param ref_map:
+    :return:
+    """
+    abstract_text = []
+    if sp.abstract:
+        for p in sp.abstract.find_all('p'):
+            abstract_text.append(
+                process_paragraph(sp, p, [(None, "Abstract")], bib_map, ref_map)
+            )
+        sp.abstract.decompose()
+    else:
+        if sp.std:
+            p_tags = [tag for tag in sp.std if tag.name == 'p' and not tag.get('s2orc_id', None)]
+        elif sp.unknown:
+            p_tags = [tag for tag in sp.unknown if tag.name == 'p' and not tag.get('s2orc_id', None)]
+        else:
+            p_tags = None
+        if p_tags:
+            for p in p_tags:
+                abstract_text.append(
+                    process_paragraph(sp, p, [(None, "Abstract")], bib_map, ref_map)
+                )
+                p.decompose()
+    return [para.__dict__ for para in abstract_text]
+def build_section_list(sec_id: str, ref_map: Dict) -> List[Tuple]:
+    """
+    Build list of sections from reference map from sec_id using parent entry recursively
+    :param sec_id:
+    :param ref_map:
+    :return:
+    """
+    if not sec_id:
+        return []
+    elif sec_id not in ref_map:
+        return []
+    else:
+        sec_entry = [(ref_map[sec_id]['num'], ref_map[sec_id]['text'])]
+        if ref_map[sec_id]['parent'] == sec_id:
+            return sec_entry
+        else:
+            return build_section_list(ref_map[sec_id]['parent'], ref_map) + sec_entry
+def get_seclist_for_el(el: bs4.element.Tag, ref_map: Dict, default_seclist: List) -> List[Tuple]:
+    """
+    Build sec_list for tag
+    :param el:
+    :param ref_map:
+    :param default_seclist:
+    :return:
+    """
+    if type(el) == NavigableString:
+        return default_seclist
+    sec_id = el.get('s2orc_id', None)
+    if sec_id:
+        return build_section_list(sec_id, ref_map)
+    else:
+        return default_seclist
+def process_div(tag: bs4.element.Tag, secs: List, sp: BeautifulSoup, bib_map: Dict, ref_map: Dict) -> List[Dict]:
+    """
+    Process div recursively
+    Dispatches on the tag name: string nodes and tags in SKIP_TAGS yield nothing, tags in
+    TEXT_TAGS and formulas go to process_paragraph, lists go to process_list_el, and tags
+    whose name starts with 'div' are walked child by child.
+    :param tag: element to process
+    :param secs: section list handed to the helpers for content found in this element
+    :param sp: soup the element belongs to (used to create the wrapper tag for formulas)
+    :param bib_map: bibliography map forwarded to the helpers
+    :param ref_map: reference map forwarded to the helpers and used to resolve section lists
+    :return: list of whatever process_paragraph / process_list_el produce (empty for skipped
+        or unknown tags)
+    """
+    # iterate through children of this tag
+    body_text = []
+    # navigable strings
+    if type(tag) == NavigableString:
+        return []
+    # skip these tags
+    elif tag.name in SKIP_TAGS:
+        return []
+    # process normal tags
+    elif tag.name in TEXT_TAGS:
+        if tag.text:
+            body_text.append(process_paragraph(sp, tag, secs, bib_map, ref_map))
+    # process lists
+    elif tag.name == 'list':
+        if tag.text:
+            body_text += process_list_el(sp, tag, secs, bib_map, ref_map)
+    # process formula
+    elif tag.name == 'formula':
+        # wrap a copy of the formula (marked type="inline") in a new <p> and put that <p>
+        # into the tree in place of the formula
+        replace_item = sp.new_tag('p')
+        tag_copy = copy.copy(tag)
+        tag_copy['type'] = 'inline'
+        replace_item.insert(0, tag_copy)
+        tag.replace_with(replace_item)
+        # NOTE(review): the original formula tag, now detached from the tree, is what gets
+        # processed here -- not the new <p> wrapper; confirm this is intended
+        if tag.text:
+            body_text.append(process_paragraph(sp, tag, secs, bib_map, ref_map))
+    # process divs
+    elif tag.name.startswith('div'):
+        for el in tag:
+            # process tags
+            if type(el) == bs4.element.Tag:
+                # a child with its own s2orc_id gets its own section list, otherwise it
+                # inherits this div's list
+                el_sec_list = get_seclist_for_el(el, ref_map, secs)
+                body_text += process_div(el, el_sec_list, sp, bib_map, ref_map)
+    # unknown tag type, skip for now
+    else:
+        print(f'Unknown tag type: {tag.name}')
+        return []
+    return body_text
+def process_body_text_from_tex(sp: BeautifulSoup, bib_map: Dict, ref_map: Dict) -> List[Dict]:
+    """
+    Parse body text from tag recursively
+    :param sp:
+    :param bib_map:
+    :param ref_map:
+    :return:
+    """
+    body_text = []
+    for tag in sp.body:
+        # skip navigable string
+        if type(tag) == NavigableString:
+            continue
+        else:
+            sec_list = get_seclist_for_el(tag, ref_map, [])
+            for cld in tag:
+                # skip navigable string
+                if type(tag) == NavigableString:
+                    continue
+                else:
+                    sec_list = get_seclist_for_el(cld, ref_map, sec_list)
+                    if type(cld) == bs4.element.Tag:
+                        body_text += process_div(cld, sec_list, sp, bib_map, ref_map)
+    # decompose everything
+    sp.body.decompose()
+    return [para.__dict__ for para in body_text]
+def convert_xml_to_s2orc(
+        sp: BeautifulSoup, file_id: str, year_str: str, log_file: str, grobid_config: Optional[Dict]=None
+) -> Paper:
+    """
+    Convert a bunch of xml to gorc format
+    Runs the extraction steps in a fixed order on the soup (each step prunes or rewrites the
+    tree for the following ones) and assembles the results into a Paper. Warnings about a
+    missing bibliography or missing body text are appended to log_file.
+    :param sp: soup of the xml document; modified in place
+    :param file_id: used as paper id, as arxiv_id in the metadata and as prefix of log lines
+    :param year_str: value stored as the metadata year
+    :param log_file: path of the log file that warnings are appended to
+    :param grobid_config: configuration passed to GrobidClient
+    :return: the assembled Paper
+    """
+    # create grobid client
+    client = GrobidClient(grobid_config)
+    # TODO: not sure why but have to run twice
+    decompose_tags_before_title(sp)
+    decompose_tags_before_title(sp)
+    # process maketitle info
+    title, authors = process_metadata(sp, client, log_file)
+    # processing of bibliography entries
+    # TODO: look into why authors aren't processing
+    bibkey_map = process_bibliography_from_tex(sp, client, log_file)
+    # no bibliography entries
+    if not bibkey_map:
+        with open(log_file, 'a+') as bib_f:
+            bib_f.write(f'{file_id},warn_no_bibs\n')
+    # process section headers
+    section_map = process_sections_from_text(sp)
+    # process and replace non-inline equations
+    equation_map = process_equations_from_tex(sp)
+    # process footnote markers
+    footnote_map = process_footnotes_from_text(sp)
+    # get figure map
+    figure_map = get_figure_map_from_tex(sp)
+    # get table_map
+    table_map = get_table_map_from_text(sp)
+    # combine references in one dict
+    refkey_map = combine_ref_maps(equation_map, figure_map, table_map, footnote_map, section_map)
+    # process and replace figures
+    refkey_map = process_figures_from_tex(sp, refkey_map)
+    # process and replace tables
+    refkey_map = process_tables_from_tex(sp, refkey_map)
+    # collapse all hi tags
+    collapse_formatting_tags(sp)
+    # process abstract if possible
+    abstract = process_abstract_from_tex(sp, bibkey_map, refkey_map)
+    # process body text
+    body_text = process_body_text_from_tex(sp, bibkey_map, refkey_map)
+    # no body text parsed: only a warning is logged, the paper is still returned
+    if not body_text:
+        with open(log_file, 'a+') as body_f:
+            body_f.write(f'{file_id},warn_no_body\n')
+    # venue is not extracted here and is always left empty
+    metadata = {
+        "title": title,
+        "authors": authors,
+        "year": year_str,
+        "venue": "",
+        "identifiers": {
+            "arxiv_id": file_id
+        }
+    }
+    return Paper(
+        paper_id=file_id,
+        pdf_hash="",
+        metadata=metadata,
+        abstract=abstract,
+        body_text=body_text,
+        back_matter=[],
+        bib_entries=bibkey_map,
+        ref_entries=refkey_map
+    )
+def convert_latex_xml_to_s2orc_json(xml_fpath: str, log_dir: str, grobid_config: Optional[Dict]=None) -> Paper:
+    """
+    :param xml_fpath:
+    :param log_dir:
+    :param grobid_config:
+    :return:
+    """
+    assert os.path.exists(xml_fpath)
+    # get file id
+    file_id = str(os.path.splitext(xml_fpath)[0]).split('/')[-1]
+    # try to get year from file name
+    year = file_id.split('.')[0][:2]
+    if year.isdigit():
+        year = int(year)
+        if year < 40:
+            year += 2000
+        else:
+            year += 1900
+        year = str(year)
+    else:
+        year = ""
+    # log file
+    log_file = os.path.join(log_dir, 'failed.log')
+    with open(xml_fpath, 'r') as f:
+        try:
+            xml = f.read()
+            soup = BeautifulSoup(xml, "lxml")
+            paper = convert_xml_to_s2orc(soup, file_id, year, log_file, grobid_config=grobid_config)
+            return paper
+        except UnicodeDecodeError:
+            with open(log_file, 'a+') as log_f:
+                log_f.write(f'{file_id},err_unicode_decode\n')
+            raise UnicodeDecodeError

s2orc-doc2json/doc2json/utils/__init__.py ADDED Viewed

File without changes

s2orc-doc2json/doc2json/utils/citation_util.py ADDED Viewed

	@@ -0,0 +1,75 @@

+# utility functions for handling failure situations with grobid-detected citation spans
+import re
+from typing import Dict, List, Tuple
+# matches a bracketed citation list such as [1], [2, 3] or [4-6]: numbers of 1-3 digits without
+# a leading zero, separated by commas, semicolons, hyphens or whitespace, optional trailing ';'
+BRACKET_REGEX = re.compile(r'\[[1-9]\d{0,2}([,;\-\s]+[1-9]\d{0,2})*;?\]')
+# NOTE(review): not used in the visible code; presumably the number of bracket matches needed
+# to treat a paper as using bracket-style citations -- confirm against callers
+BRACKET_STYLE_THRESHOLD = 5
+# matches one bracketed number such as [12] and captures the number
+SINGLE_BRACKET_REGEX = re.compile(r'\[([1-9]\d{0,2})\]')
+# characters accepted as a range marker between two refs (hyphen-minus and en dash)
+EXPANSION_CHARS = {'-', '–'}
+def span_already_added(sub_start: int, sub_end: int, span_indices: List[Tuple[int, int]]) -> bool:
+    """
+    Check if span is a subspan of existing span
+    :param sub_start:
+    :param sub_end:
+    :param span_indices:
+    :return:
+    """
+    for span_start, span_end in span_indices:
+        if sub_start >= span_start and sub_end <= span_end:
+            return True
+    return False
+def is_expansion_string(between_string: str) -> bool:
+    """
+    Check if the string between two refs is an expansion string
+    :param between_string:
+    :return:
+    """
+    if len(between_string) <= 2 \
+            and any([c in EXPANSION_CHARS for c in between_string]) \
+            and all([c in EXPANSION_CHARS.union({' '}) for c in between_string]):
+        return True
+    return False
+# TODO: still cases like `09bcee03baceb509d4fcf736fa1322cb8adf507f` w/ dups like ['L Jung', 'R Hessler', 'Louis Jung', 'Roland Hessler']
+# example paper that has empties & duplicates: `09bce26cc7e825e15a4469e3e78b7a54898bb97f`
+def _clean_empty_and_duplicate_authors_from_grobid_parse(authors: List[Dict]) -> List[Dict]:
+    """
+    Within affiliation, `location` is a dict with fields <settlement>, <region>, <country>, <postCode>, etc.
+    Too much hassle, so just take the first one that's not empty.
+    """
+    # stripping empties
+    clean_authors_list = []
+    for author in authors:
+        clean_first = author['first'].strip()
+        clean_last = author['last'].strip()
+        clean_middle = [m.strip() for m in author['middle']]
+        clean_suffix = author['suffix'].strip()
+        if clean_first or clean_last or clean_middle:
+            author['first'] = clean_first
+            author['last'] = clean_last
+            author['middle'] = clean_middle
+            author['suffix'] = clean_suffix
+            clean_authors_list.append(author)
+    # combining duplicates (preserve first occurrence of author name as position)
+    key_to_author_blobs = {}
+    ordered_keys_by_author_pos = []
+    for author in clean_authors_list:
+        key = (author['first'], author['last'], ' '.join(author['middle']), author['suffix'])
+        if key not in key_to_author_blobs:
+            key_to_author_blobs[key] = author
+            ordered_keys_by_author_pos.append(key)
+        else:
+            if author['email']:
+                key_to_author_blobs[key]['email'] = author['email']
+            if author['affiliation'] and (author['affiliation']['institution'] or author['affiliation']['laboratory'] or author['affiliation']['location']):
+                key_to_author_blobs[key]['affiliation'] = author['affiliation']
+    dedup_authors_list = [key_to_author_blobs[key] for key in ordered_keys_by_author_pos]
+    return dedup_authors_list

s2orc-doc2json/doc2json/utils/grobid_util.py ADDED Viewed

	@@ -0,0 +1,388 @@

+from typing import List, Dict, Optional
+import bs4
+from bs4 import BeautifulSoup
+import re
+from collections import defaultdict
+# mixed-case tag names that clean_tags renames to their lowercase form, so that the rest of
+# this module can look them up as e.g. `persname` or `biblscope`
+SUBSTITUTE_TAGS = {
+    'persName',
+    'orgName',
+    'publicationStmt',
+    'titleStmt',
+    'biblScope'
+}
+def clean_tags(el: bs4.element.Tag):
+    """
+    Replace all tags with lowercase version
+    :param el:
+    :return:
+    """
+    for sub_tag in SUBSTITUTE_TAGS:
+        for sub_el in el.find_all(sub_tag):
+            sub_el.name = sub_tag.lower()
+def soup_from_path(file_path: str):
+    """
+    Read XML file
+    :param file_path:
+    :return:
+    """
+    return BeautifulSoup(open(file_path, "rb").read(), "xml")
+def get_title_from_grobid_xml(raw_xml: BeautifulSoup) -> str:
+    """
+    Returns title
+    :return:
+    """
+    for title_entry in raw_xml.find_all("title"):
+        if title_entry.has_attr("level") \
+                and title_entry["level"] == "a":
+            return title_entry.text
+    try:
+        return raw_xml.title.text
+    except AttributeError:
+        return ""
+def get_author_names_from_grobid_xml(raw_xml: BeautifulSoup) -> List[Dict[str, str]]:
+    """
+    Returns a list of dictionaries, one for each author,
+    containing the first and last names.
+    e.g.
+        {
+            "first": first,
+            "middle": middle,
+            "last": last,
+            "suffix": suffix
+        }
+    """
+    names = []
+    for author in raw_xml.find_all("author"):
+        if not author.persname:
+            continue
+        # forenames include first and middle names
+        forenames = author.persname.find_all("forename")
+        # surnames include last names
+        surnames = author.persname.find_all("surname")
+        # name suffixes
+        suffixes = author.persname.find_all("suffix")
+        first = ""
+        middle = []
+        last = ""
+        suffix = ""
+        for forename in forenames:
+            if forename["type"] == "first":
+                if not first:
+                    first = forename.text
+                else:
+                    middle.append(forename.text)
+            elif forename["type"] == "middle":
+                middle.append(forename.text)
+        if len(surnames) > 1:
+            for surname in surnames[:-1]:
+                middle.append(surname.text)
+            last = surnames[-1].text
+        elif len(surnames) == 1:
+            last = surnames[0].text
+        if len(suffix) >= 1:
+            suffix = " ".join([suffix.text for suffix in suffixes])
+        names_dict = {
+            "first": first,
+            "middle": middle,
+            "last": last,
+            "suffix": suffix
+        }
+        names.append(names_dict)
+    return names
+def get_affiliation_from_grobid_xml(raw_xml: BeautifulSoup) -> Dict:
+    """
+    Get affiliation from grobid xml
+    :param raw_xml:
+    :return:
+    """
+    location_dict = dict()
+    laboratory_name = ""
+    institution_name = ""
+    if raw_xml and raw_xml.affiliation:
+        for child in raw_xml.affiliation:
+            if child.name == "orgname":
+                if child.has_attr("type"):
+                    if child["type"] == "laboratory":
+                        laboratory_name = child.text
+                    elif child["type"] == "institution":
+                        institution_name = child.text
+            elif child.name == "address":
+                for grandchild in child:
+                    if grandchild.name and grandchild.text:
+                        location_dict[grandchild.name] = grandchild.text
+        if laboratory_name or institution_name:
+            return {
+                "laboratory": laboratory_name,
+                "institution": institution_name,
+                "location": location_dict
+            }
+    return {}
+def get_author_data_from_grobid_xml(raw_xml: BeautifulSoup) -> List[Dict]:
+    """
+    Returns a list of dictionaries, one for each author,
+    containing the first and last names.
+    e.g.
+        {
+            "first": first,
+            "middle": middle,
+            "last": last,
+            "suffix": suffix,
+            "affiliation": {
+                "laboratory": "",
+                "institution": "",
+                "location": "",
+            },
+            "email": ""
+        }
+    """
+    authors = []
+    for author in raw_xml.find_all("author"):
+        first = ""
+        middle = []
+        last = ""
+        suffix = ""
+        if author.persname:
+            # forenames include first and middle names
+            forenames = author.persname.find_all("forename")
+            # surnames include last names
+            surnames = author.persname.find_all("surname")
+            # name suffixes
+            suffixes = author.persname.find_all("suffix")
+            for forename in forenames:
+                if forename.has_attr("type"):
+                    if forename["type"] == "first":
+                        if not first:
+                            first = forename.text
+                        else:
+                            middle.append(forename.text)
+                    elif forename["type"] == "middle":
+                        middle.append(forename.text)
+            if len(surnames) > 1:
+                for surname in surnames[:-1]:
+                    middle.append(surname.text)
+                last = surnames[-1].text
+            elif len(surnames) == 1:
+                last = surnames[0].text
+            if len(suffix) >= 1:
+                suffix = " ".join([suffix.text for suffix in suffixes])
+        affiliation = get_affiliation_from_grobid_xml(author)
+        email = ""
+        if author.email:
+            email = author.email.text
+        author_dict = {
+            "first": first,
+            "middle": middle,
+            "last": last,
+            "suffix": suffix,
+            "affiliation": affiliation,
+            "email": email
+        }
+        authors.append(author_dict)
+    return authors
+def get_year_from_grobid_xml(raw_xml: BeautifulSoup) -> Optional[int]:
+    """
+    Returns date published if exists
+    :return:
+    """
+    if raw_xml.date and raw_xml.date.has_attr("when"):
+        # match year in date text (which is in some unspecified date format)
+        year_match = re.match(r"((19|20)\d{2})", raw_xml.date["when"])
+        if year_match:
+            year = year_match.group(0)
+            if year and year.isnumeric() and len(year) == 4:
+                return int(year)
+    return None
+def get_venue_from_grobid_xml(raw_xml: BeautifulSoup, title_text: str) -> str:
+    """
+    Returns venue/journal/publisher of bib entry
+    Grobid ref documentation: https://grobid.readthedocs.io/en/latest/training/Bibliographical-references/
+    level="j": journal title
+    level="m": "non journal bibliographical item holding the cited article"
+    level="s": series title
+    :return:
+    """
+    title_names = []
+    keep_types = ["j", "m", "s"]
+    # get all titles of the anove types
+    for title_entry in raw_xml.find_all("title"):
+        if title_entry.has_attr("level") \
+                and title_entry["level"] in keep_types \
+                and title_entry.text != title_text:
+            title_names.append((title_entry["level"], title_entry.text))
+    # return the title name that most likely belongs to the journal or publication venue
+    if title_names:
+        title_names.sort(key=lambda x: keep_types.index(x[0]))
+        return title_names[0][1]
+    return ""
+def get_volume_from_grobid_xml(raw_xml: BeautifulSoup) -> str:
+    """
+    Returns the volume number of grobid bib entry
+    Grobid <biblscope unit="volume">
+    :return:
+    """
+    for bibl_entry in raw_xml.find_all("biblscope"):
+        if bibl_entry.has_attr("unit") and bibl_entry["unit"] == "volume":
+            return bibl_entry.text
+    return ""
+def get_issue_from_grobid_xml(raw_xml: BeautifulSoup) -> str:
+    """
+    Returns the issue number of grobid bib entry
+    Grobid <biblscope unit="issue">
+    :return:
+    """
+    for bibl_entry in raw_xml.find_all("biblscope"):
+        if bibl_entry.has_attr("unit") and bibl_entry["unit"] == "issue":
+            return bibl_entry.text
+    return ""
+def get_pages_from_grobid_xml(raw_xml: BeautifulSoup) -> str:
+    """
+    Returns the page numbers of grobid bib entry
+    Grobid <biblscope unit="page">
+    :return:
+    """
+    for bibl_entry in raw_xml.find_all("biblscope"):
+        if bibl_entry.has_attr("unit") and bibl_entry["unit"] == "page" and bibl_entry.has_attr("from"):
+            from_page = bibl_entry["from"]
+            if bibl_entry.has_attr("to"):
+                to_page = bibl_entry["to"]
+                return f'{from_page}--{to_page}'
+            else:
+                return from_page
+    return ""
+def get_other_ids_from_grobid_xml(raw_xml: BeautifulSoup) -> Dict[str, List]:
+    """
+    Returns a dictionary of other identifiers from grobid bib entry (arxiv, pubmed, doi)
+    :param raw_xml:
+    :return:
+    """
+    other_ids = defaultdict(list)
+    for idno_entry in raw_xml.find_all("idno"):
+        if idno_entry.has_attr("type") and idno_entry.text:
+            other_ids[idno_entry["type"]].append(idno_entry.text)
+    return other_ids
+def get_raw_bib_text_from_grobid_xml(raw_xml: BeautifulSoup) -> str:
+    """
+    Returns the raw bibiliography string
+    :param raw_xml:
+    :return:
+    """
+    for note in raw_xml.find_all("note"):
+        if note.has_attr("type") and note["type"] == "raw_reference":
+            return note.text
+    return ""
+def get_publication_datetime_from_grobid_xml(raw_xml: BeautifulSoup) -> str:
+    """
+    Finds and returns the publication datetime if it exists
+    :param raw_xml:
+    :return:
+    """
+    if raw_xml.publicationStmt:
+        for child in raw_xml.publicationstmt:
+            if child.name == "date" \
+                    and child.has_attr("type") \
+                    and child["type"] == "published" \
+                    and child.has_attr("when"):
+                return child["when"]
+    return ""
+def parse_bib_entry(bib_entry: BeautifulSoup) -> Dict:
+    """
+    Parse one bib entry
+    :param bib_entry:
+    :return:
+    """
+    clean_tags(bib_entry)
+    title = get_title_from_grobid_xml(bib_entry)
+    return {
+        'ref_id': bib_entry.attrs.get("xml:id", None),
+        'title': title,
+        'authors': get_author_names_from_grobid_xml(bib_entry),
+        'year': get_year_from_grobid_xml(bib_entry),
+        'venue': get_venue_from_grobid_xml(bib_entry, title),
+        'volume': get_volume_from_grobid_xml(bib_entry),
+        'issue': get_issue_from_grobid_xml(bib_entry),
+        'pages': get_pages_from_grobid_xml(bib_entry),
+        'other_ids': get_other_ids_from_grobid_xml(bib_entry),
+        'raw_text': get_raw_bib_text_from_grobid_xml(bib_entry),
+        'urls': []
+    }
+def is_reference_tag(tag: bs4.element.Tag) -> bool:
+    return tag.name == "ref" and tag.attrs.get("type", "") == "bibr"
+def extract_paper_metadata_from_grobid_xml(tag: bs4.element.Tag) -> Dict:
+    """
+    Extract paper metadata (title, authors, affiliation, year) from grobid xml
+    :param tag:
+    :return:
+    """
+    clean_tags(tag)
+    paper_metadata = {
+        "title": tag.titlestmt.title.text,
+        "authors": get_author_data_from_grobid_xml(tag),
+        "year": get_publication_datetime_from_grobid_xml(tag)
+    }
+    return paper_metadata

s2orc-doc2json/doc2json/utils/latex_util.py ADDED Viewed

	@@ -0,0 +1,204 @@

+"""
+Many of the REGEX expressions and pipeline in this set of utilities are borrowed or extended from
+the unarXive project: https://github.com/IllDepence/unarXive
+Modifications have been made to better identify the primary latex file and expand all other latex
+files into the main file. Latexpand and tralics options have also been changed.
+"""
+import chardet
+import magic
+import os
+import re
+import glob
+import subprocess
+import tempfile
+MAIN_TEX_PATT = re.compile(r'(\\begin\s*\{\s*document\s*\})', re.I)
+# ^ with capturing parentheses so that the pattern can be used for splitting
+PDF_EXT_PATT = re.compile(r'^\.pdf$', re.I)
+GZ_EXT_PATT = re.compile(r'^\.gz$', re.I)
+TEX_EXT_PATT = re.compile(r'^\.tex$', re.I)
+NON_TEXT_PATT = re.compile(r'^\.(pdf|eps|jpg|png|gif)$', re.I)
+BBL_SIGN = '\\bibitem'
+# natbib fix
+PRE_FIX_NATBIB = True
+NATBIB_PATT = re.compile((r'\\cite(t|p|alt|alp|author|year|yearpar)\s*?\*?\s*?'
+                           '(\[[^\]]*?\]\s*?)*?\s*?\*?\s*?\{([^\}]+?)\}'),
+                         re.I)
+# bibitem option fix
+PRE_FIX_BIBOPT = True
+BIBOPT_PATT = re.compile(r'\\bibitem\s*?\[[^]]*?\]', re.I|re.M)
+# ↑ above two solve most tralics problems; except for mnras style bibitems
+# (https://ctan.org/pkg/mnras)
+# agressive math pre-removal
+PRE_FILTER_MATH = False
+FILTER_PATTS = []
+for env in ['equation', 'displaymath', 'array', 'eqnarray', 'align', 'gather',
+            'multline', 'flalign', 'alignat']:
+    s = r'\\begin\{{{0}[*]?\}}.+?\\end\{{{0}\}}'.format(env)
+    patt = re.compile(s, re.I | re.M | re.S)
+    FILTER_PATTS.append(patt)
+FILTER_PATTS.append(re.compile(r'\$\$.+?\$\$', re.S))
+FILTER_PATTS.append(re.compile(r'\$.+?\$', re.S))
+FILTER_PATTS.append(re.compile(r'\\\(.+?\\\)', re.S))
+FILTER_PATTS.append(re.compile(r'\\\[.+?\\\]', re.S))
+def read_file(path):
+    try:
+        with open(path) as f:
+            cntnt = f.read()
+    except UnicodeDecodeError:
+        blob = open(path, 'rb').read()
+        m = magic.Magic(mime_encoding=True)
+        encoding = m.from_buffer(blob)
+        try:
+            cntnt = blob.decode(encoding)
+        except (UnicodeDecodeError, LookupError) as e:
+            encoding = chardet.detect(blob)['encoding']
+            if encoding:
+                try:
+                    cntnt = blob.decode(encoding, errors='replace')
+                except:
+                    return ''
+            else:
+                return ''
+    return cntnt
+def remove_math(latex_str):
+    parts = re.split(MAIN_TEX_PATT, latex_str, maxsplit=1)
+    for patt in FILTER_PATTS:
+         parts[2] = re.sub(patt, '', parts[2])
+    return ''.join(parts)
+def normalize(path, out_dir, write_logs=True):
+    """
+    Normalize an arXiv file
+    Adapted from https://github.com/IllDepence/unarXive
+        with modifications
+    Identifies the primary *.tex file, the bibliography file,
+    and expands other tex files and the bibliography into the
+    main tex file
+    """
+    def log(msg):
+        if write_logs:
+            with open(os.path.join(out_dir, 'log.txt'), 'a') as f:
+                f.write('{}\n'.format(msg))
+    # break path
+    _, fn = os.path.split(path.strip('/'))
+    # identify main tex file
+    main_tex_path = None
+    ignored_names = []
+    # check .tex files first
+    for tfn in os.listdir(path):
+        if not TEX_EXT_PATT.match(os.path.splitext(tfn)[1]):
+            ignored_names.append(tfn)
+            continue
+        try:
+            cntnt = read_file(os.path.join(path, tfn))
+        except:
+            continue
+        if re.search(MAIN_TEX_PATT, cntnt) is not None:
+            main_tex_path = tfn
+    # try other files
+    if main_tex_path is None:
+        for tfn in ignored_names:
+            if NON_TEXT_PATT.match(os.path.splitext(tfn)[1]):
+                continue
+            try:
+                cntnt = read_file(os.path.join(path, tfn))
+                if re.search(MAIN_TEX_PATT, cntnt) is not None:
+                    main_tex_path = tfn
+            except:
+                continue
+    # give up
+    if main_tex_path is None:
+        log(('couldn\'t find main tex file in dump archive {}'
+             '').format(fn))
+    # flatten to single tex file and save
+    with tempfile.TemporaryDirectory() as tmp_dir_path:
+        temp_tex_fn = os.path.join(tmp_dir_path, f'{fn}.tex')
+        # find bbl file
+        main_tex_fn = os.path.join(path, main_tex_path)
+        bbl_files = glob.glob(os.path.join(path, '*.bbl'))
+        if bbl_files:
+            latexpand_args = ['latexpand',
+                              '--expand-bbl',
+                              os.path.split(bbl_files[0])[1],
+                              main_tex_path,
+                              '--output',
+                              temp_tex_fn]
+        else:
+            latexpand_args = ['latexpand',
+                              main_tex_path,
+                              '--output',
+                              temp_tex_fn]
+        # run latexpand
+        with open(os.path.join(out_dir, 'log_latexpand.txt'), 'a+') as err:
+            subprocess.run(latexpand_args, stderr=err, cwd=path)
+        # re-read and write to ensure utf-8 b/c latexpand doesn't
+        # behave
+        new_tex_fn = os.path.join(out_dir, f'{fn}.tex')
+        cntnt = read_file(temp_tex_fn)
+        if PRE_FIX_NATBIB:
+            cntnt = NATBIB_PATT.sub(r'\\cite{\3}', cntnt)
+        if PRE_FIX_BIBOPT:
+            cntnt = BIBOPT_PATT.sub(r'\\bibitem', cntnt)
+        if PRE_FILTER_MATH:
+            cntnt = remove_math(cntnt)
+        with open(new_tex_fn, mode='w', encoding='utf-8') as f:
+            f.write(cntnt)
+def latex_to_xml(tex_file: str, out_dir: str, out_file: str, err_file: str, log_file: str):
+    """
+    Convert expanded latex file to XML using tralics
+    :param tex_file:
+    :param out_dir:
+    :param out_file:
+    :param err_file:
+    :param log_file:
+    :return:
+    """
+    with open(os.devnull, 'w') as devnull, \
+            open(err_file, 'a+') as err_f, \
+            open(log_file, 'a+') as skip_f:
+        # run tralics
+        tralics_args = ['tralics',
+                        '-silent',
+                        '-noxmlerror',
+                        '-utf8',
+                        '-oe8',
+                        '-entnames=false',
+                        '-nomathml',
+                        f'-output_dir={out_dir}',
+                        tex_file]
+        try:
+            subprocess.run(tralics_args, stdout=devnull, stderr=err_f, timeout=5)
+        except subprocess.TimeoutExpired:
+            skip_f.write(f'{tex_file}\n')
+        # if no output, skip
+        if not os.path.exists(out_file):
+            skip_f.write(f'{tex_file}\n')
+    if os.path.exists(out_file):
+        return out_file

s2orc-doc2json/doc2json/utils/refspan_util.py ADDED Viewed

	@@ -0,0 +1,115 @@

+from typing import List, Tuple
+def replace_refspans(
+    spans_to_replace: List[Tuple[int, int, str, str]],
+    full_string: str,
+    pre_padding: str = "",
+    post_padding: str = "",
+    btwn_padding: str = ", "
+) -> str:
+    """
+    For each span within the full string, replace that span with new text
+    Spans are processed in order of their start index. A span that overlaps an earlier one is
+    dropped; the indices of later spans are shifted by the length change of each earlier
+    replacement.
+    Note: the passed list is sorted and its entries are rewritten in place.
+    :param spans_to_replace: list of tuples of form (start_ind, end_ind, span_text, new_substring)
+    :param full_string: string the span indices refer to
+    :param pre_padding: text put in front of the replacement of every span but the first
+    :param post_padding: text put behind the replacement of every span but the first
+    :param btwn_padding: text put in front of a replacement whose span starts exactly where
+        the previously processed span ends
+    :return: the string with all replacements applied
+    """
+    # assert all spans are equal to full_text span
+    assert all([full_string[start:end] == span for start, end, span, _ in spans_to_replace])
+    # assert none of the spans start with the same start ind
+    start_inds = [rep[0] for rep in spans_to_replace]
+    assert len(set(start_inds)) == len(start_inds)
+    # sort by start index
+    spans_to_replace.sort(key=lambda x: x[0])
+    # form strings for each span group
+    for i, entry in enumerate(spans_to_replace):
+        start, end, span, new_string = entry
+        # skip empties (entries zeroed out below because they overlapped an earlier span)
+        if end <= 0:
+            continue
+        # compute shift amount
+        # NOTE(review): the padding lengths are counted here although the padding is added to
+        # the *following* entries' strings, not to this entry's own -- confirm intended
+        shift_amount = len(new_string) - len(span) + len(pre_padding) + len(post_padding)
+        # shift remaining appropriately
+        for ind in range(i + 1, len(spans_to_replace)):
+            next_start, next_end, next_span, next_string = spans_to_replace[ind]
+            # skip empties
+            if next_end <= 0:
+                continue
+            # if overlap between ref span and current ref span, remove from replacement
+            if next_start < end:
+                next_start = 0
+                next_end = 0
+                next_string = ""
+            # if ref span abuts previous reference span
+            elif next_start == end:
+                next_start += shift_amount
+                next_end += shift_amount
+                next_string = btwn_padding + pre_padding + next_string + post_padding
+            # if ref span starts after, shift starts and ends
+            elif next_start > end:
+                next_start += shift_amount
+                next_end += shift_amount
+                next_string = pre_padding + next_string + post_padding
+            # save adjusted span
+            spans_to_replace[ind] = (next_start, next_end, next_span, next_string)
+    # drop the zeroed-out (overlapping) entries
+    spans_to_replace = [entry for entry in spans_to_replace if entry[1] > 0]
+    spans_to_replace.sort(key=lambda x: x[0])
+    # apply shifts in series
+    for start, end, span, new_string in spans_to_replace:
+        # the shifted indices must still point at the original span text
+        assert full_string[start:end] == span
+        full_string = full_string[:start] + new_string + full_string[end:]
+    return full_string
def sub_spans_and_update_indices(
    spans_to_replace: List[Tuple[int, int, str, str]],
    full_string: str
) -> Tuple[str, List]:
    """
    Replace all spans and recompute their indices in the resulting text.

    Each entry of the returned span list keeps the original token and surface
    text, with start and end indices moved to where the surface text sits
    after all earlier spans have been substituted. The input list is not
    modified.

    NOTE: overlapping spans are not checked for here. The returned indices
    are computed as if every span is substituted, while replace_refspans
    drops spans that overlap an earlier one and ignores spans whose end
    index is <= 0, so for such input the indices will not line up with the
    returned text.

    :param spans_to_replace: list of tuples of form (start_ind, end_ind, token, surface)
    :param full_string: text the span indices refer to
    :return: tuple of (new text, list of (new_start, new_end, token, surface)
        sorted by start index)
    :raises AssertionError: if a token differs from full_string[start_ind:end_ind],
        or if two spans share the same start index
    """
    # TODO: check no spans overlapping
    # TODO: check all spans well-formed
    # Explicit raises instead of `assert` so the checks still run under
    # `python -O`; AssertionError is kept so code that already handles the
    # original assertion failures is unaffected.
    if any(full_string[start:end] != token for start, end, token, _ in spans_to_replace):
        raise AssertionError("span text does not match full_string at the given indices")
    start_inds = [rep[0] for rep in spans_to_replace]
    if len(set(start_inds)) != len(start_inds):
        raise AssertionError("multiple spans share the same start index")

    # work on a sorted copy so the caller's list is left untouched
    ordered = sorted(spans_to_replace, key=lambda rep: rep[0])

    # walk the spans once, carrying the total length change of all earlier
    # substitutions as a running offset
    new_spans = []
    offset = 0
    for start, end, token, surface in ordered:
        new_start = start + offset
        new_spans.append((new_start, new_start + len(surface), token, surface))
        offset += len(surface) - (end - start)

    # generate new text; new_spans is already final at this point, so it does
    # not matter what the callee does with the local `ordered` list
    new_text = replace_refspans(ordered, full_string, btwn_padding="")
    return new_text, new_spans