"""Flask service that converts uploaded PDFs to JSON via an external doc2json script.

Endpoints:
    POST /parse-pdf                 -- raw parsed JSON of the uploaded PDF.
    POST /parse-and-normalize-pdf   -- parsed JSON passed through the
                                       DocumentNormalizer, with citation spans
                                       trimmed of surrounding punctuation.
"""

import argparse
import base64
import hashlib
import json
import os
import re
import shutil
import socket
import subprocess
import threading
import time
import uuid
from datetime import datetime
from urllib.parse import urlparse

import requests
from flask import Flask, jsonify, abort, make_response, request, Response
from flask_cors import CORS

from normalization_utils import DocumentNormalizer

# Make Flask application
app = Flask(__name__)
CORS(app)


def bytes_to_base64_string(f_bytes):
    """Return the base64 encoding of *f_bytes* as an ASCII string."""
    return base64.b64encode(f_bytes).decode('ASCII')


def base64_string_to_bytes(base64_string):
    """Decode a base64 string back into bytes."""
    return base64.b64decode(base64_string)


def get_md5(file_bytes):
    """Return the hexadecimal MD5 digest of *file_bytes*."""
    return hashlib.md5(file_bytes).hexdigest()


def adjust_cite_span(cite_span, cite_span_year_matcher):
    """Trim punctuation/brackets surrounding a citation span.

    Leading and trailing characters from ``",; []()"`` are removed from the
    span text, and ``start``/``end`` are shifted accordingly. Before trimming,
    every match of *cite_span_year_matcher* is rewritten as ``Y<group 1>Y`` so
    that, with the module-level pattern, the parentheses of a year such as
    ``(2020)`` are not trimmed away.

    Args:
        cite_span: dict with keys ``"start"``, ``"end"``, ``"text"`` and
            ``"ref_id"``; ``start``/``end`` must be convertible with ``int``.
        cite_span_year_matcher: compiled regular expression with one group.

    Returns:
        A new dict with the adjusted ``start``/``end`` (as strings), trimmed
        ``text`` and the original ``ref_id``. If the span is malformed or the
        adjustment would leave an empty/inverted span, *cite_span* itself is
        returned unchanged.
    """
    non_cite_text_chars = ",; []()"
    try:
        orig_text = cite_span["text"]
        text = cite_span_year_matcher.sub(r"Y\1Y", orig_text)

        # Number of trimmable characters at each end of the protected text.
        leading = len(text) - len(text.lstrip(non_cite_text_chars))
        trailing = len(text) - len(text.rstrip(non_cite_text_chars))

        start = int(cite_span["start"]) + leading
        end = int(cite_span["end"]) - trailing

        # Explicit check instead of ``assert`` so the guard survives ``-O``.
        if start >= end:
            return cite_span

        return {
            "start": str(start),
            "end": str(end),
            "text": orig_text[leading:len(text) - trailing],
            "ref_id": cite_span["ref_id"],
        }
    except Exception:
        # Best effort: any malformed span is passed through untouched.
        return cite_span


def parse_pdf_base(pdf_bytes):
    """Parse raw PDF bytes into a JSON-compatible dict.

    The bytes are written into a freshly created, uniquely named working
    directory, the external ``process_pdf.py`` script located under
    ``PDF2JSON_HOME`` is run on it, and the first file found in the output
    directory is loaded as JSON. The working directory is removed afterwards
    regardless of the outcome.

    Args:
        pdf_bytes: content of the PDF file.

    Returns:
        The parsed JSON data, or ``{}`` if any step fails.
    """
    root_dir = "root_dir_" + str(uuid.uuid4())
    pdf_dir = root_dir + "/pdf/"
    temp_dir = root_dir + "/temp_dir/"
    output_dir = root_dir + "/output_dir/"

    parsed_data = {}
    try:
        # exist_ok avoids aborting on the first pre-existing directory and
        # leaving the remaining ones uncreated.
        for directory in (pdf_dir, temp_dir, output_dir):
            os.makedirs(directory, exist_ok=True)

        pdf_name = pdf_dir + "pdf.pdf"
        with open(pdf_name, "wb") as f:
            f.write(pdf_bytes)

        subprocess.run(
            [
                "python",
                PDF2JSON_HOME + "/doc2json/grobid2json/process_pdf.py",
                "-i", pdf_name,
                "-t", temp_dir,
                "-o", output_dir,
            ]
        )
        print("PDF parsing done!")

        json_name = [output_dir + fname for fname in os.listdir(output_dir)][0]
        with open(json_name) as f:
            parsed_data = json.load(f)
    except Exception:
        parsed_data = {}
    finally:
        # Cleanup is kept outside the main ``try`` so that a failure to remove
        # the working directory never discards a successfully parsed result.
        try:
            shutil.rmtree(root_dir)
        except OSError:
            print("warning: removing temporary folder failed!")
    return parsed_data


def convert_pdf_to_json(fbytes, count, conversion_results):
    """Parse *fbytes* and store the result in ``conversion_results[count]``.

    ``{}`` is stored if parsing raises.
    """
    try:
        parsed_data = parse_pdf_base(fbytes)
    except Exception:
        parsed_data = {}
    conversion_results[count] = parsed_data


@app.route('/parse-pdf', methods=['POST'])
def parse_pdf():
    """Parse the uploaded ``pdf`` file field and return the raw parsed JSON.

    Always responds with status 201; the ``response`` value is ``{}`` when the
    upload is missing or parsing fails.
    """
    try:
        pdf_bytes = request.files.get('pdf').read()
        parsed_data = parse_pdf_base(pdf_bytes)
    except Exception:
        parsed_data = {}
    return {"response": parsed_data}, 201


@app.route('/parse-and-normalize-pdf', methods=['POST'])
def parse_and_normalize_pdf():
    """Parse the uploaded ``pdf`` file field, normalize it and clean cite spans.

    Always responds with status 201; the ``response`` value is ``{}`` when the
    upload is missing or any processing step fails.
    """
    try:
        pdf_bytes = request.files.get('pdf').read()
        parsed_data = parse_pdf_base(pdf_bytes)
        parsed_data = doc_normalizer.normalize(parsed_data)

        # Clean the citation marker text
        content = parsed_data["Content"]
        for sec in content["Abstract_Parsed"] + content["Fullbody_Parsed"]:
            for para in sec["section_text"]:
                for sen in para["paragraph_text"]:
                    sen["cite_spans"] = [
                        adjust_cite_span(cite_span, cite_span_year_matcher)
                        for cite_span in sen["cite_spans"]
                    ]
    except Exception:
        parsed_data = {}
    return {"response": parsed_data}, 201


PDF2JSON_HOME = os.getenv("PDF2JSON_HOME")
doc_normalizer = DocumentNormalizer("./json_schema.json")
# Raw string: same pattern as before, without invalid-escape warnings.
cite_span_year_matcher = re.compile(r"\((\d{4})\)")


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("-flask_port", type=int, default=8060)
    args = parser.parse_args()

    print("\n\nWaiting for requests...")

    # NOTE(review): this semaphore is not used by any code visible in this file.
    sem = threading.Semaphore()
    # NOTE(review): debug=True enables the interactive Werkzeug debugger while
    # listening on all interfaces; confirm this is never used in production.
    app.run(host='0.0.0.0', port=args.flask_port, threaded=True, debug=True)