# memsum-arxiv-summarizer / services /pdf_parsing_service.py
# nianlonggu
# init
# 02ae0bf
import argparse
import json
import requests
from datetime import datetime
from flask import Flask, jsonify, abort, make_response, request, Response
from flask_cors import CORS
import uuid
import os
import subprocess
import threading
import shutil
import hashlib
import base64
from normalization_utils import DocumentNormalizer
import time
import socket
from urllib.parse import urlparse
import re
# Make Flask application
# Module-level Flask app; the @app.route decorators in this file register
# their handlers on this object.
app = Flask(__name__)
# Wrap the app with flask-cors; no options are passed, so the extension's
# defaults apply.
CORS(app)
def bytes_to_base64_string(f_bytes):
    """Encode raw bytes as a Base64 text string.

    Args:
        f_bytes: Bytes-like object to encode.

    Returns:
        The Base64 encoding of ``f_bytes`` as an ASCII ``str``.
    """
    encoded = base64.b64encode(f_bytes)
    return str(encoded, 'ASCII')
def base64_string_to_bytes(base64_string):
    """Decode a Base64 string back into the raw bytes it represents.

    Args:
        base64_string: Base64-encoded data.

    Returns:
        The decoded ``bytes``.
    """
    raw_bytes = base64.b64decode(base64_string)
    return raw_bytes
def get_md5(file_bytes):
    """Return the MD5 digest of ``file_bytes`` as a hexadecimal string.

    Args:
        file_bytes: Bytes-like content to hash.

    Returns:
        The 32-character lowercase hex digest.
    """
    hasher = hashlib.md5()
    hasher.update(file_bytes)
    return hasher.hexdigest()
def adjust_cite_span(cite_span, cite_span_year_matcher):
    """Trim separator and bracket characters from both ends of a citation span.

    Leading and trailing characters from ``",; []()"`` are removed from the
    span text, and the ``start``/``end`` offsets are shifted by the number of
    characters removed on each side.  Before trimming, every match of
    ``cite_span_year_matcher`` is rewritten as ``Y<group 1>Y`` so that text
    matched by the pattern is not treated as strippable punctuation.

    Args:
        cite_span: Mapping with the keys ``"start"``, ``"end"``, ``"text"``
            and ``"ref_id"``; ``start`` and ``end`` must be convertible with
            ``int()``.
        cite_span_year_matcher: Compiled regular expression with one capture
            group, used to mask matches before trimming.

    Returns:
        A new dict with the adjusted ``start``/``end`` (as strings), the
        trimmed ``text`` and the original ``ref_id``.  If the span is
        malformed, or trimming would leave an empty or inverted range, the
        original ``cite_span`` object is returned unchanged.
    """
    non_cite_text_chars = ",; []()"
    try:
        start = int(cite_span["start"])
        end = int(cite_span["end"])
        orig_text = cite_span["text"]
        ref_id = cite_span["ref_id"]

        # Mask pattern matches so their delimiters survive the trimming below.
        text = cite_span_year_matcher.sub(r"Y\1Y", orig_text)

        # Both counts are taken on the full masked text, so a span made only
        # of strippable characters is counted from both sides.
        begin_offset = len(text) - len(text.lstrip(non_cite_text_chars))
        end_offset = len(text.rstrip(non_cite_text_chars)) - len(text)
        start += begin_offset
        end += end_offset

        # Explicit check instead of ``assert`` so it still runs under ``-O``.
        if start >= end:
            return cite_span

        return {
            "start": str(start),
            "end": str(end),
            "text": orig_text[begin_offset:len(text) + end_offset],
            "ref_id": ref_id,
        }
    except Exception:
        # Best effort: any malformed span is passed through untouched.
        return cite_span
def parse_pdf_base(pdf_bytes):
    """Parse a PDF by running doc2json's ``process_pdf.py`` in a subprocess.

    The bytes are written to ``pdf/pdf.pdf`` inside a uniquely named working
    directory (``root_dir_<uuid4>``, relative to the current working
    directory).  The script at
    ``PDF2JSON_HOME + "/doc2json/grobid2json/process_pdf.py"`` is then run
    with that file as input, and the first file found in the output directory
    is loaded as JSON.  The working directory is removed afterwards whether
    or not parsing succeeded.

    Args:
        pdf_bytes: Raw content of the PDF file.

    Returns:
        The parsed JSON document, or ``{}`` if any step fails (including
        ``PDF2JSON_HOME`` being unset or the script producing no output).
    """
    root_dir = "root_dir_" + str(uuid.uuid4())
    pdf_dir = root_dir + "/pdf/"
    temp_dir = root_dir + "/temp_dir/"
    output_dir = root_dir + "/output_dir/"

    parsed_data = {}
    try:
        for folder in (pdf_dir, temp_dir, output_dir):
            os.makedirs(folder, exist_ok=True)

        pdf_name = pdf_dir + "pdf.pdf"
        with open(pdf_name, "wb") as pdf_file:
            pdf_file.write(pdf_bytes)

        # The exit status is intentionally not checked: a failed run leaves
        # the output directory empty, which is handled below.
        subprocess.run([
            "python",
            PDF2JSON_HOME + "/doc2json/grobid2json/process_pdf.py",
            "-i", pdf_name,
            "-t", temp_dir,
            "-o", output_dir,
        ])
        print("PDF parsing done!")

        json_name = [output_dir + fname for fname in os.listdir(output_dir)][0]
        # Context manager so the result file is closed before the tree is removed.
        with open(json_name) as json_file:
            parsed_data = json.load(json_file)
    except Exception:
        # Best effort: any failure yields an empty result.
        parsed_data = {}
    finally:
        # Cleanup runs exactly once and can no longer discard a successfully
        # parsed result when removal fails.
        try:
            shutil.rmtree(root_dir)
        except OSError:
            print("warning: removing temporary folder failed!")
    return parsed_data
def convert_pdf_to_json(fbytes, count, conversion_results):
    """Parse one PDF and store the result in a shared container.

    Args:
        fbytes: Raw content of the PDF file.
        count: Key or index under which the result is stored.
        conversion_results: Mutable container supporting item assignment
            (e.g. a dict or a pre-sized list); updated in place.

    Returns:
        None.  ``conversion_results[count]`` is set to the parsed document,
        or to ``{}`` if parsing raised.
    """
    try:
        parsed_data = parse_pdf_base(fbytes)
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that interpreter
        # exit signals (KeyboardInterrupt, SystemExit) are not swallowed.
        parsed_data = {}
    conversion_results[count] = parsed_data
@app.route('/parse-pdf', methods=['POST'])
def parse_pdf():
    """Handle ``POST /parse-pdf``: parse the uploaded PDF and return it as JSON.

    The PDF is read from the multipart form field ``pdf``.

    Returns:
        A ``({"response": parsed_data}, 201)`` tuple.  ``parsed_data`` is
        ``{}`` when no file was uploaded or parsing failed; the status code
        is 201 in every case.
    """
    try:
        uploaded_pdf = request.files.get('pdf')
        if uploaded_pdf is None:
            # A missing upload gives the same empty payload as a failed parse.
            parsed_data = {}
        else:
            parsed_data = parse_pdf_base(uploaded_pdf.read())
    except Exception:
        # Best effort: never surface an error to the client.
        parsed_data = {}
    return {"response": parsed_data}, 201
@app.route('/parse-and-normalize-pdf', methods=['POST'])
def parse_and_normalize_pdf():
    """Handle ``POST /parse-and-normalize-pdf``.

    Reads the PDF from the multipart form field ``pdf``, parses it, passes
    the result through ``doc_normalizer.normalize`` and then cleans the
    citation spans of every sentence in
    ``parsed_data["Content"]["Abstract_Parsed"]`` and
    ``parsed_data["Content"]["Fullbody_Parsed"]`` with ``adjust_cite_span``.

    Returns:
        A ``({"response": parsed_data}, 201)`` tuple.  ``parsed_data`` is
        ``{}`` if any step raises (missing upload, normalization failure,
        unexpected document structure); the status code is 201 in every case.
    """
    try:
        pdf_bytes = request.files.get('pdf').read()
        parsed_data = parse_pdf_base(pdf_bytes)
        parsed_data = doc_normalizer.normalize(parsed_data)

        # Clean the citation marker text
        content = parsed_data["Content"]
        for sec in content["Abstract_Parsed"] + content["Fullbody_Parsed"]:
            for para in sec["section_text"]:
                for sen in para["paragraph_text"]:
                    sen["cite_spans"] = [
                        adjust_cite_span(cite_span, cite_span_year_matcher)
                        for cite_span in sen["cite_spans"]
                    ]
    except Exception:
        # Best effort: never surface an error to the client.
        parsed_data = {}
    return {"response": parsed_data}, 201
# Base path under which "/doc2json/grobid2json/process_pdf.py" is looked up;
# read from the environment, so it is None when the variable is unset.
PDF2JSON_HOME = os.getenv("PDF2JSON_HOME")
# Normalizer applied to parsed documents, constructed from the schema file path.
doc_normalizer = DocumentNormalizer( "./json_schema.json" )
# Matches a four-digit number wrapped in parentheses, e.g. "(2019)".
# Raw string: "\(" and "\d" are invalid escape sequences in a normal literal.
cite_span_year_matcher = re.compile( r"\((\d{4})\)" )
if __name__ == '__main__':
    # Command-line entry point: read the listening port and start the server.
    parser = argparse.ArgumentParser()
    parser.add_argument("-flask_port", default=8060, type=int)
    args = parser.parse_args()

    print("\n\nWaiting for requests...")

    # Not referenced anywhere else in the visible code; kept as-is.
    sem = threading.Semaphore()

    # NOTE(review): debug=True is combined with binding to all interfaces
    # ("0.0.0.0") -- confirm this is intended outside of development.
    app.run(
        host="0.0.0.0",
        port=args.flask_port,
        debug=True,
        threaded=True,
    )