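# NOTE: the text "Spaces: Sleeping / Sleeping" appeared here; it looks like
# status text captured from a hosting web page and is not part of the program.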
import argparse
import base64
import hashlib
import json
import os
import re
import shutil
import socket
import subprocess
import threading
import time
import uuid
from datetime import datetime
from urllib.parse import urlparse

import requests
from flask import Flask, jsonify, abort, make_response, request, Response
from flask_cors import CORS

from normalization_utils import DocumentNormalizer
# Create the Flask application and wrap it with flask-cors using its
# default settings.
app = Flask(__name__)
CORS(app)
def bytes_to_base64_string(f_bytes: bytes) -> str:
    """Encode raw bytes as a Base64 text string.

    Args:
        f_bytes: Bytes-like object to encode.

    Returns:
        The Base64 encoding of ``f_bytes`` decoded to an ASCII ``str``.
    """
    return base64.b64encode(f_bytes).decode('ASCII')
def base64_string_to_bytes(base64_string: str) -> bytes:
    """Decode a Base64 string back into raw bytes.

    Args:
        base64_string: Base64-encoded text (``str`` or bytes-like).

    Returns:
        The decoded bytes.

    Raises:
        binascii.Error: If the input is incorrectly padded.
    """
    return base64.b64decode(base64_string)
def get_md5(file_bytes: bytes) -> str:
    """Return the MD5 digest of ``file_bytes`` as a lowercase hex string.

    MD5 is not collision resistant, so the digest is only suitable as a
    content fingerprint, never as a security control.

    Args:
        file_bytes: Bytes-like object to hash.

    Returns:
        The 32-character hexadecimal digest.
    """
    return hashlib.md5(file_bytes).hexdigest()
def adjust_cite_span(cite_span, cite_span_year_matcher):
    """Trim punctuation around a citation marker and shift its offsets.

    Leading and trailing characters from the set ``",; []()"`` are removed
    from the marker text, and ``start``/``end`` are moved inward by the number
    of characters removed on each side.  Before trimming, every match of
    ``cite_span_year_matcher`` is rewritten as ``"Y" + group 1 + "Y"`` so the
    brackets belonging to a matched year are not stripped.

    Args:
        cite_span: Mapping with the keys ``"start"``, ``"end"``, ``"text"``
            and ``"ref_id"``; ``start`` and ``end`` must be accepted by
            ``int()``.
        cite_span_year_matcher: Compiled regular expression with one
            capturing group.  The text slice assumes the substitution keeps
            the text length unchanged (true for a bracketed year such as
            ``(2020)``).

    Returns:
        A new dict with ``start``/``end`` as strings, the trimmed ``text`` and
        the original ``ref_id``.  ``cite_span`` itself is returned unchanged
        when it cannot be processed or when trimming would leave an empty or
        inverted span.
    """
    non_cite_text_chars = ",; []()"
    try:
        start = int(cite_span["start"])
        end = int(cite_span["end"])
        orig_text = cite_span["text"]
        masked = cite_span_year_matcher.sub(r"Y\1Y", orig_text)

        # Both counts are measured on the full masked text, independently.
        leading = len(masked) - len(masked.lstrip(non_cite_text_chars))
        trailing = len(masked) - len(masked.rstrip(non_cite_text_chars))
        start += leading
        end -= trailing

        # Explicit check instead of ``assert`` so it still runs under -O.
        if start >= end:
            return cite_span

        return {
            "start": str(start),
            "end": str(end),
            "text": orig_text[leading:len(masked) - trailing],
            "ref_id": cite_span["ref_id"],
        }
    except Exception:
        # Deliberate best effort: a malformed span is passed through as-is.
        return cite_span
def parse_pdf_base(pdf_bytes):
    """Convert PDF bytes to parsed JSON by running the external converter.

    The bytes are written to a uniquely named scratch folder under the current
    working directory, ``doc2json/grobid2json/process_pdf.py`` (located under
    ``PDF2JSON_HOME``) is run on the file, and the first JSON file found in
    the output folder is loaded.  The scratch folder is always removed
    afterwards.

    Args:
        pdf_bytes: Raw content of the PDF file.

    Returns:
        The object loaded from the converter's JSON output, or ``{}`` if any
        step fails (for example when the converter produced no output).
    """
    root_dir = "root_dir_" + str(uuid.uuid4())
    pdf_dir = root_dir + "/pdf/"
    temp_dir = root_dir + "/temp_dir/"
    output_dir = root_dir + "/output_dir/"

    try:
        for folder in (pdf_dir, temp_dir, output_dir):
            os.makedirs(folder, exist_ok=True)

        pdf_name = pdf_dir + "pdf.pdf"
        with open(pdf_name, "wb") as f:
            f.write(pdf_bytes)

        # The exit status is not checked: a failed conversion leaves the
        # output folder empty, which is handled below like any other failure.
        subprocess.run([
            "python",
            PDF2JSON_HOME + "/doc2json/grobid2json/process_pdf.py",
            "-i", pdf_name,
            "-t", temp_dir,
            "-o", output_dir,
        ])
        print("PDF parsing done!")

        json_name = output_dir + os.listdir(output_dir)[0]
        with open(json_name) as f:
            parsed_data = json.load(f)
    except Exception:
        # Deliberate best effort: callers receive an empty result on failure.
        parsed_data = {}
    finally:
        # Cleanup runs outside the parsing ``try`` so that a removal failure
        # cannot discard a document that was parsed successfully.
        try:
            shutil.rmtree(root_dir)
        except OSError:
            print("warning: removing temporary folder failed!")
    return parsed_data
def convert_pdf_to_json(fbytes, count, conversion_results):
    """Parse one PDF and store the result in a caller-supplied container.

    Nothing is returned; the result is written to
    ``conversion_results[count]``.
    NOTE(review): this shape suggests use as a worker/thread target, but no
    caller is visible here -- confirm.

    Args:
        fbytes: Raw content of the PDF file.
        count: Key or index under which the result is stored.
        conversion_results: Mutable mapping or pre-sized list that receives
            the parsed document, or ``{}`` if parsing fails.
    """
    try:
        parsed_data = parse_pdf_base(fbytes)
    except Exception:
        # Best effort: a failed conversion is recorded as an empty result.
        parsed_data = {}
    conversion_results[count] = parsed_data
def parse_pdf():
    """Parse the PDF uploaded in the ``pdf`` file field of the current request.

    NOTE(review): no route decorator is visible on this function -- confirm
    how it is registered with the Flask app.

    Returns:
        A ``(body, status)`` tuple.  ``body`` is a dict whose ``"response"``
        key holds the parsed document; ``status`` is always 201.  If the
        upload is missing or parsing fails, ``"response"`` is ``{}``.
    """
    try:
        # ``files.get`` yields None when no "pdf" part was sent; the resulting
        # AttributeError is handled below like any other failure.
        pdf_bytes = request.files.get('pdf').read()
        parsed_data = parse_pdf_base(pdf_bytes)
    except Exception:
        parsed_data = {}
    return {"response": parsed_data}, 201
def parse_and_normalize_pdf():
    """Parse the uploaded PDF, normalize it and clean its citation markers.

    The file in the request's ``pdf`` field is parsed, passed through
    ``doc_normalizer.normalize`` and every sentence's ``cite_spans`` entry in
    the abstract and full body is rewritten with ``adjust_cite_span``.

    NOTE(review): no route decorator is visible on this function -- confirm
    how it is registered with the Flask app.

    Returns:
        A ``(body, status)`` tuple.  ``body`` is a dict whose ``"response"``
        key holds the normalized document; ``status`` is always 201.  If any
        step fails, ``"response"`` is ``{}``.
    """
    try:
        pdf_bytes = request.files.get('pdf').read()
        parsed_data = parse_pdf_base(pdf_bytes)
        parsed_data = doc_normalizer.normalize(parsed_data)

        # Clean the citation marker text.
        content = parsed_data["Content"]
        for sec in content["Abstract_Parsed"] + content["Fullbody_Parsed"]:
            for para in sec["section_text"]:
                for sen in para["paragraph_text"]:
                    sen["cite_spans"] = [
                        adjust_cite_span(cite_span, cite_span_year_matcher)
                        for cite_span in sen["cite_spans"]
                    ]
    except Exception:
        # Best effort: any failure yields an empty document.
        parsed_data = {}
    return {"response": parsed_data}, 201
# Directory that contains doc2json/grobid2json/process_pdf.py, taken from the
# environment.  It is None when the variable is unset, in which case
# parse_pdf_base fails and returns an empty result.
PDF2JSON_HOME = os.getenv("PDF2JSON_HOME")

# Shared normalizer, constructed from the schema file in the working directory.
doc_normalizer = DocumentNormalizer("./json_schema.json")
# Matches a four-digit year in round brackets, e.g. "(2020)", capturing the
# digits.  Raw string: "\(" and "\d" are invalid escapes in a normal literal.
cite_span_year_matcher = re.compile(r"\((\d{4})\)")
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("-flask_port", type=int, default=8060)
    # Debug mode is opt-in: the server binds to every interface, and the
    # interactive debugger lets anyone who can reach it execute code.
    parser.add_argument("-debug", action="store_true")
    args = parser.parse_args()

    print("\n\nWaiting for requests...")
    # NOTE(review): this semaphore is not used anywhere in the visible code;
    # kept in case another part of the project refers to it.
    sem = threading.Semaphore()
    app.run(host='0.0.0.0', port=args.flask_port, threaded=True, debug=args.debug)