Spaces:
Sleeping
Sleeping
from preprocessing_images import preprocessing_function | |
from datetime import datetime | |
from azure.storage.blob import BlobClient | |
from msrest.authentication import CognitiveServicesCredentials | |
#importing azure packages | |
from azure.cognitiveservices.vision.computervision import ComputerVisionClient | |
from azure.cognitiveservices.vision.computervision.models import OperationStatusCodes | |
#ocr extraction using azure computer vision API | |
def azure_ocr(pdf_url, computervision_client, poll_interval=1.0):
    """Run the Azure Computer Vision Read operation on a document and return its text.

    Args:
        pdf_url: URL of the document; passed unchanged to
            ``computervision_client.read``.
        computervision_client: Object exposing ``read`` and
            ``get_read_result`` (a ``ComputerVisionClient`` in this file).
        poll_interval: Seconds to wait between status polls while the
            operation is still ``notStarted`` or ``running``.

    Returns:
        The text of every recognised line joined with single spaces, or an
        empty string when the operation ends in any state other than
        ``succeeded``.

    Raises:
        Exception: Wraps any error raised while submitting the document or
            polling for its result; the original error is kept as the cause.
    """
    # Local import so the module's top-level import block stays untouched.
    import time

    try:
        read_response = computervision_client.read(pdf_url, raw=True)
        # The operation id is the last path segment of the Operation-Location header.
        operation_location = read_response.headers["Operation-Location"]
        operation_id = operation_location.split("/")[-1]

        # Poll until the service leaves its pending states. Pausing between
        # requests avoids a busy loop that floods the API with status calls.
        while True:
            read_result = computervision_client.get_read_result(operation_id)
            if read_result.status not in ['notStarted', 'running']:
                break
            time.sleep(poll_interval)

        # Any terminal state other than success yields no text.
        if read_result.status != OperationStatusCodes.succeeded:
            return ''

        return ' '.join(
            line.text
            for page in read_result.analyze_result.read_results
            for line in page.lines
        )
    except Exception as e:
        raise Exception(e) from e
def _drop_text_through_marker(text, markers=("150 word", "150 ward"), skip=10):
    """Cut *text* so it starts ``skip`` characters after the first marker found.

    Markers are tried in order; the first one present in *text* wins. When no
    marker is present, *text* is returned unchanged.
    """
    for marker in markers:
        position = text.find(marker)
        # str.find returns -1 for "not found"; 0 is a valid match at the start.
        if position != -1:
            return text[position + skip:]
    return text


def extract_text_from_url(test_pdf_url):
    """Preprocess a document, upload it to blob storage, OCR it and return the text.

    Steps, in order:
      1. ``preprocessing_function(test_pdf_url)`` is called.
      2. The local file ``answer_paper.pdf`` is uploaded to a blob whose name
         is ``test_clean_pdf`` plus the current timestamp.
         (Presumably the preprocessing step writes that file -- confirm.)
      3. The blob URL is sent through ``azure_ocr``.
      4. The OCR text is lower-cased, and everything up to 10 characters past
         the first "150 word" marker (or, failing that, "150 ward") is dropped.

    Relies on ``connection_string``, ``my_container``, ``endpoint`` and
    ``subscription_key`` being defined at module level; they are not visible
    in this section of the file.

    Args:
        test_pdf_url: Location of the input document, forwarded to
            ``preprocessing_function``.

    Returns:
        The lower-cased OCR text, trimmed past the marker when one is present.

    Raises:
        Exception: Wraps any error raised by a step above; the original error
            is kept as the cause.
    """
    try:
        preprocessing_function(test_pdf_url)

        # Timestamped name so successive uploads do not reuse one blob name
        # (resolution is one second).
        my_blob = 'test_clean_pdf' + datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
        blob = BlobClient.from_connection_string(
            conn_str=connection_string,
            container_name=my_container,
            blob_name=my_blob,
        )
        with open("answer_paper.pdf", "rb") as data:
            blob.upload_blob(data)

        computervision_client = ComputerVisionClient(
            endpoint, CognitiveServicesCredentials(subscription_key)
        )
        text = azure_ocr(blob.url, computervision_client).lower()

        # "150 ward" is accepted as a fallback spelling of the marker.
        return _drop_text_through_marker(text)
    except Exception as e:
        raise Exception(e) from e