# NOTE: "Spaces: Runtime error" appears to be a status banner captured from the hosting page; it is not part of the module.
import json | |
import os | |
# from sklearn.externals import joblib | |
import joblib | |
import numpy as np | |
import pandas as pd | |
# from .variables import old_ocr_req_cols | |
# from .skew_correction import PageSkewWraper | |
# NOTE(review): 1.294117647 matches 11 / 8.5 to nine decimals, so const_HW is
# presumably a page height-to-width ratio — confirm against its users.
const_HW = 1.294117647
# NOTE(review): presumably a reference width (pixels?) paired with const_HW —
# confirm against its users.
const_W = 600
def bucket_sort(df, colmn, ymax_col="ymax", ymin_col="ymin"):
    """Assign a 1-based ``line_number`` to each row by vertical overlap.

    Rows are scanned in their existing order (``do_sorting`` sorts by
    ``ymin`` before calling). The first row of a line fixes that line's
    reference ``ymax``; every following row whose ``ymin`` is strictly below
    that reference joins the same line. The first row that does not overlap
    starts the next line and supplies the new reference.

    Args:
        df: Frame containing at least the columns named in ``colmn``.
            It is not modified.
        colmn: Column names to carry into the result; must include
            ``ymax_col`` and ``ymin_col``. The list is not modified.
        ymax_col: Name of the column holding each box's bottom edge.
        ymin_col: Name of the column holding each box's top edge.

    Returns:
        A new DataFrame with the ``colmn`` columns plus ``line_number``,
        built from a single NumPy array (so all columns share that array's
        dtype) and carrying a fresh default index. Row order is preserved.

    Raises:
        ValueError: If ``ymax_col`` or ``ymin_col`` is not listed in ``colmn``.
    """
    # Copy the column list so repeated calls with the same list do not keep
    # accumulating "line_number" entries on the caller's side.
    columns = list(colmn)
    if "line_number" not in columns:
        columns.append("line_number")

    ymax_pos = columns.index(ymax_col)
    ymin_pos = columns.index(ymin_col)
    line_pos = columns.index("line_number")

    # assign() leaves the caller's frame alone; an explicit copy guarantees a
    # writable array even when pandas hands back read-only views.
    values = df.assign(line_number=0)[columns].to_numpy(copy=True)

    line_counter = 0
    current_ymax = None
    for row in values:
        # A row that does not overlap the current reference opens a new line.
        # The opening row always belongs to its own line, so degenerate boxes
        # (ymax <= ymin, or NaN edges) can no longer stall the scan.
        if current_ymax is None or not (current_ymax > row[ymin_pos]):
            line_counter += 1
            current_ymax = row[ymax_pos]
        row[line_pos] = line_counter

    return pd.DataFrame(values, columns=columns)
def do_sorting(df):
    """Return the boxes in ``df`` ordered line by line, left to right.

    The frame is sorted by ``ymin`` then ``xmin``, rows are grouped into
    lines with ``bucket_sort``, and the result is re-sorted by
    ``line_number`` then ``xmin``.

    Args:
        df: Frame with ``xmin``, ``ymin``, ``xmax`` and ``ymax`` columns.
            An existing ``line_number`` column is discarded and recomputed.
            The frame itself is not modified.

    Returns:
        A new DataFrame with a default index, an ``idx`` column holding each
        row's original index label, and a ``line_number`` column.
    """
    # sort_values without inplace returns a new frame, so the caller's
    # DataFrame keeps its order and columns.
    ordered = df.sort_values(["ymin", "xmin"], ascending=True)
    ordered["idx"] = ordered.index
    if "line_number" in ordered.columns:
        print("line number removed")
        ordered = ordered.drop("line_number", axis=1)

    req_colns = ["xmin", "ymin", "xmax", "ymax", "idx"]
    # bucket_sort may add a column to the frame it receives, so pass a copy.
    lines = bucket_sort(ordered.copy(), req_colns)

    # bucket_sort returns rows in the order it received them, so the line
    # numbers are attached by position. Merging on "idx" would multiply rows
    # whenever the original index contains duplicate labels.
    ordered["line_number"] = lines["line_number"].to_numpy()

    ordered = ordered.sort_values(["line_number", "xmin"], ascending=True)
    return ordered.reset_index(drop=True)