|
import sys, datetime, random, re, cv2 |
|
from os.path import dirname, realpath |
|
sys.path.append(dirname(realpath(__file__)) + "/../") |
|
from util.db_conn import Postgres |
|
from util.minio_conn import HuMinio |
|
from util import findMaxDt |
|
import base64 |
|
from io import BytesIO |
|
import pandas as pd |
|
from PIL import Image |
|
import pdfplumber |
|
|
|
|
|
# Shared Postgres client; set_thumbnail() and collect() send their SQL through it.
# NOTE(review): the meaning of the two constructor arguments is defined in
# util.db_conn and is not visible here - confirm before changing them.
PG = Postgres("infiniflow", "docgpt") |
|
# Shared MinIO client; build() uses it to download uploaded objects and to
# create presigned URLs for video files.
MINIO = HuMinio("infiniflow") |
|
def set_thumbnail(did, base64):
    """Write a thumbnail value into ``doc_info.thumbnail_base64`` for one document.

    Args:
        did: Document id used in the ``where did=...`` clause. It is coerced
            with ``int()`` so only a number can reach the SQL text.
        base64: Text to store. Callers in this file pass either the
            base64-encoded thumbnail or the marker ``"_"``. Note that inside
            this function the name shadows the imported ``base64`` module.

    Raises:
        ValueError / TypeError: If ``did`` cannot be converted to an integer.
    """
    # The statement is assembled as text, so double any single quote to keep
    # the value from terminating the SQL string literal early.
    encoded = str(base64).replace("'", "''")
    sql = f"""
    update doc_info set thumbnail_base64='{encoded}'
    where
    did={int(did)}
    """
    PG.update(sql)
|
|
|
|
|
def collect(comm, mod, tm):
    """Select the next batch of documents that still have no thumbnail.

    The query returns at most 10 rows of ``doc_info`` (oldest ``updated_at``
    first) that satisfy all of:

    * ``updated_at >= tm``
    * ``MOD(did, comm) = mod``
    * ``is_deleted`` is false and ``type`` is not ``'folder'``
    * ``thumbnail_base64`` is the empty string

    Args:
        comm: Modulus applied to ``did`` (coerced with ``int()``).
        mod: Remainder this caller is responsible for (coerced with ``int()``).
        tm: Lower bound for ``updated_at``, rendered with ``str()``.

    Returns:
        Whatever ``PG.select`` returns when rows were found, otherwise an
        empty ``pandas.DataFrame``.
    """
    # The query is assembled as text: force the numeric inputs to be numbers
    # and double single quotes in the timestamp so neither can alter the SQL.
    since = str(tm).replace("'", "''")
    sql = f"""
    select
    did, uid, doc_name, location, updated_at
    from doc_info
    where
    updated_at >= '{since}'
    and MOD(did, {int(comm)}) = {int(mod)}
    and is_deleted=false
    and type <> 'folder'
    and thumbnail_base64=''
    order by updated_at asc
    limit 10
    """
    docs = PG.select(sql)
    if len(docs) == 0:
        return pd.DataFrame()

    # Progress line: batch size and the newest timestamp in it, cut to 19
    # characters.
    newest = str(docs["updated_at"].max())[:19]
    print("TOTAL:", len(docs), "To: ", newest)
    return docs
|
|
|
|
|
# File-name suffixes (compared against the lower-cased, stripped doc_name)
# for which a thumbnail can be produced.
_PDF_EXT = (".pdf",)
_IMAGE_EXT = (".jpg", ".jpeg", ".png", ".gif", ".svg", ".apng", ".webp",
              ".icon", ".ico")
_VIDEO_EXT = (".mpg", ".mpeg", ".avi", ".rm", ".rmvb", ".mov", ".wmv", ".mp4")


def _encode_thumbnail(img, size=128):
    """Shrink a PIL image in place and return it as base64 text.

    The image is fitted into a box whose longer side is ``size`` pixels.
    JPEG encoding is tried first and PNG second; each attempt writes into its
    own buffer so a failed attempt cannot leave bytes in the result.

    Returns:
        The base64 string, or ``""`` when the image has no pixels or neither
        format could be written.
    """
    width, height = img.size
    longest = max(width, height)
    if longest <= 0:
        print("thumbnail skipped: empty image", img.size)
        return ""

    scale = size / longest
    # Clamp to 1 so a very elongated image never asks for a 0-pixel side.
    box = (max(1, int(width * scale)), max(1, int(height * scale)))
    img.thumbnail(box)

    for fmt in ("JPEG", "PNG"):
        buffered = BytesIO()
        try:
            img.save(buffered, format=fmt)
        except Exception as exc:
            # Best effort: report the failure and try the next format.
            print("thumbnail encode failed:", fmt, exc)
            continue
        return base64.b64encode(buffered.getvalue()).decode("utf-8")
    return ""


def build(row):
    """Create and store the thumbnail for one ``doc_info`` row.

    Behaviour by file-name suffix of ``row["doc_name"]``:

    * PDF: the first page is rendered and used.
    * image: the file is opened with PIL and used.
    * video: a random number (1-11) of leading frames is skipped and the next
      frame is used; nothing is stored if the stream cannot be opened or
      ends before that frame.
    * anything else: the marker ``"_"`` is stored instead of a thumbnail.

    Args:
        row: Mapping with the keys ``did``, ``uid``, ``doc_name`` and
            ``location``.

    Raises:
        Errors from MinIO, pdfplumber, PIL or OpenCV while fetching or
        decoding the file are not caught here.
    """
    name = row["doc_name"].lower().strip()
    did = row["did"]
    bucket = "%s-upload" % str(row["uid"])

    if name.endswith(_PDF_EXT):
        # Close the PDF handle once the page image has been encoded.
        with pdfplumber.open(BytesIO(MINIO.get(bucket, row["location"]))) as pdf:
            encoded = _encode_thumbnail(pdf.pages[0].to_image().annotated)
        set_thumbnail(did, encoded)

    elif name.endswith(_IMAGE_EXT):
        img = Image.open(BytesIO(MINIO.get(bucket, row["location"])))
        set_thumbnail(did, _encode_thumbnail(img))

    elif name.endswith(_VIDEO_EXT):
        # Videos are read through a short-lived presigned URL, so the object
        # is not downloaded into memory for this branch.
        url = MINIO.get_presigned_url(bucket,
                                      row["location"],
                                      expires=datetime.timedelta(seconds=60))
        cap = cv2.VideoCapture(url)
        try:
            opened = cap.isOpened()
            skip = random.randint(1, 11)
            while opened:
                ret, frame = cap.read()
                if not ret:
                    break
                if skip > 0:
                    skip -= 1
                    continue
                img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                print(img.size)
                set_thumbnail(did, _encode_thumbnail(img))
                # One frame is enough; without this every remaining frame
                # would be decoded and written to the database.
                break
        finally:
            # No GUI windows are created here, so only the capture handle
            # needs to be released.
            cap.release()

    else:
        set_thumbnail(did, "_")
|
|
|
|
|
def main(comm, mod):
    """Process one batch of documents for the shard ``did % comm == mod``.

    The lower time bound comes from ``findMaxDt`` applied to the checkpoint
    file ``res/thumbnail-{comm}-{mod}.tm`` (presumably the latest timestamp
    recorded there - confirm in ``util``). After each document is built its
    ``updated_at`` is appended to that file.

    Args:
        comm: Total number of shards; passed to ``collect`` as the modulus.
        mod: Index of this shard; passed to ``collect`` as the remainder.
    """
    tm_fnm = f"res/thumbnail-{comm}-{mod}.tm"
    tm = findMaxDt(tm_fnm)
    rows = collect(comm, mod, tm)
    if len(rows) == 0:
        return

    # The context manager closes (and therefore flushes) the checkpoint file
    # even when build() raises part-way through the batch.
    with open(tm_fnm, "a+") as tmf:
        for _, r in rows.iterrows():
            build(r)
            tmf.write(str(r["updated_at"]) + "\n")
|
|
|
|
|
if __name__ == "__main__":
    # mpi4py is imported only when the file is executed as a script.
    from mpi4py import MPI

    # One shard per MPI process: the world size is the modulus and this
    # process's rank is the remainder it handles.
    world = MPI.COMM_WORLD
    shard_count = world.Get_size()
    shard_index = world.Get_rank()
    main(shard_count, shard_index)
|
|
|
|