File size: 1,768 Bytes
af9251e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 |
"""Loader that loads image files."""
from typing import List
from langchain.document_loaders.unstructured import UnstructuredFileLoader
from paddleocr import PaddleOCR
import os
import nltk
from configs.model_config import NLTK_DATA_PATH
nltk.data.path = [NLTK_DATA_PATH] + nltk.data.path
class UnstructuredPaddleImageLoader(UnstructuredFileLoader):
"""Loader that uses unstructured to load image files, such as PNGs and JPGs."""
def _get_elements(self) -> List:
def image_ocr_txt(filepath, dir_path="tmp_files"):
full_dir_path = os.path.join(os.path.dirname(filepath), dir_path)
if not os.path.exists(full_dir_path):
os.makedirs(full_dir_path)
filename = os.path.split(filepath)[-1]
ocr = PaddleOCR(use_angle_cls=True, lang="ch", use_gpu=False, show_log=False)
result = ocr.ocr(img=filepath)
ocr_result = [i[1][0] for line in result for i in line]
txt_file_path = os.path.join(full_dir_path, "%s.txt" % (filename))
with open(txt_file_path, 'w', encoding='utf-8') as fout:
fout.write("\n".join(ocr_result))
return txt_file_path
txt_file_path = image_ocr_txt(self.file_path)
from unstructured.partition.text import partition_text
return partition_text(filename=txt_file_path, **self.unstructured_kwargs)
if __name__ == "__main__":
import sys
sys.path.append(os.path.dirname(os.path.dirname(__file__)))
filepath = os.path.join(os.path.dirname(os.path.dirname(__file__)), "knowledge_base", "samples", "content", "test.jpg")
loader = UnstructuredPaddleImageLoader(filepath, mode="elements")
docs = loader.load()
for doc in docs:
print(doc)
|