juanpablomesa committed on
Commit
42ed865
1 Parent(s): af56925

Added easyocr for text extraction

Browse files
Files changed (2) hide show
  1. handler.py +8 -0
  2. requirements.txt +3 -1
handler.py CHANGED
@@ -18,6 +18,7 @@ from huggingface_hub import logging
18
  from concurrent.futures import ThreadPoolExecutor, as_completed
19
 
20
  import timeit
 
21
 
22
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
23
 
@@ -47,6 +48,7 @@ class EndpointHandler:
47
 
48
  logging.set_verbosity_debug()
49
  self.logger = logging.get_logger(__name__)
 
50
 
51
  def download_image(self, url: str) -> bytes:
52
  """
@@ -183,6 +185,12 @@ class EndpointHandler:
183
  "source_type": "images",
184
  **image_metadata,
185
  }
 
 
 
 
 
 
186
  processed_metadata.append(complete_image_metadata)
187
 
188
  except Exception as e:
 
18
  from concurrent.futures import ThreadPoolExecutor, as_completed
19
 
20
  import timeit
21
+ import easyocr
22
 
23
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
24
 
 
48
 
49
  logging.set_verbosity_debug()
50
  self.logger = logging.get_logger(__name__)
51
+ self.reader = easyocr.Reader(["de", "en"])
52
 
53
  def download_image(self, url: str) -> bytes:
54
  """
 
185
  "source_type": "images",
186
  **image_metadata,
187
  }
188
+ # Extract text from image using easyocr
189
+ extracted_text = self.reader.readtext(
190
+ np.array(image), detail=0
191
+ )
192
+ complete_image_metadata["extracted_text"] = extracted_text
193
+
194
  processed_metadata.append(complete_image_metadata)
195
 
196
  except Exception as e:
requirements.txt CHANGED
@@ -21,4 +21,6 @@ tokenizers==0.13.3
21
  tqdm==4.66.1
22
  transformers==4.27.2
23
  typing_extensions==4.8.0
24
- urllib3==2.0.7
 
 
 
21
  tqdm==4.66.1
22
  transformers==4.27.2
23
  typing_extensions==4.8.0
24
+ urllib3==2.0.7
25
+ easyocr==1.7.1
26
+ opencv_python_headless==4.8.1