Spaces:

intuitive262
/

Doc_Reader

Sleeping

intuitive262 committed on Sep 30, 2024

Commit

92873c0

1 Parent(s): c290ebd

Update code files

Files changed (1) hide show

app.py CHANGED Viewed

@@ -37,32 +37,33 @@ def extract_text(image, query):
         generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
         return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-def post_process_text(text):
-    # Split the text into lines
-    lines = text.split('. ')
-    processed_lines = []
-    for line in lines:
-        # Separate Hindi and English text
-        parts = re.split(r'([^\u0900-\u097F\s]+:)', line, 1)
-        if len(parts) > 1:
-            processed_lines.append(f"{parts[0]}{parts[1]}\n    {parts[2]}")
-        else:
-            processed_lines.append(line)
-    # Join the lines with double line breaks
-    text = '\n\n'.join(processed_lines)
-    # Remove repeated phrases
-    unique_phrases = list(dict.fromkeys(text.split('\n\n')))
-    text = '\n\n'.join(unique_phrases)
-    return text
 def ocr(image):
     queries = [
         # "Extract and transcribe all the text visible in the image, including any small or partially visible text.",
-        "Look closely at the image and list any text you see, no matter how small or unclear.",
         # "What text can you identify in this image? Include everything, even if it's partially obscured or in the background."
     ]
     all_extracted_text = []

         generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
         return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+# def post_process_text(text):
+#     # Split the text into lines
+#     lines = text.split('. ')
+#     processed_lines = []
+#     for line in lines:
+#         # Separate Hindi and English text
+#         parts = re.split(r'([^\u0900-\u097F\s]+:)', line, 1)
+#         if len(parts) > 1:
+#             processed_lines.append(f"{parts[0]}{parts[1]}\n    {parts[2]}")
+#         else:
+#             processed_lines.append(line)
+#     # Join the lines with double line breaks
+#     text = '\n\n'.join(processed_lines)
+#     # Remove repeated phrases
+#     unique_phrases = list(dict.fromkeys(text.split('\n\n')))
+#     text = '\n\n'.join(unique_phrases)
+#     return text
 def ocr(image):
     queries = [
         # "Extract and transcribe all the text visible in the image, including any small or partially visible text.",
+        # "Look closely at the image and list any text you see, no matter how small or unclear.",
         # "What text can you identify in this image? Include everything, even if it's partially obscured or in the background."
+        "Extract all the text in Sanskrit and English from the image."
     ]
     all_extracted_text = []