Spaces:
Sleeping
Sleeping
Commit
·
92873c0
1
Parent(s):
c290ebd
Update code files
Browse files
app.py
CHANGED
@@ -37,32 +37,33 @@ def extract_text(image, query):
|
|
37 |
generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
|
38 |
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
|
39 |
|
40 |
-
def post_process_text(text):
|
41 |
-
|
42 |
-
|
43 |
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
|
53 |
-
|
54 |
-
|
55 |
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
|
61 |
def ocr(image):
|
62 |
queries = [
|
63 |
# "Extract and transcribe all the text visible in the image, including any small or partially visible text.",
|
64 |
-
"Look closely at the image and list any text you see, no matter how small or unclear.",
|
65 |
# "What text can you identify in this image? Include everything, even if it's partially obscured or in the background."
|
|
|
66 |
]
|
67 |
|
68 |
all_extracted_text = []
|
|
|
37 |
generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
|
38 |
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
|
39 |
|
40 |
+
# def post_process_text(text):
|
41 |
+
# # Split the text into lines
|
42 |
+
# lines = text.split('. ')
|
43 |
|
44 |
+
# processed_lines = []
|
45 |
+
# for line in lines:
|
46 |
+
# # Separate Hindi and English text
|
47 |
+
# parts = re.split(r'([^\u0900-\u097F\s]+:)', line, 1)
|
48 |
+
# if len(parts) > 1:
|
49 |
+
# processed_lines.append(f"{parts[0]}{parts[1]}\n {parts[2]}")
|
50 |
+
# else:
|
51 |
+
# processed_lines.append(line)
|
52 |
|
53 |
+
# # Join the lines with double line breaks
|
54 |
+
# text = '\n\n'.join(processed_lines)
|
55 |
|
56 |
+
# # Remove repeated phrases
|
57 |
+
# unique_phrases = list(dict.fromkeys(text.split('\n\n')))
|
58 |
+
# text = '\n\n'.join(unique_phrases)
|
59 |
+
# return text
|
60 |
|
61 |
def ocr(image):
|
62 |
queries = [
|
63 |
# "Extract and transcribe all the text visible in the image, including any small or partially visible text.",
|
64 |
+
# "Look closely at the image and list any text you see, no matter how small or unclear.",
|
65 |
# "What text can you identify in this image? Include everything, even if it's partially obscured or in the background."
|
66 |
+
"Extract all the text in Sanskrit and English from the image."
|
67 |
]
|
68 |
|
69 |
all_extracted_text = []
|