intuitive262 committed on
Commit
92873c0
·
1 Parent(s): c290ebd

Update code files

Browse files
Files changed (1) hide show
  1. app.py +19 -18
app.py CHANGED
@@ -37,32 +37,33 @@ def extract_text(image, query):
37
  generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
38
  return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
39
 
40
- def post_process_text(text):
41
- # Split the text into lines
42
- lines = text.split('. ')
43
 
44
- processed_lines = []
45
- for line in lines:
46
- # Separate Hindi and English text
47
- parts = re.split(r'([^\u0900-\u097F\s]+:)', line, 1)
48
- if len(parts) > 1:
49
- processed_lines.append(f"{parts[0]}{parts[1]}\n {parts[2]}")
50
- else:
51
- processed_lines.append(line)
52
 
53
- # Join the lines with double line breaks
54
- text = '\n\n'.join(processed_lines)
55
 
56
- # Remove repeated phrases
57
- unique_phrases = list(dict.fromkeys(text.split('\n\n')))
58
- text = '\n\n'.join(unique_phrases)
59
- return text
60
 
61
  def ocr(image):
62
  queries = [
63
  # "Extract and transcribe all the text visible in the image, including any small or partially visible text.",
64
- "Look closely at the image and list any text you see, no matter how small or unclear.",
65
  # "What text can you identify in this image? Include everything, even if it's partially obscured or in the background."
 
66
  ]
67
 
68
  all_extracted_text = []
 
37
  generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
38
  return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
39
 
40
+ # def post_process_text(text):
41
+ # # Split the text into lines
42
+ # lines = text.split('. ')
43
 
44
+ # processed_lines = []
45
+ # for line in lines:
46
+ # # Separate Hindi and English text
47
+ # parts = re.split(r'([^\u0900-\u097F\s]+:)', line, 1)
48
+ # if len(parts) > 1:
49
+ # processed_lines.append(f"{parts[0]}{parts[1]}\n {parts[2]}")
50
+ # else:
51
+ # processed_lines.append(line)
52
 
53
+ # # Join the lines with double line breaks
54
+ # text = '\n\n'.join(processed_lines)
55
 
56
+ # # Remove repeated phrases
57
+ # unique_phrases = list(dict.fromkeys(text.split('\n\n')))
58
+ # text = '\n\n'.join(unique_phrases)
59
+ # return text
60
 
61
  def ocr(image):
62
  queries = [
63
  # "Extract and transcribe all the text visible in the image, including any small or partially visible text.",
64
+ # "Look closely at the image and list any text you see, no matter how small or unclear.",
65
  # "What text can you identify in this image? Include everything, even if it's partially obscured or in the background."
66
+ "Extract all the text in Sanskrit and English from the image."
67
  ]
68
 
69
  all_extracted_text = []