aarishshahmohsin committed on
Commit 716edce
Parent: 1625392

added everything now

Files changed (9)
  1. README.md +0 -14
  2. app copy.py +0 -12
  3. app.py +95 -127
  4. image.png +0 -0
  5. my_model/config.json +0 -39
  6. my_model/generation_config.json +0 -6
  7. new_app.py +0 -9
  8. requirements.txt +4 -9
  9. temp_app.py +0 -119
README.md DELETED
@@ -1,14 +0,0 @@
- ---
- title: Ocr Reader
- emoji: 🦀
- colorFrom: purple
- colorTo: indigo
- sdk: streamlit
- sdk_version: 1.38.0
- app_file: app.py
- pinned: false
- license: mit
- short_description: OCR project for IIT Roorkee Internship
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app copy.py DELETED
@@ -1,12 +0,0 @@
- from transformers import AutoTokenizer, AutoModel
- # tokenizer = AutoTokenizer.from_pretrained('RufusRubin777/GOT-OCR2_0_CPU', trust_remote_code=True, device_map='cpu')
- # model = AutoModel.from_pretrained('RufusRubin777/GOT-OCR2_0_CPU', trust_remote_code=True, low_cpu_mem_usage=True, device_map='cpu', use_safetensors=True)
- tokenizer = AutoTokenizer.from_pretrained('RufusRubin777/GOT-OCR2_0_CPU', trust_remote_code=True, device_map='cpu')
- model = AutoModel.from_pretrained('RufusRubin777/GOT-OCR2_0_CPU', trust_remote_code=True, low_cpu_mem_usage=True, device_map='cpu', use_safetensors=True)
-
- model = model.eval().cpu()
-
- image_path = './image.png'
- english_extraction = model.chat(tokenizer, image_path, ocr_type='ocr')
- print(english_extraction)
-
app.py CHANGED
@@ -1,127 +1,95 @@
- import streamlit as st
- from PIL import Image
- from surya.ocr import run_ocr
- from surya.model.detection.model import load_model as load_det_model, load_processor as load_det_processor
- from surya.model.recognition.model import load_model as load_rec_model
- from surya.model.recognition.processor import load_processor as load_rec_processor
- import re
- from transformers import AutoModel, AutoTokenizer
- import torch
- import tempfile
- import os
-
- # os.environ["CUDA_VISIBLE_DEVICES"] = ""
- st.set_page_config(page_title="OCR Application", page_icon="🖼️", layout="wide")
-
- # device = "cuda" if torch.cuda.is_available() else "cpu"
- device = "cpu"
-
- # @st.cache_resource
- # def load_surya_models():
- det_processor, det_model = load_det_processor(), load_det_model()
- det_model.to(device)
- rec_model, rec_processor = load_rec_model(), load_rec_processor()
- rec_model.to(device)
- # return det_processor, det_model, rec_model, rec_processor
-
- # @st.cache_resource
- # def load_got_ocr_model():
- # tokenizer = AutoTokenizer.from_pretrained('aarishshahmohsin/got_ocr_cpu', trust_remote_code=True, device_map='cpu')
- # model = AutoModel.from_pretrained('aarishshahmohsin/got_ocr_cpu', trust_remote_code=True, low_cpu_mem_usage=True, device_map='cpu', use_safetensors=True, pad_token_id=tokenizer.eos_token_id)
- # tokenizer = AutoTokenizer.from_pretrained('RufusRubin777/GOT-OCR2_0_CPU', trust_remote_code=True, device_map='cpu')
- # got_model = AutoModel.from_pretrained('RufusRubin777/GOT-OCR2_0_CPU', trust_remote_code=True, low_cpu_mem_usage=True, device_map='cpu', use_safetensors=True)
- tokenizer = AutoTokenizer.from_pretrained('aarishshahmohsin/got_ocr_2', trust_remote_code=True, device_map='cpu')
- got_model = AutoModel.from_pretrained('aarishshahmohsin/got_ocr_2', trust_remote_code=True, low_cpu_mem_usage=True, device_map='cpu', use_safetensors=True)
- # got_model = got_model.to_empty()
- got_model = got_model.eval().to(device)
- # return tokenizer, model
-
- # det_processor, det_model, rec_model, rec_processor = load_surya_models()
- # tokenizer, got_model = load_got_ocr_model()
-
- st.title("OCR Application (Aarish Shah Mohsin)")
- st.write("Upload an image for OCR processing. Using GOT-OCR for English translations, Picked Surya OCR Model for English+Hindi Translations")
-
- st.sidebar.header("Configuration")
- model_choice = st.sidebar.selectbox("Select OCR Model:", ("For English + Hindi", "For English (GOT-OCR)"))
-
- # Store the uploaded image and extracted text in session state
- if 'uploaded_image' not in st.session_state:
-     st.session_state.uploaded_image = None
- if 'extracted_text' not in st.session_state:
-     st.session_state.extracted_text = ""
-
- uploaded_file = st.sidebar.file_uploader("Choose an image...", type=["png", "jpg", "jpeg"])
-
- # Update the session state if a new file is uploaded
- if uploaded_file is not None:
-     st.session_state.uploaded_image = uploaded_file
-
- predict_button = st.sidebar.button("Predict", key="predict")
-
- col1, col2 = st.columns([2, 1])
-
- # Display the image preview if it's already uploaded
- if st.session_state.uploaded_image:
-     image = Image.open(st.session_state.uploaded_image)
-
-     with col1:
-         # Display a smaller preview of the uploaded image (set width to 300px)
-         col1.image(image, caption='Uploaded Image', use_column_width=False, width=300)
-
- # Handle predictions
- if predict_button and st.session_state.uploaded_image:
-     # with col2:
-     with st.spinner("Processing..."):
-         # Save the uploaded file temporarily
-         with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as temp_file:
-             temp_file.write(st.session_state.uploaded_image.getvalue())
-             temp_file_path = temp_file.name
-
-         image = Image.open(temp_file_path)
-         image = image.convert("RGB")
-
-         if model_choice == "For English + Hindi":
-             langs = ["en", "hi"]
-             predictions = run_ocr([image], [langs], det_model, det_processor, rec_model, rec_processor)
-             text_list = re.findall(r"text='(.*?)'", str(predictions[0]))
-             extracted_text = ' '.join(text_list)
-
-             st.session_state.extracted_text = extracted_text  # Save extracted text in session state
-
-             # with col2:
-             #     st.subheader("Extracted Text (Surya):")
-             #     st.write(extracted_text)
-
-         elif model_choice == "For English (GOT-OCR)":
-             image_file = temp_file_path
-             res = got_model.chat(tokenizer, image_file, ocr_type='ocr')
-
-             st.session_state.extracted_text = res  # Save extracted text in session state
-
-             # with col2:
-             #     st.subheader("Extracted Text (GOT-OCR):")
-             #     st.write(res)
-
-         # Delete the temporary file after processing
-         if os.path.exists(temp_file_path):
-             os.remove(temp_file_path)
-
- # Search functionality
- if st.session_state.extracted_text:
-     search_query = st.text_input("Search in extracted text:", key="search_query", placeholder="Type to search...")
-
-     # Create a pattern to find the search query in a case-insensitive way
-     if search_query:
-         pattern = re.compile(re.escape(search_query), re.IGNORECASE)
-         highlighted_text = st.session_state.extracted_text
-
-         # Replace matching text with highlighted version (bright green)
-         highlighted_text = pattern.sub(lambda m: f"<span style='background-color: limegreen;'>{m.group(0)}</span>", highlighted_text)
-
-         st.markdown("### Highlighted Search Results:")
-         st.markdown(highlighted_text, unsafe_allow_html=True)
-     else:
-         # If no search query, show the original extracted text
-         st.markdown("### Extracted Text:")
-         st.markdown(st.session_state.extracted_text, unsafe_allow_html=True)
+ import gradio as gr
+ import librosa
+ import numpy as np
+ import torch
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+ from datasets import load_dataset
+
+ # Model configurations
+ models = {
+     "Urdu Model": {
+         "checkpoint": "aarishshahmohsin/final_urdu_t5_finetuned",
+         "vocoder": "microsoft/speecht5_hifigan",
+         "processor": "aarishshahmohsin/urdu_processor_t5",
+     },
+     "Technical Model": {
+         "checkpoint": "aarishshahmohsin/final_technical_terms_t5_finetuned",
+         "vocoder": "microsoft/speecht5_hifigan",
+         "processor": "microsoft/speecht5_tts",  # reuses the base SpeechT5 processor
+     },
+ }
+
+ embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+ speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
+
+ # Initialize all models at startup
+ print("Loading models...")
+ loaded_models = {}
+ for model_name, config in models.items():
+     processor = SpeechT5Processor.from_pretrained(config["processor"])
+     model = SpeechT5ForTextToSpeech.from_pretrained(config["checkpoint"])
+     vocoder = SpeechT5HifiGan.from_pretrained(config["vocoder"])
+
+     loaded_models[model_name] = {
+         "processor": processor,
+         "model": model,
+         "vocoder": vocoder,
+     }
+ print("Models loaded successfully!")
+
+ def predict(text, model_name):
+     if len(text.strip()) == 0:
+         return (16000, np.zeros(0).astype(np.int16))
+
+     model_components = loaded_models[model_name]
+     processor = model_components["processor"]
+     model = model_components["model"]
+     vocoder = model_components["vocoder"]
+
+     inputs = processor(text=text, return_tensors="pt")
+     speech = model.generate_speech(inputs["input_ids"], speaker_embedding, vocoder=vocoder)
+     speech = (speech.numpy() * 32767).astype(np.int16)
+
+     return (16000, speech)
+
+ # UI Configuration
+ title = "Multi-Model SpeechT5 Demo"
+
+ examples = [
+     # Urdu Model examples
+     ["میں نے آج بہت کام کیا۔", "Urdu Model"],  # "I did a lot of work today."
+     ["آپ کا دن کیسا گزرا؟", "Urdu Model"],  # "How was your day?"
+
+     # Technical Model examples
+     ["JSON response with HTTP status code 200.", "Technical Model"],
+     ["Nginx is the best", "Technical Model"],
+ ]
+
+ description = """
+ Select a model and enter text to generate speech.
+
+ 1. Regional language (Urdu)
+ 2. Technical speech
+ """
+
+ # Create and launch the interface
+ gr.Interface(
+     fn=predict,
+     inputs=[
+         gr.Text(label="Input Text"),
+         gr.Dropdown(
+             choices=list(models.keys()),
+             label="Select Model",
+             value="Technical Model",
+         ),
+     ],
+     outputs=[
+         gr.Audio(label="Generated Speech", type="numpy"),
+     ],
+     title=title,
+     description=description,
+     examples=examples,  # Add examples to the interface
+     cache_examples=True,
+ ).launch()
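Note that the new predict() returns a (sample_rate, int16_samples) tuple, which is the "numpy" audio format gr.Audio expects. As a minimal sketch of driving it without the Gradio UI (the file name and sentence below are illustrative, not part of the commit), the standard-library wave module is enough to persist the output:

import wave

# Synthesize one of the bundled example sentences and save it as 16 kHz mono PCM.
rate, samples = predict("JSON response with HTTP status code 200.", "Technical Model")
with wave.open("sample.wav", "wb") as wav_file:
    wav_file.setnchannels(1)                 # mono
    wav_file.setsampwidth(2)                 # int16 -> 2 bytes per sample
    wav_file.setframerate(rate)              # 16000 Hz, as returned by predict()
    wav_file.writeframes(samples.tobytes())  # raw little-endian PCM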
image.png DELETED
Binary file (2.39 kB)
 
my_model/config.json DELETED
@@ -1,39 +0,0 @@
- {
-   "_name_or_path": "ucaslcl/GOT-OCR2_0",
-   "architectures": [
-     "GOTQwenForCausalLM"
-   ],
-   "attention_dropout": 0.0,
-   "auto_map": {
-     "AutoConfig": "ucaslcl/GOT-OCR2_0--modeling_GOT.GOTConfig",
-     "AutoModel": "ucaslcl/GOT-OCR2_0--modeling_GOT.GOTQwenForCausalLM"
-   },
-   "bos_token_id": 151643,
-   "eos_token_id": 151643,
-   "freeze_vision_tower": false,
-   "hidden_act": "silu",
-   "hidden_size": 1024,
-   "im_end_token": 151858,
-   "im_patch_token": 151859,
-   "im_start_token": 151857,
-   "image_token_len": 256,
-   "initializer_range": 0.02,
-   "intermediate_size": 2816,
-   "max_position_embeddings": 32768,
-   "max_window_layers": 21,
-   "model_type": "GOT",
-   "num_attention_heads": 16,
-   "num_hidden_layers": 24,
-   "num_key_value_heads": 16,
-   "rms_norm_eps": 1e-06,
-   "rope_scaling": null,
-   "rope_theta": 1000000.0,
-   "sliding_window": null,
-   "tie_word_embeddings": true,
-   "torch_dtype": "float32",
-   "transformers_version": "4.45.1",
-   "use_cache": true,
-   "use_im_start_end": true,
-   "use_sliding_window": false,
-   "vocab_size": 151860
- }
my_model/generation_config.json DELETED
@@ -1,6 +0,0 @@
- {
-   "bos_token_id": 151643,
-   "eos_token_id": 151643,
-   "max_new_tokens": 2048,
-   "transformers_version": "4.45.1"
- }
new_app.py DELETED
@@ -1,9 +0,0 @@
- from transformers import AutoModel, AutoTokenizer
-
- model_name = "ucaslcl/GOT-OCR2_0"
-
- tokenizer = AutoTokenizer.from_pretrained(model_name)
- model = AutoModel.from_pretrained(model_name, device_map="auto")
-
- model.to("cpu")
- model.save_pretrained("./my_model")
requirements.txt CHANGED
@@ -1,10 +1,5 @@
- streamlit
- Pillow
- surya-ocr
- torch
  transformers
- tiktoken
- torchvision
- verovio
- accelerate
- rapidfuzz
+ datasets
+ librosa
+ torch
+ numpy
temp_app.py DELETED
@@ -1,119 +0,0 @@
- import streamlit as st
- from PIL import Image
- from surya.ocr import run_ocr
- from surya.model.detection.model import load_model as load_det_model, load_processor as load_det_processor
- from surya.model.recognition.model import load_model as load_rec_model
- from surya.model.recognition.processor import load_processor as load_rec_processor
- import re
- from transformers import AutoModel, AutoTokenizer
- import torch
- import tempfile
- import os
-
- os.environ["CUDA_VISIBLE_DEVICES"] = ""
-
- st.set_page_config(page_title="OCR Application", page_icon="🖼️", layout="wide")
-
- # Use CUDA when available, otherwise fall back to CPU
- device = "cuda" if torch.cuda.is_available() else "cpu"
-
- @st.cache_resource
- def load_surya_models():
-     det_processor, det_model = load_det_processor(), load_det_model()
-     det_model.to(device)
-     rec_model, rec_processor = load_rec_model(), load_rec_processor()
-     rec_model.to(device)
-     return det_processor, det_model, rec_model, rec_processor
-
- @st.cache_resource
- def load_got_ocr_model():
-     tokenizer = AutoTokenizer.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True)
-     model = AutoModel.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True, low_cpu_mem_usage=True, device_map=device, use_safetensors=True, pad_token_id=tokenizer.eos_token_id)
-     model.eval().to(device)
-
-     # Override .half() and .cuda() to ensure everything runs in float32 and on CPU
-     torch.Tensor.half = lambda x: x.float()
-     torch.Tensor.cuda = lambda x, **kwargs: x.cpu()
-
-     return tokenizer, model
-
- det_processor, det_model, rec_model, rec_processor = load_surya_models()
- tokenizer, got_model = load_got_ocr_model()
-
- st.title("OCR Application (Aarish Shah Mohsin)")
- st.write("Upload an image for OCR processing. Using GOT-OCR for English translations, Picked Surya OCR Model for English+Hindi Translations")
-
- st.sidebar.header("Configuration")
- model_choice = st.sidebar.selectbox("Select OCR Model:", ("For English + Hindi", "For English (GOT-OCR)"))
-
- # Store the uploaded image and extracted text in session state
- if 'uploaded_image' not in st.session_state:
-     st.session_state.uploaded_image = None
- if 'extracted_text' not in st.session_state:
-     st.session_state.extracted_text = ""
-
- uploaded_file = st.sidebar.file_uploader("Choose an image...", type=["png", "jpg", "jpeg"])
-
- # Update the session state if a new file is uploaded
- if uploaded_file is not None:
-     st.session_state.uploaded_image = uploaded_file
-
- predict_button = st.sidebar.button("Predict", key="predict")
-
- col1, col2 = st.columns([2, 1])
-
- # Display the image preview if it's already uploaded
- if st.session_state.uploaded_image:
-     image = Image.open(st.session_state.uploaded_image)
-
-     with col1:
-         # Display a smaller preview of the uploaded image (set width to 300px)
-         col1.image(image, caption='Uploaded Image', use_column_width=False, width=300)
-
- # Handle predictions
- if predict_button and st.session_state.uploaded_image:
-     with st.spinner("Processing..."):
-         # Save the uploaded file temporarily
-         with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as temp_file:
-             temp_file.write(st.session_state.uploaded_image.getvalue())
-             temp_file_path = temp_file.name
-
-         image = Image.open(temp_file_path)
-         image = image.convert("RGB")
-
-         if model_choice == "For English + Hindi":
-             langs = ["en", "hi"]
-             predictions = run_ocr([image], [langs], det_model, det_processor, rec_model, rec_processor)
-             text_list = re.findall(r"text='(.*?)'", str(predictions[0]))
-             extracted_text = ' '.join(text_list)
-
-             st.session_state.extracted_text = extracted_text  # Save extracted text in session state
-
-         elif model_choice == "For English (GOT-OCR)":
-             image_file = temp_file_path
-             res = got_model.chat(tokenizer, image_file, ocr_type='ocr')
-
-             st.session_state.extracted_text = res  # Save extracted text in session state
-
-         # Delete the temporary file after processing
-         if os.path.exists(temp_file_path):
-             os.remove(temp_file_path)
-
- # Search functionality
- if st.session_state.extracted_text:
-     search_query = st.text_input("Search in extracted text:", key="search_query", placeholder="Type to search...")
-
-     # Create a pattern to find the search query in a case-insensitive way
-     if search_query:
-         pattern = re.compile(re.escape(search_query), re.IGNORECASE)
-         highlighted_text = st.session_state.extracted_text
-
-         # Replace matching text with highlighted version (bright green)
-         highlighted_text = pattern.sub(lambda m: f"<span style='background-color: limegreen;'>{m.group(0)}</span>", highlighted_text)
-
-         st.markdown("### Highlighted Search Results:")
-         st.markdown(highlighted_text, unsafe_allow_html=True)
-     else:
-         # If no search query, show the original extracted text
-         st.markdown("### Extracted Text:")
-         st.markdown(st.session_state.extracted_text, unsafe_allow_html=True)