Spaces: Configuration error

aarishshahmohsin committed
Commit 716edce
Parent(s): 1625392

added everything now

Files changed:
- README.md +0 -14
- app copy.py +0 -12
- app.py +95 -127
- image.png +0 -0
- my_model/config.json +0 -39
- my_model/generation_config.json +0 -6
- new_app.py +0 -9
- requirements.txt +4 -9
- temp_app.py +0 -119
README.md
DELETED
@@ -1,14 +0,0 @@
----
-title: Ocr Reader
-emoji: 🦀
-colorFrom: purple
-colorTo: indigo
-sdk: streamlit
-sdk_version: 1.38.0
-app_file: app.py
-pinned: false
-license: mit
-short_description: OCR project for IIT Roorkee Internship
----
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
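With this front matter gone, the Space no longer declares an `sdk`, which is the likely source of the "Configuration error" status shown in the page header (see the Spaces config reference linked in the deleted README). A follow-up README with `sdk: gradio` would match the new app.py.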
app copy.py
DELETED
@@ -1,12 +0,0 @@
-from transformers import AutoTokenizer, AutoModel
-# tokenizer = AutoTokenizer.from_pretrained('RufusRubin777/GOT-OCR2_0_CPU', trust_remote_code=True, device_map='cpu')
-# model = AutoModel.from_pretrained('RufusRubin777/GOT-OCR2_0_CPU', trust_remote_code=True, low_cpu_mem_usage=True, device_map='cpu', use_safetensors=True)
-tokenizer = AutoTokenizer.from_pretrained('RufusRubin777/GOT-OCR2_0_CPU', trust_remote_code=True, device_map='cpu')
-model = AutoModel.from_pretrained('RufusRubin777/GOT-OCR2_0_CPU', trust_remote_code=True, low_cpu_mem_usage=True, device_map='cpu', use_safetensors=True)
-
-model = model.eval().cpu()
-
-image_path = './image.png'
-english_extraction = model.chat(tokenizer, image_path, ocr_type='ocr')
-print(english_extraction)
-
app.py
CHANGED
@@ -1,127 +1,95 @@
[old lines 1-95 (the Streamlit OCR app's imports and UI setup, mirrored in temp_app.py below) were not captured in this snapshot]
-        elif model_choice == "For English (GOT-OCR)":
-            image_file = temp_file_path
-            res = got_model.chat(tokenizer, image_file, ocr_type='ocr')
-
-            st.session_state.extracted_text = res  # Save extracted text in session state
-
-            # with col2:
-            #     st.subheader("Extracted Text (GOT-OCR):")
-            #     st.write(res)
-
-        # Delete the temporary file after processing
-        if os.path.exists(temp_file_path):
-            os.remove(temp_file_path)
-
-# Search functionality
-if st.session_state.extracted_text:
-    search_query = st.text_input("Search in extracted text:", key="search_query", placeholder="Type to search...")
-
-    # Create a pattern to find the search query in a case-insensitive way
-    if search_query:
-        pattern = re.compile(re.escape(search_query), re.IGNORECASE)
-        highlighted_text = st.session_state.extracted_text
-
-        # Replace matching text with highlighted version (bright green)
-        highlighted_text = pattern.sub(lambda m: f"<span style='background-color: limegreen;'>{m.group(0)}</span>", highlighted_text)
-
-        st.markdown("### Highlighted Search Results:")
-        st.markdown(highlighted_text, unsafe_allow_html=True)
-    else:
-        # If no search query, show the original extracted text
-        st.markdown("### Extracted Text:")
-        st.markdown(st.session_state.extracted_text, unsafe_allow_html=True)
+import gradio as gr
+import librosa
+import numpy as np
+import torch
+from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+from datasets import load_dataset
+
+# Model configurations
+models = {
+    "Urdu Model": {
+        "checkpoint": "aarishshahmohsin/final_urdu_t5_finetuned",
+        "vocoder": "microsoft/speecht5_hifigan",
+        "processor": "aarishshahmohsin/urdu_processor_t5",
+    },
+    "Technical Model": {
+        "checkpoint": "aarishshahmohsin/final_technical_terms_t5_finetuned",
+        "vocoder": "microsoft/speecht5_hifigan",
+        "processor": "microsoft/speecht5_tts",  # Using same checkpoint for processor
+    }
+}
+
+embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
+
+
+# Initialize all models at startup
+print("Loading models...")
+loaded_models = {}
+for model_name, config in models.items():
+    processor = SpeechT5Processor.from_pretrained(config["processor"])
+    model = SpeechT5ForTextToSpeech.from_pretrained(config["checkpoint"])
+    vocoder = SpeechT5HifiGan.from_pretrained(config["vocoder"])
+
+    loaded_models[model_name] = {
+        "processor": processor,
+        "model": model,
+        "vocoder": vocoder
+    }
+print("Models loaded successfully!")
+
+def predict(text, model_name):
+    if len(text.strip()) == 0:
+        return (16000, np.zeros(0).astype(np.int16))
+
+    model_components = loaded_models[model_name]
+    processor = model_components["processor"]
+    model = model_components["model"]
+    vocoder = model_components["vocoder"]
+
+    inputs = processor(text=text, return_tensors="pt")
+    speech = model.generate_speech(inputs["input_ids"], speaker_embedding, vocoder=vocoder)
+    speech = (speech.numpy() * 32767).astype(np.int16)
+
+    return (16000, speech)
+
+# UI Configuration
+title = "Multi-Model SpeechT5 Demo"
+
+examples = [
+    # Urdu Model Examples
+    ["میں نے آج بہت کام کیا۔", "Urdu Model"],
+    ["آپ کا دن کیسا گزرا؟", "Urdu Model"],
+
+    # Technical Model Examples
+    ["JSON response with HTTP status code 200.", "Technical Model"],
+    ["Nginx is the best", "Technical Model"],
+]
+
+description = """
+Select a model and enter text to generate speech.
+
+1. Regional Language (Urdu)
+2. Technical Speech
+
+"""
+
+# Create and launch the interface
+gr.Interface(
+    fn=predict,
+    inputs=[
+        gr.Text(label="Input Text"),
+        gr.Dropdown(
+            choices=list(models.keys()),
+            label="Select Model",
+            value="Technical Model"
+        )
+    ],
+    outputs=[
+        gr.Audio(label="Generated Speech", type="numpy"),
+    ],
+    title=title,
+    description=description,
+    examples=examples,  # Add examples to the interface
+    cache_examples=True,
+).launch()
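The `(16000, int16_array)` tuple that `predict` returns is the numpy format `gr.Audio(type="numpy")` consumes. (The Urdu examples read roughly "I did a lot of work today." and "How was your day?".) As a minimal sketch of using the same output outside Gradio, assuming `predict` and its loaded models are importable from app.py, the samples can be written to a WAV file with the standard library:

import wave

from app import predict  # assumption: app.py imported as a module

sr, samples = predict("JSON response with HTTP status code 200.", "Technical Model")
with wave.open("out.wav", "wb") as f:
    f.setnchannels(1)      # generate_speech produces mono audio
    f.setsampwidth(2)      # int16 samples are 2 bytes wide
    f.setframerate(sr)     # SpeechT5/HiFi-GAN output at 16 kHz
    f.writeframes(samples.tobytes())

Note that importing app.py as written would also execute `.launch()` at import time; this is a sketch, not a drop-in.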
image.png
DELETED
Binary file (2.39 kB)
my_model/config.json
DELETED
@@ -1,39 +0,0 @@
-{
-  "_name_or_path": "ucaslcl/GOT-OCR2_0",
-  "architectures": [
-    "GOTQwenForCausalLM"
-  ],
-  "attention_dropout": 0.0,
-  "auto_map": {
-    "AutoConfig": "ucaslcl/GOT-OCR2_0--modeling_GOT.GOTConfig",
-    "AutoModel": "ucaslcl/GOT-OCR2_0--modeling_GOT.GOTQwenForCausalLM"
-  },
-  "bos_token_id": 151643,
-  "eos_token_id": 151643,
-  "freeze_vision_tower": false,
-  "hidden_act": "silu",
-  "hidden_size": 1024,
-  "im_end_token": 151858,
-  "im_patch_token": 151859,
-  "im_start_token": 151857,
-  "image_token_len": 256,
-  "initializer_range": 0.02,
-  "intermediate_size": 2816,
-  "max_position_embeddings": 32768,
-  "max_window_layers": 21,
-  "model_type": "GOT",
-  "num_attention_heads": 16,
-  "num_hidden_layers": 24,
-  "num_key_value_heads": 16,
-  "rms_norm_eps": 1e-06,
-  "rope_scaling": null,
-  "rope_theta": 1000000.0,
-  "sliding_window": null,
-  "tie_word_embeddings": true,
-  "torch_dtype": "float32",
-  "transformers_version": "4.45.1",
-  "use_cache": true,
-  "use_im_start_end": true,
-  "use_sliding_window": false,
-  "vocab_size": 151860
-}
my_model/generation_config.json
DELETED
@@ -1,6 +0,0 @@
-{
-  "bos_token_id": 151643,
-  "eos_token_id": 151643,
-  "max_new_tokens": 2048,
-  "transformers_version": "4.45.1"
-}
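This deleted file only pinned defaults (notably `max_new_tokens: 2048`) that `transformers` reads at generation time. A small sketch, assuming the `my_model/` snapshot were still present locally:

from transformers import GenerationConfig

# Hypothetical: inspect the snapshot's default generation settings.
gen_config = GenerationConfig.from_pretrained("./my_model")
print(gen_config.max_new_tokens)  # 2048, per the deleted file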
new_app.py
DELETED
@@ -1,9 +0,0 @@
-from transformers import AutoModel, AutoTokenizer
-
-model_name = "ucaslcl/GOT-OCR2_0"
-
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModel.from_pretrained(model_name, device_map="auto")
-
-model.to("cpu")
-model.save_pretrained("./my_model")
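new_app.py was the scratch script that produced the `my_model/` snapshot removed above. For reference, a hedged sketch of reloading such a snapshot; since the deleted config.json's `auto_map` points at the remote `ucaslcl/GOT-OCR2_0` modeling code, `trust_remote_code=True` would be needed:

from transformers import AutoModel, AutoTokenizer

# Sketch: reload the local snapshot; the custom GOT classes are pulled
# from the hub repo named in the config's auto_map.
tokenizer = AutoTokenizer.from_pretrained("ucaslcl/GOT-OCR2_0", trust_remote_code=True)
model = AutoModel.from_pretrained("./my_model", trust_remote_code=True)
model = model.eval().cpu()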
requirements.txt
CHANGED
@@ -1,10 +1,5 @@
-streamlit
-Pillow
-surya-ocr
-torch
 transformers
-
-
-
-
-rapidfuzz
+datasets
+librosa
+torch
+numpy
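Of the new pins, `datasets` supplies the x-vector table and `torch`/`numpy` the synthesis path, while `librosa` is imported in the new app.py but never actually used. `gradio` itself is absent from the list, presumably because the Spaces runtime provides it.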
temp_app.py
DELETED
@@ -1,119 +0,0 @@
-import streamlit as st
-from PIL import Image
-from surya.ocr import run_ocr
-from surya.model.detection.model import load_model as load_det_model, load_processor as load_det_processor
-from surya.model.recognition.model import load_model as load_rec_model
-from surya.model.recognition.processor import load_processor as load_rec_processor
-import re
-from transformers import AutoModel, AutoTokenizer
-import torch
-import tempfile
-import os
-
-os.environ["CUDA_VISIBLE_DEVICES"] = ""
-
-st.set_page_config(page_title="OCR Application", page_icon="🖼️", layout="wide")
-
-# Force CPU if CUDA is not available
-device = "cuda" if torch.cuda.is_available() else "cpu"
-
-@st.cache_resource
-def load_surya_models():
-    det_processor, det_model = load_det_processor(), load_det_model()
-    det_model.to(device)
-    rec_model, rec_processor = load_rec_model(), load_rec_processor()
-    rec_model.to(device)
-    return det_processor, det_model, rec_model, rec_processor
-
-@st.cache_resource
-def load_got_ocr_model():
-    tokenizer = AutoTokenizer.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True)
-    model = AutoModel.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True, low_cpu_mem_usage=True, device_map=device, use_safetensors=True, pad_token_id=tokenizer.eos_token_id)
-    model.eval().to(device)
-
-    # Override .half() and .cuda() to ensure everything runs in float32 and on CPU
-    torch.Tensor.half = lambda x: x.float()
-    torch.Tensor.cuda = lambda x, **kwargs: x.cpu()
-
-    return tokenizer, model
-
-det_processor, det_model, rec_model, rec_processor = load_surya_models()
-tokenizer, got_model = load_got_ocr_model()
-
-st.title("OCR Application (Aarish Shah Mohsin)")
-st.write("Upload an image for OCR processing. Using GOT-OCR for English translations, Picked Surya OCR Model for English+Hindi Translations")
-
-st.sidebar.header("Configuration")
-model_choice = st.sidebar.selectbox("Select OCR Model:", ("For English + Hindi", "For English (GOT-OCR)"))
-
-# Store the uploaded image and extracted text in session state
-if 'uploaded_image' not in st.session_state:
-    st.session_state.uploaded_image = None
-if 'extracted_text' not in st.session_state:
-    st.session_state.extracted_text = ""
-
-uploaded_file = st.sidebar.file_uploader("Choose an image...", type=["png", "jpg", "jpeg"])
-
-# Update the session state if a new file is uploaded
-if uploaded_file is not None:
-    st.session_state.uploaded_image = uploaded_file
-
-predict_button = st.sidebar.button("Predict", key="predict")
-
-col1, col2 = st.columns([2, 1])
-
-# Display the image preview if it's already uploaded
-if st.session_state.uploaded_image:
-    image = Image.open(st.session_state.uploaded_image)
-
-    with col1:
-        # Display a smaller preview of the uploaded image (set width to 300px)
-        col1.image(image, caption='Uploaded Image', use_column_width=False, width=300)
-
-# Handle predictions
-if predict_button and st.session_state.uploaded_image:
-    with st.spinner("Processing..."):
-        # Save the uploaded file temporarily
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as temp_file:
-            temp_file.write(st.session_state.uploaded_image.getvalue())
-            temp_file_path = temp_file.name
-
-        image = Image.open(temp_file_path)
-        image = image.convert("RGB")
-
-        if model_choice == "For English + Hindi":
-            langs = ["en", "hi"]
-            predictions = run_ocr([image], [langs], det_model, det_processor, rec_model, rec_processor)
-            text_list = re.findall(r"text='(.*?)'", str(predictions[0]))
-            extracted_text = ' '.join(text_list)
-
-            st.session_state.extracted_text = extracted_text  # Save extracted text in session state
-
-        elif model_choice == "For English (GOT-OCR)":
-            image_file = temp_file_path
-            res = got_model.chat(tokenizer, image_file, ocr_type='ocr')
-
-            st.session_state.extracted_text = res  # Save extracted text in session state
-
-        # Delete the temporary file after processing
-        if os.path.exists(temp_file_path):
-            os.remove(temp_file_path)
-
-# Search functionality
-if st.session_state.extracted_text:
-    search_query = st.text_input("Search in extracted text:", key="search_query", placeholder="Type to search...")
-
-    # Create a pattern to find the search query in a case-insensitive way
-    if search_query:
-        pattern = re.compile(re.escape(search_query), re.IGNORECASE)
-        highlighted_text = st.session_state.extracted_text
-
-        # Replace matching text with highlighted version (bright green)
-        highlighted_text = pattern.sub(lambda m: f"<span style='background-color: limegreen;'>{m.group(0)}</span>", highlighted_text)
-
-        st.markdown("### Highlighted Search Results:")
-        st.markdown(highlighted_text, unsafe_allow_html=True)
-    else:
-        # If no search query, show the original extracted text
-        st.markdown("### Extracted Text:")
-        st.markdown(st.session_state.extracted_text, unsafe_allow_html=True)
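One fragile spot in this deleted app is worth noting: it recovered text by regex-matching the repr of the prediction object (`re.findall(r"text='(.*?)'", str(predictions[0]))`), which breaks on any recognized text containing a quote. A hedged sketch of the more direct route, assuming the surya-ocr version used here returns `OCRResult` objects with a `text_lines` list whose entries carry a `.text` attribute:

# Sketch only: read fields off the result object instead of parsing its repr.
# Attribute names are assumptions based on surya-ocr's result schema.
predictions = run_ocr([image], [langs], det_model, det_processor, rec_model, rec_processor)
extracted_text = ' '.join(line.text for line in predictions[0].text_lines)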