aarishshahmohsin committed on
Commit 716edce
Parent: 1625392

added everything now

Files changed (9)
  1. README.md +0 -14
  2. app copy.py +0 -12
  3. app.py +95 -127
  4. image.png +0 -0
  5. my_model/config.json +0 -39
  6. my_model/generation_config.json +0 -6
  7. new_app.py +0 -9
  8. requirements.txt +4 -9
  9. temp_app.py +0 -119
README.md DELETED
@@ -1,14 +0,0 @@
- ---
- title: Ocr Reader
- emoji: 🦀
- colorFrom: purple
- colorTo: indigo
- sdk: streamlit
- sdk_version: 1.38.0
- app_file: app.py
- pinned: false
- license: mit
- short_description: OCR project for IIT Roorkee Internship
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app copy.py DELETED
@@ -1,12 +0,0 @@
- from transformers import AutoTokenizer, AutoModel
- # tokenizer = AutoTokenizer.from_pretrained('RufusRubin777/GOT-OCR2_0_CPU', trust_remote_code=True, device_map='cpu')
- # model = AutoModel.from_pretrained('RufusRubin777/GOT-OCR2_0_CPU', trust_remote_code=True, low_cpu_mem_usage=True, device_map='cpu', use_safetensors=True)
- tokenizer = AutoTokenizer.from_pretrained('RufusRubin777/GOT-OCR2_0_CPU', trust_remote_code=True, device_map='cpu')
- model = AutoModel.from_pretrained('RufusRubin777/GOT-OCR2_0_CPU', trust_remote_code=True, low_cpu_mem_usage=True, device_map='cpu', use_safetensors=True)
-
- model = model.eval().cpu()
-
- image_path = './image.png'
- english_extraction = model.chat(tokenizer, image_path, ocr_type='ocr')
- print(english_extraction)
-
app.py CHANGED
@@ -1,127 +1,95 @@
- import streamlit as st
- from PIL import Image
- from surya.ocr import run_ocr
- from surya.model.detection.model import load_model as load_det_model, load_processor as load_det_processor
- from surya.model.recognition.model import load_model as load_rec_model
- from surya.model.recognition.processor import load_processor as load_rec_processor
- import re
- from transformers import AutoModel, AutoTokenizer
- import torch
- import tempfile
- import os
-
- # os.environ["CUDA_VISIBLE_DEVICES"] = ""
- st.set_page_config(page_title="OCR Application", page_icon="🖼️", layout="wide")
-
- # device = "cuda" if torch.cuda.is_available() else "cpu"
- device = "cpu"
-
- # @st.cache_resource
- # def load_surya_models():
- det_processor, det_model = load_det_processor(), load_det_model()
- det_model.to(device)
- rec_model, rec_processor = load_rec_model(), load_rec_processor()
- rec_model.to(device)
- # return det_processor, det_model, rec_model, rec_processor
-
- # @st.cache_resource
- # def load_got_ocr_model():
- # tokenizer = AutoTokenizer.from_pretrained('aarishshahmohsin/got_ocr_cpu', trust_remote_code=True, device_map='cpu')
- # model = AutoModel.from_pretrained('aarishshahmohsin/got_ocr_cpu', trust_remote_code=True, low_cpu_mem_usage=True, device_map='cpu', use_safetensors=True, pad_token_id=tokenizer.eos_token_id)
- # tokenizer = AutoTokenizer.from_pretrained('RufusRubin777/GOT-OCR2_0_CPU', trust_remote_code=True, device_map='cpu')
- # got_model = AutoModel.from_pretrained('RufusRubin777/GOT-OCR2_0_CPU', trust_remote_code=True, low_cpu_mem_usage=True, device_map='cpu', use_safetensors=True)
- tokenizer = AutoTokenizer.from_pretrained('aarishshahmohsin/got_ocr_2', trust_remote_code=True, device_map='cpu')
- got_model = AutoModel.from_pretrained('aarishshahmohsin/got_ocr_2', trust_remote_code=True, low_cpu_mem_usage=True, device_map='cpu', use_safetensors=True)
- # got_model = got_model.to_empty()
- got_model = got_model.eval().to(device)
- # return tokenizer, model
-
- # det_processor, det_model, rec_model, rec_processor = load_surya_models()
- # tokenizer, got_model = load_got_ocr_model()
-
- st.title("OCR Application (Aarish Shah Mohsin)")
- st.write("Upload an image for OCR processing. Using GOT-OCR for English translations, Picked Surya OCR Model for English+Hindi Translations")
-
- st.sidebar.header("Configuration")
- model_choice = st.sidebar.selectbox("Select OCR Model:", ("For English + Hindi", "For English (GOT-OCR)"))
-
- # Store the uploaded image and extracted text in session state
- if 'uploaded_image' not in st.session_state:
-     st.session_state.uploaded_image = None
- if 'extracted_text' not in st.session_state:
-     st.session_state.extracted_text = ""
-
- uploaded_file = st.sidebar.file_uploader("Choose an image...", type=["png", "jpg", "jpeg"])
-
- # Update the session state if a new file is uploaded
- if uploaded_file is not None:
-     st.session_state.uploaded_image = uploaded_file
-
- predict_button = st.sidebar.button("Predict", key="predict")
-
- col1, col2 = st.columns([2, 1])
-
- # Display the image preview if it's already uploaded
- if st.session_state.uploaded_image:
-     image = Image.open(st.session_state.uploaded_image)
-
-     with col1:
-         # Display a smaller preview of the uploaded image (set width to 300px)
-         col1.image(image, caption='Uploaded Image', use_column_width=False, width=300)
-
- # Handle predictions
- if predict_button and st.session_state.uploaded_image:
-     # with col2:
-     with st.spinner("Processing..."):
-         # Save the uploaded file temporarily
-         with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as temp_file:
-             temp_file.write(st.session_state.uploaded_image.getvalue())
-             temp_file_path = temp_file.name
-
-         image = Image.open(temp_file_path)
-         image = image.convert("RGB")
-
-         if model_choice == "For English + Hindi":
-             langs = ["en", "hi"]
-             predictions = run_ocr([image], [langs], det_model, det_processor, rec_model, rec_processor)
-             text_list = re.findall(r"text='(.*?)'", str(predictions[0]))
-             extracted_text = ' '.join(text_list)
-
-             st.session_state.extracted_text = extracted_text  # Save extracted text in session state
-
-             # with col2:
-             #     st.subheader("Extracted Text (Surya):")
-             #     st.write(extracted_text)
-
-         elif model_choice == "For English (GOT-OCR)":
-             image_file = temp_file_path
-             res = got_model.chat(tokenizer, image_file, ocr_type='ocr')
-
-             st.session_state.extracted_text = res  # Save extracted text in session state
-
-             # with col2:
-             #     st.subheader("Extracted Text (GOT-OCR):")
-             #     st.write(res)
-
-         # Delete the temporary file after processing
-         if os.path.exists(temp_file_path):
-             os.remove(temp_file_path)
-
- # Search functionality
- if st.session_state.extracted_text:
-     search_query = st.text_input("Search in extracted text:", key="search_query", placeholder="Type to search...")
-
-     # Create a pattern to find the search query in a case-insensitive way
-     if search_query:
-         pattern = re.compile(re.escape(search_query), re.IGNORECASE)
-         highlighted_text = st.session_state.extracted_text
-
-         # Replace matching text with highlighted version (bright green)
-         highlighted_text = pattern.sub(lambda m: f"<span style='background-color: limegreen;'>{m.group(0)}</span>", highlighted_text)
-
-         st.markdown("### Highlighted Search Results:")
-         st.markdown(highlighted_text, unsafe_allow_html=True)
-     else:
-         # If no search query, show the original extracted text
-         st.markdown("### Extracted Text:")
-         st.markdown(st.session_state.extracted_text, unsafe_allow_html=True)
+ import gradio as gr
+ import librosa
+ import numpy as np
+ import torch
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+ from datasets import load_dataset
+
+ # Model configurations
+ models = {
+     "Urdu Model": {
+         "checkpoint": "aarishshahmohsin/final_urdu_t5_finetuned",
+         "vocoder": "microsoft/speecht5_hifigan",
+         "processor": "aarishshahmohsin/urdu_processor_t5",
+     },
+     "Technical Model": {
+         "checkpoint": "aarishshahmohsin/final_technical_terms_t5_finetuned",
+         "vocoder": "microsoft/speecht5_hifigan",
+         "processor": "microsoft/speecht5_tts",  # reuses the base SpeechT5 processor
+     },
+ }
+
+ embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+ speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
+
+ # Initialize all models at startup
+ print("Loading models...")
+ loaded_models = {}
+ for model_name, config in models.items():
+     processor = SpeechT5Processor.from_pretrained(config["processor"])
+     model = SpeechT5ForTextToSpeech.from_pretrained(config["checkpoint"])
+     vocoder = SpeechT5HifiGan.from_pretrained(config["vocoder"])
+
+     loaded_models[model_name] = {
+         "processor": processor,
+         "model": model,
+         "vocoder": vocoder,
+     }
+ print("Models loaded successfully!")
+
+ def predict(text, model_name):
+     if len(text.strip()) == 0:
+         return (16000, np.zeros(0).astype(np.int16))
+
+     model_components = loaded_models[model_name]
+     processor = model_components["processor"]
+     model = model_components["model"]
+     vocoder = model_components["vocoder"]
+
+     inputs = processor(text=text, return_tensors="pt")
+     speech = model.generate_speech(inputs["input_ids"], speaker_embedding, vocoder=vocoder)
+     speech = (speech.numpy() * 32767).astype(np.int16)
+
+     return (16000, speech)
+
+ # UI Configuration
+ title = "Multi-Model SpeechT5 Demo"
+
+ examples = [
+     # Urdu Model examples
+     ["میں نے آج بہت کام کیا۔", "Urdu Model"],  # "I did a lot of work today."
+     ["آپ کا دن کیسا گزرا؟", "Urdu Model"],  # "How was your day?"
+
+     # Technical Model examples
+     ["JSON response with HTTP status code 200.", "Technical Model"],
+     ["Nginx is the best", "Technical Model"],
+ ]
+
+ description = """
+ Select a model and enter text to generate speech.
+
+ 1. Regional language (Urdu)
+ 2. Technical speech
+ """
+
+ # Create and launch the interface
+ gr.Interface(
+     fn=predict,
+     inputs=[
+         gr.Text(label="Input Text"),
+         gr.Dropdown(
+             choices=list(models.keys()),
+             label="Select Model",
+             value="Technical Model",
+         ),
+     ],
+     outputs=[
+         gr.Audio(label="Generated Speech", type="numpy"),
+     ],
+     title=title,
+     description=description,
+     examples=examples,  # Add examples to the interface
+     cache_examples=True,
+ ).launch()
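Note that the new predict() returns a (sample_rate, int16_samples) tuple, which is the "numpy" audio format gr.Audio expects. As a minimal sketch of driving it without the Gradio UI (the file name and sentence below are illustrative, not part of the commit), the standard-library wave module is enough to persist the output:

import wave

# Synthesize one of the bundled example sentences and save it as 16 kHz mono PCM.
rate, samples = predict("JSON response with HTTP status code 200.", "Technical Model")
with wave.open("sample.wav", "wb") as wav_file:
    wav_file.setnchannels(1)                 # mono
    wav_file.setsampwidth(2)                 # int16 -> 2 bytes per sample
    wav_file.setframerate(rate)              # 16000 Hz, as returned by predict()
    wav_file.writeframes(samples.tobytes())  # raw little-endian PCM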
image.png DELETED
Binary file (2.39 kB)
 
my_model/config.json DELETED
@@ -1,39 +0,0 @@
- {
-   "_name_or_path": "ucaslcl/GOT-OCR2_0",
-   "architectures": [
-     "GOTQwenForCausalLM"
-   ],
-   "attention_dropout": 0.0,
-   "auto_map": {
-     "AutoConfig": "ucaslcl/GOT-OCR2_0--modeling_GOT.GOTConfig",
-     "AutoModel": "ucaslcl/GOT-OCR2_0--modeling_GOT.GOTQwenForCausalLM"
-   },
-   "bos_token_id": 151643,
-   "eos_token_id": 151643,
-   "freeze_vision_tower": false,
-   "hidden_act": "silu",
-   "hidden_size": 1024,
-   "im_end_token": 151858,
-   "im_patch_token": 151859,
-   "im_start_token": 151857,
-   "image_token_len": 256,
-   "initializer_range": 0.02,
-   "intermediate_size": 2816,
-   "max_position_embeddings": 32768,
-   "max_window_layers": 21,
-   "model_type": "GOT",
-   "num_attention_heads": 16,
-   "num_hidden_layers": 24,
-   "num_key_value_heads": 16,
-   "rms_norm_eps": 1e-06,
-   "rope_scaling": null,
-   "rope_theta": 1000000.0,
-   "sliding_window": null,
-   "tie_word_embeddings": true,
-   "torch_dtype": "float32",
-   "transformers_version": "4.45.1",
-   "use_cache": true,
-   "use_im_start_end": true,
-   "use_sliding_window": false,
-   "vocab_size": 151860
- }
my_model/generation_config.json DELETED
@@ -1,6 +0,0 @@
- {
-   "bos_token_id": 151643,
-   "eos_token_id": 151643,
-   "max_new_tokens": 2048,
-   "transformers_version": "4.45.1"
- }
new_app.py DELETED
@@ -1,9 +0,0 @@
- from transformers import AutoModel, AutoTokenizer
-
- model_name = "ucaslcl/GOT-OCR2_0"
-
- tokenizer = AutoTokenizer.from_pretrained(model_name)
- model = AutoModel.from_pretrained(model_name, device_map="auto")
-
- model.to("cpu")
- model.save_pretrained("./my_model")
requirements.txt CHANGED
@@ -1,10 +1,5 @@
- streamlit
- Pillow
- surya-ocr
- torch
  transformers
- tiktoken
- torchvision
- verovio
- accelerate
- rapidfuzz
+ datasets
+ librosa
+ torch
+ numpy
temp_app.py DELETED
@@ -1,119 +0,0 @@
- import streamlit as st
- from PIL import Image
- from surya.ocr import run_ocr
- from surya.model.detection.model import load_model as load_det_model, load_processor as load_det_processor
- from surya.model.recognition.model import load_model as load_rec_model
- from surya.model.recognition.processor import load_processor as load_rec_processor
- import re
- from transformers import AutoModel, AutoTokenizer
- import torch
- import tempfile
- import os
-
- os.environ["CUDA_VISIBLE_DEVICES"] = ""
-
- st.set_page_config(page_title="OCR Application", page_icon="🖼️", layout="wide")
-
- # Use CUDA when available, otherwise fall back to CPU
- device = "cuda" if torch.cuda.is_available() else "cpu"
-
- @st.cache_resource
- def load_surya_models():
-     det_processor, det_model = load_det_processor(), load_det_model()
-     det_model.to(device)
-     rec_model, rec_processor = load_rec_model(), load_rec_processor()
-     rec_model.to(device)
-     return det_processor, det_model, rec_model, rec_processor
-
- @st.cache_resource
- def load_got_ocr_model():
-     tokenizer = AutoTokenizer.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True)
-     model = AutoModel.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True, low_cpu_mem_usage=True, device_map=device, use_safetensors=True, pad_token_id=tokenizer.eos_token_id)
-     model.eval().to(device)
-
-     # Override .half() and .cuda() to ensure everything runs in float32 and on CPU
-     torch.Tensor.half = lambda x: x.float()
-     torch.Tensor.cuda = lambda x, **kwargs: x.cpu()
-
-     return tokenizer, model
-
- det_processor, det_model, rec_model, rec_processor = load_surya_models()
- tokenizer, got_model = load_got_ocr_model()
-
- st.title("OCR Application (Aarish Shah Mohsin)")
- st.write("Upload an image for OCR processing. Using GOT-OCR for English translations, Picked Surya OCR Model for English+Hindi Translations")
-
- st.sidebar.header("Configuration")
- model_choice = st.sidebar.selectbox("Select OCR Model:", ("For English + Hindi", "For English (GOT-OCR)"))
-
- # Store the uploaded image and extracted text in session state
- if 'uploaded_image' not in st.session_state:
-     st.session_state.uploaded_image = None
- if 'extracted_text' not in st.session_state:
-     st.session_state.extracted_text = ""
-
- uploaded_file = st.sidebar.file_uploader("Choose an image...", type=["png", "jpg", "jpeg"])
-
- # Update the session state if a new file is uploaded
- if uploaded_file is not None:
-     st.session_state.uploaded_image = uploaded_file
-
- predict_button = st.sidebar.button("Predict", key="predict")
-
- col1, col2 = st.columns([2, 1])
-
- # Display the image preview if it's already uploaded
- if st.session_state.uploaded_image:
-     image = Image.open(st.session_state.uploaded_image)
-
-     with col1:
-         # Display a smaller preview of the uploaded image (set width to 300px)
-         col1.image(image, caption='Uploaded Image', use_column_width=False, width=300)
-
- # Handle predictions
- if predict_button and st.session_state.uploaded_image:
-     with st.spinner("Processing..."):
-         # Save the uploaded file temporarily
-         with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as temp_file:
-             temp_file.write(st.session_state.uploaded_image.getvalue())
-             temp_file_path = temp_file.name
-
-         image = Image.open(temp_file_path)
-         image = image.convert("RGB")
-
-         if model_choice == "For English + Hindi":
-             langs = ["en", "hi"]
-             predictions = run_ocr([image], [langs], det_model, det_processor, rec_model, rec_processor)
-             text_list = re.findall(r"text='(.*?)'", str(predictions[0]))
-             extracted_text = ' '.join(text_list)
-
-             st.session_state.extracted_text = extracted_text  # Save extracted text in session state
-
-         elif model_choice == "For English (GOT-OCR)":
-             image_file = temp_file_path
-             res = got_model.chat(tokenizer, image_file, ocr_type='ocr')
-
-             st.session_state.extracted_text = res  # Save extracted text in session state
-
-         # Delete the temporary file after processing
-         if os.path.exists(temp_file_path):
-             os.remove(temp_file_path)
-
- # Search functionality
- if st.session_state.extracted_text:
-     search_query = st.text_input("Search in extracted text:", key="search_query", placeholder="Type to search...")
-
-     # Create a pattern to find the search query in a case-insensitive way
-     if search_query:
-         pattern = re.compile(re.escape(search_query), re.IGNORECASE)
-         highlighted_text = st.session_state.extracted_text
-
-         # Replace matching text with highlighted version (bright green)
-         highlighted_text = pattern.sub(lambda m: f"<span style='background-color: limegreen;'>{m.group(0)}</span>", highlighted_text)
-
-         st.markdown("### Highlighted Search Results:")
-         st.markdown(highlighted_text, unsafe_allow_html=True)
-     else:
-         # If no search query, show the original extracted text
-         st.markdown("### Extracted Text:")
-         st.markdown(st.session_state.extracted_text, unsafe_allow_html=True)