Commit
Β·
5bcc73a
1
Parent(s):
f93b005
Create streamlit app
Browse files- .gitignore +4 -0
- .streamlit/config.toml +5 -0
- README.md +12 -5
- app.py +36 -0
- report.py +115 -0
- requirements.txt +84 -0
- save_image.py +23 -0
- scheduler.py +191 -0
- search.py +230 -0
- vision.py +29 -0
.gitignore
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
.streamlit/secrets.toml
|
2 |
+
env/
|
3 |
+
__pycache__/
|
4 |
+
flagged_rows/
|
.streamlit/config.toml
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[browser]
|
2 |
+
gatherUsageStats = false
|
3 |
+
|
4 |
+
[server]
|
5 |
+
maxUploadSize = 5
|
README.md
CHANGED
@@ -1,12 +1,19 @@
|
|
1 |
---
|
2 |
-
title: Search
|
3 |
-
emoji:
|
4 |
colorFrom: gray
|
5 |
colorTo: green
|
6 |
-
sdk:
|
7 |
-
sdk_version:
|
8 |
app_file: app.py
|
9 |
-
pinned:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
---
|
11 |
|
12 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
---
|
2 |
+
title: Grascii Search
|
3 |
+
emoji: π
|
4 |
colorFrom: gray
|
5 |
colorTo: green
|
6 |
+
sdk: streamlit
|
7 |
+
sdk_version: 1.40.2
|
8 |
app_file: app.py
|
9 |
+
pinned: true
|
10 |
+
models:
|
11 |
+
- grascii/gregg-vision-v0.2.1
|
12 |
+
datasets:
|
13 |
+
- grascii/gregg-preanniversary-words
|
14 |
+
preload_from_hub:
|
15 |
+
- grascii/gregg-vision-v0.2.1
|
16 |
+
- grascii/gregg-preanniversary-words
|
17 |
---
|
18 |
|
19 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st

# st.set_page_config must be the first Streamlit call in the script, which
# is why it runs before the remaining imports below (hence the noqa E402).
st.set_page_config(
    page_title="Grascii Search",
    menu_items={
        "About": """
        Web interface for [grascii](https://github.com/grascii/grascii)'s
        search utility

        Image search powered by [gregg-vision-v0.2.1](https://huggingface.co/grascii/gregg-vision-v0.2.1)
        """
    },
)

import pandas as pd  # noqa E402
from search import write_grascii_search, write_reverse_search  # noqa E402

pd.options.mode.copy_on_write = True

# Session defaults:
# - "report_submitted" is set by the report dialog so the NEXT rerun can
#   show a thank-you toast (a dialog cannot toast after st.rerun()).
# - "grascii" holds the current query, shared between the text box and the
#   image-recognition path in search.py.
if "report_submitted" not in st.session_state:
    st.session_state["report_submitted"] = False

if "grascii" not in st.session_state:
    st.session_state["grascii"] = ""

if st.session_state["report_submitted"]:
    st.toast("Thanks for the report!")
    st.session_state["report_submitted"] = False

# Two independent search modes, one per tab.
tab1, tab2 = st.tabs(["Grascii", "Reverse"])

with tab1:
    write_grascii_search()

with tab2:
    write_reverse_search()
|
report.py
ADDED
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from pathlib import Path
|
3 |
+
from uuid import uuid4
|
4 |
+
import csv
|
5 |
+
from datetime import datetime, timezone
|
6 |
+
|
7 |
+
from huggingface_hub import CommitScheduler
|
8 |
+
|
9 |
+
|
10 |
+
# Directory where flagged rows accumulate before the CommitScheduler
# uploads them to the feedback dataset repo.
CSV_DATASET_DIR = Path("flagged_rows")
CSV_DATASET_DIR.mkdir(parents=True, exist_ok=True)

# Unique per-process CSV file name so concurrent app replicas never write
# to the same file.
CSV_DATASET_PATH = CSV_DATASET_DIR / f"train-{uuid4()}.csv"

# Module-level flag: has the CSV header row been written to this file yet?
# Set by write_header() on first use.
wrote_header = False
|
16 |
+
|
17 |
+
|
18 |
+
def write_header(writer):
    """Emit the CSV column-header row and record that it has been written.

    Sets the module-level ``wrote_header`` flag so callers emit the header
    at most once per session file.
    """
    columns = (
        "date",
        "grascii",
        "longhand",
        "incorrect_grascii",
        "incorrect_longhand",
        "incorrect_shorthand",
        "improperly_cropped",
        "extraneous_marks",
    )
    writer.writerow(columns)
    global wrote_header
    wrote_header = True
|
33 |
+
|
34 |
+
|
35 |
+
# Background scheduler that commits the flagged-row CSV directory to the
# feedback dataset repo on the Hub every 15 minutes. Writers must hold
# scheduler.lock so an upload never sees a half-written file.
scheduler = CommitScheduler(
    repo_id=st.secrets.FEEDBACK_REPO,
    repo_type="dataset",
    folder_path=CSV_DATASET_DIR,
    path_in_repo="data",
    every=15,
    token=st.secrets.HF_TOKEN,
)
|
43 |
+
|
44 |
+
|
45 |
+
@st.dialog("Flag Results for Review", width="large")
def report_dialog(data):
    """Modal dialog for flagging search-result rows for human review.

    ``data`` is the DataFrame slice of selected result rows (columns
    "0" = grascii, "1" = longhand, "2" = shorthand image data URI).
    Confirmed rows are appended to the session CSV under ``scheduler.lock``
    so the background CommitScheduler can upload the file safely.
    """
    st.write("Please select one or more reasons for flagging each row:")

    report_df = data
    # Columns "3".."8" back the checkbox columns of the editor below:
    # "3" = keep-this-row flag, "4".."8" = the individual reasons.
    report_df["3"] = True
    report_df["4"] = False
    report_df["5"] = False
    report_df["6"] = False
    report_df["7"] = False
    report_df["8"] = False
    final_report = st.data_editor(
        report_df,
        hide_index=True,
        column_config={
            "0": "Grascii",
            "1": "Longhand",
            "2": st.column_config.ImageColumn("Shorthand", width="medium"),
            "3": st.column_config.CheckboxColumn("Flag"),
            "4": st.column_config.CheckboxColumn("Grascii is incorrect"),
            "5": st.column_config.CheckboxColumn("Longhand is incorrect"),
            "6": st.column_config.CheckboxColumn("Shorthand image is incorrect"),
            "7": st.column_config.CheckboxColumn(
                "Shorthand image is improperly cropped"
            ),
            "8": st.column_config.CheckboxColumn(
                "Shorthand image contains extraneous marks"
            ),
        },
        # The identifying columns are read-only; only checkboxes are editable.
        disabled=["0", "1", "2"],
        use_container_width=True,
    )

    st.write(
        "If you decide that a listed row does not need to be flagged, uncheck its 'Flag' box to prevent it from being included in the submission."
    )

    if st.button("Submit"):
        # Hold the scheduler lock for the whole write so a background commit
        # cannot upload a partially written CSV.
        with scheduler.lock:
            with open(CSV_DATASET_PATH, "a", newline="") as f:
                writer = csv.writer(f, dialect="unix")

                def write_row(row):
                    # Lazily emit the header before the first data row.
                    if not wrote_header:
                        write_header(writer)
                    # Persist only rows that are still flagged AND have at
                    # least one reason checked.
                    if row.iloc[3] and any(
                        [
                            row.iloc[4],
                            row.iloc[5],
                            row.iloc[6],
                            row.iloc[7],
                            row.iloc[8],
                        ]
                    ):
                        writer.writerow(
                            [
                                datetime.now(timezone.utc).date(),
                                row.iloc[0],
                                row.iloc[1],
                                # Booleans stored as 0/1 for CSV compactness.
                                1 if row.iloc[4] else 0,
                                1 if row.iloc[5] else 0,
                                1 if row.iloc[6] else 0,
                                1 if row.iloc[7] else 0,
                                1 if row.iloc[8] else 0,
                            ]
                        )

                final_report.apply(write_row, axis=1)

        # Toast is shown by app.py on the rerun triggered below.
        st.session_state["report_submitted"] = True
        st.rerun()
|
requirements.txt
ADDED
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
accelerate==1.2.0
|
2 |
+
aiohappyeyeballs==2.4.4
|
3 |
+
aiohttp==3.11.10
|
4 |
+
aiosignal==1.3.1
|
5 |
+
altair==5.5.0
|
6 |
+
async-timeout==5.0.1
|
7 |
+
attrs==24.2.0
|
8 |
+
blinker==1.9.0
|
9 |
+
cachetools==5.5.0
|
10 |
+
certifi==2024.8.30
|
11 |
+
charset-normalizer==3.4.0
|
12 |
+
click==8.1.7
|
13 |
+
datasets==3.1.0
|
14 |
+
dill==0.3.8
|
15 |
+
filelock==3.16.1
|
16 |
+
frozenlist==1.5.0
|
17 |
+
fsspec==2024.9.0
|
18 |
+
gitdb==4.0.11
|
19 |
+
GitPython==3.1.43
|
20 |
+
grascii==0.6.0
|
21 |
+
huggingface-hub==0.26.5
|
22 |
+
idna==3.10
|
23 |
+
Jinja2==3.1.4
|
24 |
+
jsonschema==4.23.0
|
25 |
+
jsonschema-specifications==2024.10.1
|
26 |
+
lark==1.2.2
|
27 |
+
markdown-it-py==3.0.0
|
28 |
+
MarkupSafe==3.0.2
|
29 |
+
mdurl==0.1.2
|
30 |
+
mpmath==1.3.0
|
31 |
+
multidict==6.1.0
|
32 |
+
multiprocess==0.70.16
|
33 |
+
narwhals==1.16.0
|
34 |
+
networkx==3.4.2
|
35 |
+
numpy==2.2.0
|
36 |
+
nvidia-cublas-cu12==12.4.5.8
|
37 |
+
nvidia-cuda-cupti-cu12==12.4.127
|
38 |
+
nvidia-cuda-nvrtc-cu12==12.4.127
|
39 |
+
nvidia-cuda-runtime-cu12==12.4.127
|
40 |
+
nvidia-cudnn-cu12==9.1.0.70
|
41 |
+
nvidia-cufft-cu12==11.2.1.3
|
42 |
+
nvidia-curand-cu12==10.3.5.147
|
43 |
+
nvidia-cusolver-cu12==11.6.1.9
|
44 |
+
nvidia-cusparse-cu12==12.3.1.170
|
45 |
+
nvidia-nccl-cu12==2.21.5
|
46 |
+
nvidia-nvjitlink-cu12==12.4.127
|
47 |
+
nvidia-nvtx-cu12==12.4.127
|
48 |
+
packaging==24.2
|
49 |
+
pandas==2.2.3
|
50 |
+
pillow==11.0.0
|
51 |
+
platformdirs==4.3.6
|
52 |
+
propcache==0.2.1
|
53 |
+
protobuf==5.29.1
|
54 |
+
psutil==6.1.0
|
55 |
+
pyarrow==18.1.0
|
56 |
+
pydeck==0.9.1
|
57 |
+
Pygments==2.18.0
|
58 |
+
python-dateutil==2.9.0.post0
|
59 |
+
pytz==2024.2
|
60 |
+
PyYAML==6.0.2
|
61 |
+
referencing==0.35.1
|
62 |
+
regex==2024.11.6
|
63 |
+
requests==2.32.3
|
64 |
+
rich==13.9.4
|
65 |
+
rpds-py==0.22.3
|
66 |
+
safetensors==0.4.5
|
67 |
+
six==1.17.0
|
68 |
+
smmap==5.0.1
|
69 |
+
streamlit==1.40.2
|
70 |
+
sympy==1.13.1
|
71 |
+
tenacity==9.0.0
|
72 |
+
tokenizers==0.21.0
|
73 |
+
toml==0.10.2
|
74 |
+
torch==2.5.1
|
75 |
+
tornado==6.4.2
|
76 |
+
tqdm==4.67.1
|
77 |
+
transformers==4.47.0
|
78 |
+
triton==3.1.0
|
79 |
+
typing_extensions==4.12.2
|
80 |
+
tzdata==2024.2
|
81 |
+
urllib3==2.2.3
|
82 |
+
watchdog==6.0.0
|
83 |
+
xxhash==3.5.0
|
84 |
+
yarl==1.18.3
|
save_image.py
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from hashlib import sha256
import streamlit as st
from datetime import datetime, timezone
from scheduler import ParquetScheduler


# Background scheduler that batches uploaded images and commits them as a
# parquet file to the images dataset repo every 15 minutes.
scheduler = ParquetScheduler(
    repo_id=st.secrets.IMAGES_REPO,
    token=st.secrets.HF_TOKEN,
    every=15,
)


# NOTE(review): st.cache_data here appears to serve as de-duplication —
# within the 1-hour TTL a repeat call with identical (data, prediction)
# is served from cache, so the image is queued only once. Confirm this is
# the intent before changing the decorator.
@st.cache_data(ttl=3600)
def save_image(data, prediction):
    """Queue an uploaded shorthand image for dataset inclusion.

    ``data`` is the raw uploaded image bytes; ``prediction`` is the
    hyphen-joined Grascii token string produced by the vision model.
    """
    scheduler.append(
        {
            "date": datetime.now(timezone.utc).date(),
            "image": data,
            "prediction": prediction,
            # Content hash enables later de-duplication in the dataset.
            "sha256": sha256(data).hexdigest(),
        }
    )
|
scheduler.py
ADDED
@@ -0,0 +1,191 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
MIT License
|
3 |
+
|
4 |
+
Copyright (c) 2023 hysts
|
5 |
+
|
6 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
7 |
+
of this software and associated documentation files (the "Software"), to deal
|
8 |
+
in the Software without restriction, including without limitation the rights
|
9 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
10 |
+
copies of the Software, and to permit persons to whom the Software is
|
11 |
+
furnished to do so, subject to the following conditions:
|
12 |
+
|
13 |
+
The above copyright notice and this permission notice shall be included in all
|
14 |
+
copies or substantial portions of the Software.
|
15 |
+
|
16 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
17 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
18 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
19 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
20 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
21 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
22 |
+
SOFTWARE.
|
23 |
+
"""
|
24 |
+
|
25 |
+
import json
|
26 |
+
import tempfile
|
27 |
+
import uuid
|
28 |
+
from pathlib import Path
|
29 |
+
from typing import Any, Dict, List, Optional, Union
|
30 |
+
|
31 |
+
import pyarrow as pa
|
32 |
+
import pyarrow.parquet as pq
|
33 |
+
from huggingface_hub import CommitScheduler, HfApi
|
34 |
+
|
35 |
+
|
36 |
+
class ParquetScheduler(CommitScheduler):
    """
    Usage: configure the scheduler with a repo id. Once started, you can add data to be uploaded to the Hub. 1 `.append`
    call will result in 1 row in your final dataset.

    ```py
    # Start scheduler
    >>> scheduler = ParquetScheduler(repo_id="my-parquet-dataset")

    # Append some data to be uploaded
    >>> scheduler.append({...})
    >>> scheduler.append({...})
    >>> scheduler.append({...})
    ```

    The scheduler will automatically infer the schema from the data it pushes.
    Optionally, you can manually set the schema yourself:

    ```py
    >>> scheduler = ParquetScheduler(
    ...     repo_id="my-parquet-dataset",
    ...     schema={
    ...         "prompt": {"_type": "Value", "dtype": "string"},
    ...         "negative_prompt": {"_type": "Value", "dtype": "string"},
    ...         "guidance_scale": {"_type": "Value", "dtype": "int64"},
    ...         "image": {"_type": "Image"},
    ...     },
    ... )

    See https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Value for the list of
    possible values.
    """

    def __init__(
        self,
        *,
        repo_id: str,
        schema: Optional[Dict[str, Dict[str, str]]] = None,
        every: Union[int, float] = 5,
        revision: Optional[str] = None,
        private: bool = False,
        token: Optional[str] = None,
        allow_patterns: Union[List[str], str, None] = None,
        ignore_patterns: Union[List[str], str, None] = None,
        hf_api: Optional[HfApi] = None,
    ) -> None:
        super().__init__(
            repo_id=repo_id,
            folder_path=tempfile.tempdir,  # not used by the scheduler
            every=every,
            repo_type="dataset",
            revision=revision,
            private=private,
            token=token,
            allow_patterns=allow_patterns,
            ignore_patterns=ignore_patterns,
            hf_api=hf_api,
        )

        # Rows buffered since the last push; access is guarded by self.lock.
        self._rows: List[Dict[str, Any]] = []
        # User-supplied feature schema, or None to infer from appended rows.
        self._schema = schema

    def append(self, row: Dict[str, Any]) -> None:
        """Add a new item to be uploaded."""
        with self.lock:
            self._rows.append(row)

    def push_to_hub(self):
        # Check for new rows to push. Swap the buffer out under the lock so
        # append() can keep running while the parquet file is built/uploaded.
        with self.lock:
            rows = self._rows
            self._rows = []
        if not rows:
            return
        print(f"Got {len(rows)} item(s) to commit.")

        # Load images + create 'features' config for datasets library
        schema: Dict[str, Dict] = self._schema or {}
        path_to_cleanup: List[Path] = []
        for row in rows:
            for key, value in row.items():
                # Infer schema (for `datasets` library)
                if key not in schema:
                    schema[key] = _infer_schema(key, value)

                # Load binary files if necessary
                if schema[key]["_type"] in ("Image", "Audio"):
                    if isinstance(value, bytes):
                        # Raw bytes were appended directly; wrap them in the
                        # {path, bytes} dict form the datasets library expects.
                        row[key] = {
                            "path": "",
                            "bytes": value,
                        }
                    else:
                        # It's an image or audio: we load the bytes and remember to cleanup the file
                        file_path = Path(value)
                        if file_path.is_file():
                            row[key] = {
                                "path": file_path.name,
                                "bytes": file_path.read_bytes(),
                            }
                            path_to_cleanup.append(file_path)

        # Complete rows if needed: every row must carry every schema column
        # (missing features become None so the table is rectangular).
        for row in rows:
            for feature in schema:
                if feature not in row:
                    row[feature] = None

        # Export items to Arrow format
        table = pa.Table.from_pylist(rows)

        # Add metadata (used by datasets library)
        table = table.replace_schema_metadata(
            {"huggingface": json.dumps({"info": {"features": schema}})}
        )

        # Write to parquet file
        archive_file = tempfile.NamedTemporaryFile()
        pq.write_table(table, archive_file.name)

        # Upload under a unique random name so pushes never overwrite
        # each other in the dataset repo.
        self.api.upload_file(
            repo_id=self.repo_id,
            repo_type=self.repo_type,
            revision=self.revision,
            path_in_repo=f"{uuid.uuid4()}.parquet",
            path_or_fileobj=archive_file.name,
        )
        print("Commit completed.")

        # Cleanup: closing the NamedTemporaryFile deletes it; also remove
        # any source media files whose bytes are now stored in the parquet.
        archive_file.close()
        for path in path_to_cleanup:
            path.unlink(missing_ok=True)
|
170 |
+
|
171 |
+
|
172 |
+
def _infer_schema(key: str, value: Any) -> Dict[str, str]:
|
173 |
+
"""Infer schema for the `datasets` library.
|
174 |
+
|
175 |
+
See
|
176 |
+
https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Value.
|
177 |
+
"""
|
178 |
+
if "image" in key:
|
179 |
+
return {"_type": "Image"}
|
180 |
+
if "audio" in key:
|
181 |
+
return {"_type": "Audio"}
|
182 |
+
if isinstance(value, int):
|
183 |
+
return {"_type": "Value", "dtype": "int64"}
|
184 |
+
if isinstance(value, float):
|
185 |
+
return {"_type": "Value", "dtype": "float64"}
|
186 |
+
if isinstance(value, bool):
|
187 |
+
return {"_type": "Value", "dtype": "bool"}
|
188 |
+
if isinstance(value, bytes):
|
189 |
+
return {"_type": "Value", "dtype": "binary"}
|
190 |
+
# Otherwise in last resort => convert it to a string
|
191 |
+
return {"_type": "Value", "dtype": "string"}
|
search.py
ADDED
@@ -0,0 +1,230 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import base64
|
2 |
+
from io import BytesIO
|
3 |
+
import numpy as np
|
4 |
+
import streamlit as st
|
5 |
+
from PIL import Image
|
6 |
+
import pandas as pd
|
7 |
+
from datasets import load_dataset
|
8 |
+
from grascii import GrasciiSearcher, InvalidGrascii, ReverseSearcher
|
9 |
+
from report import report_dialog
|
10 |
+
from vision import run_vision
|
11 |
+
from save_image import save_image
|
12 |
+
|
13 |
+
|
14 |
+
@st.cache_data(show_spinner="Loading shorthand images")
def load_images():
    """Build and cache a map from longhand word to a base64 PNG data URI
    of its shorthand image, sourced from the pre-anniversary words dataset.
    """
    dataset = load_dataset(
        "grascii/gregg-preanniversary-words", split="train", token=st.secrets.HF_TOKEN
    )
    images = {}
    for record in dataset:
        # Re-encode each PIL image as PNG and embed it as a data URI so
        # Streamlit image columns can render it without a file server.
        png_buffer = BytesIO()
        record["image"].save(png_buffer, format="PNG")
        encoded = base64.b64encode(png_buffer.getvalue()).decode("utf-8")
        images[record["longhand"]] = f"data:image/png;base64,{encoded}"
    return images
|
26 |
+
|
27 |
+
|
28 |
+
# Module-level longhand -> data-URI map; cheap to rebuild on rerun because
# load_images() is memoized by st.cache_data.
image_map = load_images()
|
29 |
+
|
30 |
+
|
31 |
+
def set_grascii():
    """Form callback: sync the Grascii text box into the shared
    st.session_state["grascii"] key used by both search paths."""
    state = st.session_state
    if "grascii_text_box" in state:
        state["grascii"] = state["grascii_text_box"]
|
34 |
+
|
35 |
+
|
36 |
+
def write_grascii_search():
    """Render the Grascii search tab.

    The query may come from a text box or from an uploaded shorthand image
    (run through the vision model); both paths share
    st.session_state["grascii"] so the query survives the mode switch.
    Search options live in an expander inside the form; results are
    rendered via write_results().

    Fixes vs. previous revision: user-facing typos in help texts
    ("intepret" -> "interpret", "asirates" -> "aspirates").
    """
    searcher = GrasciiSearcher()
    grascii_results = []

    search_by = st.radio("Search by", ["text", "image (beta)"], horizontal=True)

    with st.form("Grascii Search"):
        placeholder = st.empty()
        if search_by == "text":
            placeholder.text_input(
                "Grascii", value=st.session_state["grascii"], key="grascii_text_box"
            )
        else:
            image_data = placeholder.file_uploader(
                "Image",
                type=["png", "jpg"],
                help="""
                Upload an image of a shorthand form.

                At this time, minimal preprocessing is performed on images
                before running them through the model. For best results,
                upload an image:

                - of a closely cropped, single shorthand form
                - with the shorthand written in black on a white background
                - that does not contain marks beside the shorthand form
                """,
            )
            save = st.checkbox(
                "Save images I upload for potential inclusion in open-source datasets used to train and improve models"
            )

            if image_data:
                # Flatten transparency onto a white background before the
                # grayscale conversion expected by the vision model.
                image = Image.open(image_data).convert("RGBA")
                background = Image.new("RGBA", image.size, (255, 255, 255))
                alpha_composite = Image.alpha_composite(background, image)

                # Model expects a batch dimension: shape (1, H, W) grayscale.
                arr = np.array([alpha_composite.convert("L")])
                tokens = run_vision(arr)
                st.session_state["grascii"] = "".join(tokens)
                if save:
                    save_image(image_data.getvalue(), "-".join(tokens))

        with st.expander("Options"):
            interpretation = st.radio(
                "Interpretation",
                ["best", "all"],
                horizontal=True,
                help="""
                How to interpret ambiguous Grascii strings.

                - best: Only search using the best interpretation
                - all: Search using all possible interpretations.
                """,
            )
            uncertainty = st.slider(
                "Uncertainty",
                min_value=0,
                max_value=2,
                help="The uncertainty of the strokes in the Grascii string",
            )
            fix_first = st.checkbox(
                "Fix First", help="Apply an uncertainty of 0 to the first token"
            )
            search_mode = st.selectbox(
                "Search Mode",
                ["match", "start", "contain"],
                help="""
                The type of search to perform.

                - match: Search for entries that closely match the Grascii string
                - start: Search for entries that start with the Grascii string
                - contain: Search for entries that contain the Grascii string
                """,
            )
            annotation_mode = st.selectbox(
                "Annotation Mode",
                ["strict", "retain", "discard"],
                index=2,
                help="""
                How to handle Grascii annotations.

                - discard: Annotations are discarded.
                    Search results may contain annotations in any location.
                - retain: Annotations in the input must appear in search results.
                    Other annotations may appear in the results.
                - strict: Annotations in the input must appear in search results.
                    Other annotations may not appear in the results.
                """,
            )
            aspirate_mode = st.selectbox(
                "Aspirate Mode",
                ["strict", "retain", "discard"],
                index=2,
                help="""
                How to handle Grascii aspirates (').

                - discard: Aspirates are discarded.
                    Search results may contain aspirates in any location.
                - retain: Aspirates in the input must appear in search results.
                    Other aspirates may appear in the results.
                - strict: Aspirates in the input must appear in search results.
                    Other aspirates may not appear in the results.
                """,
            )
            disjoiner_mode = st.selectbox(
                "Disjoiner Mode",
                ["strict", "retain", "discard"],
                index=0,
                help="""
                How to handle Grascii disjoiners (^).

                - discard: Disjoiners are discarded.
                    Search results may contain disjoiners in any location.
                - retain: Disjoiners in the input must appear in search results.
                    Other disjoiners may appear in the results.
                - strict: Disjoiners in the input must appear in search results.
                    Other disjoiners may not appear in the results.
                """,
            )

        st.form_submit_button("Search", on_click=set_grascii)

    grascii = st.session_state["grascii"]

    try:
        grascii_results = searcher.sorted_search(
            grascii=grascii,
            interpretation=interpretation,
            uncertainty=uncertainty,
            fix_first=fix_first,
            search_mode=search_mode,
            annotation_mode=annotation_mode,
            aspirate_mode=aspirate_mode,
            disjoiner_mode=disjoiner_mode,
        )
    except InvalidGrascii as e:
        # Only surface the error for a non-empty query; an empty query is
        # the initial page state, not a user mistake.
        if grascii:
            st.error(f"Invalid Grascii\n```\n{e.context}\n```")
    else:
        write_results(grascii_results, grascii.upper(), "grascii")
|
177 |
+
|
178 |
+
|
179 |
+
@st.fragment
def write_results(results, term, key_prefix):
    """Render search results as a selectable table with a flag-for-review
    button.

    Declared a fragment so that row selection reruns only this section
    instead of the whole page. ``key_prefix`` keeps widget keys unique
    between the Grascii and Reverse tabs.
    """
    table = pd.DataFrame(
        [
            [
                hit.entry.grascii,
                hit.entry.translation,
                image_map.get(hit.entry.translation),
            ]
            for hit in results
        ]
    )

    # Singular/plural result count header.
    noun = "Result" if len(table) == 1 else "Results"
    st.write(f'{len(table)} {noun} for "{term}"')

    event = st.dataframe(
        table,
        use_container_width=True,
        column_config={
            "0": "Grascii",
            "1": "Longhand",
            "2": st.column_config.ImageColumn("Shorthand", width="medium"),
        },
        selection_mode="multi-row",
        on_select="rerun",
        key=key_prefix + "_data_frame",
    )
    selected_rows = event.selection.rows

    flag_clicked = st.button(
        "Flag Selected Rows",
        key=key_prefix + "_report_button",
        disabled=not selected_rows,
    )
    if flag_clicked:
        report_dialog(table.iloc[selected_rows])
|
214 |
+
|
215 |
+
|
216 |
+
def write_reverse_search():
    """Render the reverse-search tab: look up Grascii forms by longhand
    word(s) and display them via write_results().
    """
    searcher = ReverseSearcher()

    with st.form("Reverse Search"):
        word = st.text_input("Word(s)")

        st.form_submit_button("Search")

    # Fix: the previous revision guarded the search and the rendering with
    # two consecutive identical `if word:` checks; one guard covers both.
    if word:
        reverse_results = searcher.sorted_search(
            reverse=word,
        )
        write_results(reverse_results, word, "reverse")
|
vision.py
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from transformers import (
|
3 |
+
PreTrainedTokenizerFast,
|
4 |
+
VisionEncoderDecoderModel,
|
5 |
+
ViTImageProcessor,
|
6 |
+
)
|
7 |
+
|
8 |
+
# Hub id of the vision model that maps shorthand images to Grascii tokens.
model_name = "grascii/gregg-vision-v0.2.1"
|
9 |
+
|
10 |
+
|
11 |
+
@st.cache_resource(show_spinner=f"Loading {model_name}")
def load_model():
    """Load and cache the vision encoder-decoder model, its tokenizer, and
    the image processor.

    Cached with st.cache_resource so the model weights load once per
    server process and are shared across sessions.
    """
    auth_token = st.secrets.HF_TOKEN
    model = VisionEncoderDecoderModel.from_pretrained(model_name, token=auth_token)
    tokenizer = PreTrainedTokenizerFast.from_pretrained(model_name, token=auth_token)
    processor = ViTImageProcessor.from_pretrained(model_name, token=auth_token)
    return model, tokenizer, processor
|
22 |
+
|
23 |
+
|
24 |
+
@st.cache_data(ttl=3600, show_spinner=f"Running {model_name}")
def run_vision(image):
    """Run the vision model on a grayscale image batch and return the
    predicted token strings with special tokens stripped.

    Memoized for an hour keyed on the image contents, so re-uploading the
    same image does not rerun inference.
    """
    model, tokenizer, processor = load_model()
    pixels = processor(image, return_tensors="pt").pixel_values
    # Take the first (only) sequence from the generated batch.
    token_ids = model.generate(pixels, max_new_tokens=12)[0]
    return tokenizer.convert_ids_to_tokens(token_ids, skip_special_tokens=True)
|