Spaces: Runtime error
dkoshman committed · Commit e33424f · 1 Parent(s): 02f3832
dataset, preprocessing, gitignore
Browse files:
- .gitignore +2 -0
- app.py +14 -1
- data_preprocessing.py +107 -0
- resources/latex.json +1 -0
.gitignore
ADDED
@@ -0,0 +1,2 @@
+*.ipynb
+/__pycache__
app.py
CHANGED
@@ -1,4 +1,17 @@
 import streamlit as st
 
 st.markdown("### Hello, world!")
-st.
+st.markdown("<img width=200px src='https://rozetked.me/images/uploads/dwoilp3BVjlE.jpg'>", unsafe_allow_html=True)
+# ^-- you can show the user text, images, and a limited subset of html - everything works as in jupyter
+
+text = st.text_area("TEXT HERE")
+# ^-- render a text area; text holds whatever string is currently in the field
+
+# from transformers import pipeline
+# pipe = pipeline("ner", "Davlan/distilbert-base-multilingual-cased-ner-hrl")
+# raw_predictions = pipe(text)
+# the familiar huggingface.transformers code -- it can be replaced with anything from fairseq to catboost
+
+# st.markdown(f"{raw_predictions}")
+st.markdown(f"Simon says {text}!")
+# show the model's output back to the user, for their amusement
data_preprocessing.py
ADDED
@@ -0,0 +1,107 @@
+import os
+import re
+import tokenizers
+import torch
+import torchvision
+import torchvision.transforms as T
+import tqdm
+import PIL
+from torch.utils.data import Dataset, DataLoader
+
+
+directory = "/external2/dkkoshman/repos/ML2TransformerApp/data/"
+
+class TexImageDataset(Dataset):
+    """Image to tex dataset."""
+
+    def __init__(self, root_dir, image_preprocessing=None, tex_preprocessing=None):
+        """
+        Args:
+            root_dir (string): Directory with all the images and tex files.
+
+            image_preprocessing (callable, optional): Optional preprocessing
+                to be applied on each image.
+
+            tex_preprocessing (callable, optional): Optional preprocessing
+                to be applied on each tex string.
+        """
+
+        torch.multiprocessing.set_sharing_strategy('file_system')
+        self.root_dir = root_dir
+        filenames = sorted(
+            set(os.path.splitext(filename)[0] for filename in os.listdir(root_dir) if filename.endswith('png'))
+        )
+        self.data = []
+
+        for filename in tqdm.tqdm(filenames):
+            tex_path = self.root_dir + filename + '.tex'
+            image_path = self.root_dir + filename + '.png'
+
+            with open(tex_path) as file:
+                tex = file.read()
+            if tex_preprocessing:
+                tex = tex_preprocessing(tex)
+
+            image = torchvision.io.read_image(image_path)
+            if image_preprocessing:
+                image = image_preprocessing(image)
+
+            self.data.append((image, tex))
+
+    def __len__(self):
+        return len(self.data)
+
+    def __getitem__(self, idx):
+        image, tex = self.data[idx]
+        return {"image": image, "tex": tex}
+
+
+class StandardizeImage(object):
+    """Pad and crop image to a given size, invert and normalize"""
+
+    def __init__(self, width=1024, height=128):
+        self.transform = T.Compose((
+            T.Resize(height),
+            T.Grayscale(),
+            T.functional.invert,
+            T.CenterCrop((height, width))
+        ))
+
+    def __call__(self, image):
+        image = self.transform(image)
+        return image
+
+
+class RandomTransformImage(object):
+    """Standardize image and randomly augment"""
+
+    def __init__(self, standardize, random_magnitude=5):
+        self.brighten = T.ColorJitter(brightness=(1/random_magnitude, 1 + 1/random_magnitude))
+        self.standardize = standardize
+        self.rand_aug = T.RandAugment(magnitude=random_magnitude)
+
+    def __call__(self, image):
+        image = self.brighten(image)
+        image = self.standardize(image)
+        image = image.contiguous()
+        image = self.rand_aug(image)
+        return image
+
+
+def generate_tex_tokenizer(dataset):
+    """Returns a tokenizer trained on tex strings from dataset"""
+
+    tokenizer = tokenizers.Tokenizer(tokenizers.models.BPE(unk_token="[UNK]"))
+    tokenizer_trainer = tokenizers.trainers.BpeTrainer(
+        vocab_size=300,
+        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
+    )
+    tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.Whitespace()
+    tokenizer.train_from_iterator((item['tex'] for item in dataset), trainer=tokenizer_trainer)
+    tokenizer.post_processor = tokenizers.processors.TemplateProcessing(
+        single="$A [SEP]",
+        special_tokens=[("[SEP]", tokenizer.token_to_id("[SEP]"))]
+    )
+    tokenizer.enable_padding(pad_id=tokenizer.token_to_id("[PAD]"), pad_token="[PAD]")
+
+    return tokenizer
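Taken together, the pieces above suggest a call order for building the training data; a hypothetical usage sketch, not part of the commit (batching and collation of the variable-length tex strings are left out):

# hypothetical usage -- assumes directory holds matching .png/.tex pairs
image_preprocessing = RandomTransformImage(StandardizeImage(width=1024, height=128))
dataset = TexImageDataset(directory, image_preprocessing=image_preprocessing)
tokenizer = generate_tex_tokenizer(dataset)

sample = dataset[0]
encoding = tokenizer.encode(sample["tex"])
print(sample["image"].shape)  # (1, 128, 1024) after StandardizeImage
print(encoding.tokens)        # BPE tokens ending with the [SEP] added by the post-processor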
resources/latex.json
ADDED
@@ -0,0 +1 @@
+{"special": {"dollar": "$", "underscore": "_", "caret": "^", "left_bracket": "{", "right_bracket": "}", "ampersand": "&"}, "chars": "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"'()*+,-./:;<=>?@[]`|~", "greek": ["\\alpha", "\\beta", "\\gamma", "\\delta", "\\epsilon", "\\varepsilon", "\\zeta", "\\eta", "\\theta", "\\vartheta", "\\iota", "\\kappa", "\\lambda", "\\mu", "\\nu", "\\xi", "\\pi", "\\varpi", "\\rho", "\\varrho", "\\sigma", "\\varsigma", "\\tau", "\\upsilon", "\\phi", "\\varphi", "\\chi", "\\psi", "\\omega", "\\Gamma", "\\Delta", "\\Theta", "\\Lambda", "\\Xi", "\\Pi", "\\Sigma", "\\Upsilon", "\\Phi", "\\Psi", "\\Omega"], "functions": ["\\forall", "\\exists", "\\arccos", "\\arcsin", "\\arctan", "\\cos", "\\cosh", "\\cot", "\\coth", "\\csc", "\\deg", "\\det", "\\dim", "\\exp", "\\gcd", "\\hom", "\\inf", "\\ker", "\\lg", "\\lim", "\\liminf", "\\limsup", "\\ln", "\\log", "\\max", "\\min", "\\sec", "\\sin", "\\sinh", "\\sup", "\\tan", "\\tanh"], "operators": ["--", "---", "\\pm", "\\mp", "\\times", "\\div", "\\ast", "\\star", "\\bullet", "\\circ", "\\cdot", "\\leq", "\\ll", "\\subset", "\\geq", "\\gg", "\\equiv", "\\sim", "\\simeq", "\\approx", "\\neq", "\\propto", "\\not", "\\mid", "\\leftarrow", "\\Leftarrow", "\\longleftarrow", "\\Longleftarrow", "\\rightarrow", "\\Rightarrow", "\\longrightarrow", "\\Longrightarrow", "\\leftrightarrow", "\\Leftrightarrow", "\\longleftrightarrow", "\\uparrow", "\\downarrow", "\\Uparrow", "\\cdots", "\\ddots", "\\ldots", "\\vdots"], "pairs": [["\\left(", "\\right)"], ["\\left[", "\\right]"], ["\\left\\{", "\\right\\}"], ["\\langle", "\\rangle"]], "spaces": ["\\;", "\\:", "\\,", "\\!"], "fonts": [["sfmath", []], ["lmodern", []], ["eulervm", []], ["euler", []], ["beton", []], ["drm", []], ["boisik", []], ["gfsartemisia-euler", []], ["gfsartemisia", []], ["arev", []], ["anttor", ["math", "light,math", "condensed,math", "light,condensed,math"]]], "fontsizes": [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20], "template": "\\documentclass[preview]{standalone}\n\\usepackage[$font_option]{$font}\n\\usepackage[T1]{fontenc}\n\\begin{document}\n{\\fontsize{$fontsize pt}{12 pt}\\selectfont \n\\[\n$equation\n\\]\n}\n\\end{document}", "scopes": {"single": ["^", "_", "\\sqrt", "\\underbrace", "\\underline", "\\boldmath", "\\hat", "\\widehat", "\\check", "\\tilde", "\\widetilde", "\\acute", "\\grave", "\\dot", "\\ddot", "\\breve", "\\bar", "\\vec"], "double_with_delimiters": ["\\sum", "\\prod", "\\int", "\\bigcup", "\\bigcap"], "double_no_delimiters": ["\\frac", "\\stackrel"]}}
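The $font, $font_option, $fontsize, and $equation placeholders in the template field match Python's string.Template syntax, which hints at how a rendering script might consume this file. A hedged sketch under that assumption (the actual generation script is not part of this commit, and the example equation is hypothetical):

import json
import random
import string

with open("resources/latex.json") as file:
    config = json.load(file)

# pick a random font and size, then fill the standalone-document template
font, options = random.choice(config["fonts"])
document = string.Template(config["template"]).substitute(
    font=font,
    font_option=options[0] if options else "",  # most fonts list no package options
    fontsize=random.choice(config["fontsizes"]),
    equation=r"\sum_{i=0}^{n} \alpha_i",  # hypothetical equation drawn from the vocabulary above
)
print(document)  # a complete LaTeX document, ready to compile to a .png/.tex training pair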