dkoshman committed on
Commit e33424f
1 Parent(s): 02f3832

dataset, preprocessing, gitignore

Files changed (4)
  1. .gitignore +2 -0
  2. app.py +14 -1
  3. data_preprocessing.py +107 -0
  4. resources/latex.json +1 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
+ *.ipynb
+ /__pycache__
app.py CHANGED
@@ -1,4 +1,17 @@
  import streamlit as st

  st.markdown("### Hello, world!")
- st.write("hello")
+ st.markdown("<img width=200px src='https://rozetked.me/images/uploads/dwoilp3BVjlE.jpg'>", unsafe_allow_html=True)
+ # ^-- you can show the user text, images, and a limited subset of HTML - everything works just like in Jupyter
+
+ text = st.text_area("TEXT HERE")
+ # ^-- display a text area; the variable text holds whatever string is currently in the field
+
+ # from transformers import pipeline
+ # pipe = pipeline("ner", "Davlan/distilbert-base-multilingual-cased-ner-hrl")
+ # raw_predictions = pipe(text)
+ # the huggingface.transformers code you already know -- it can be swapped for anything from fairseq to catboost
+
+ # st.markdown(f"{raw_predictions}")
+ st.markdown(f"Simon says {text}!")
+ # show the model's output in a text field, for the user's amusement
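A minimal sketch of how the commented-out model code above could be wired in once a model is chosen. The pipeline name is the one from the comment; everything else (the guard on empty input, the lack of caching) is illustrative and not part of this commit. The app itself is launched with "streamlit run app.py".

import streamlit as st
from transformers import pipeline

st.markdown("### Hello, world!")
text = st.text_area("TEXT HERE")

# NER pipeline from the commented-out lines above; reloading it on every rerun is slow,
# so in practice it would likely be wrapped in one of Streamlit's caching decorators.
pipe = pipeline("ner", "Davlan/distilbert-base-multilingual-cased-ner-hrl")

if text:
    raw_predictions = pipe(text)       # list of dicts with entity, score, word, start, end
    st.markdown(f"{raw_predictions}")  # echo the raw predictions back to the user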
data_preprocessing.py ADDED
@@ -0,0 +1,107 @@
+ import os
+ import re
+ import tokenizers
+ import torch
+ import torchvision
+ import torchvision.transforms as T
+ import tqdm
+ import PIL
+ from torch.utils.data import Dataset, DataLoader
+
+
+ directory = "/external2/dkkoshman/repos/ML2TransformerApp/data/"
+
+ class TexImageDataset(Dataset):
+     """Image to tex dataset."""
+
+     def __init__(self, root_dir, image_preprocessing=None, tex_preprocessing=None):
+         """
+         Args:
+             root_dir (string): Directory with all the images and tex files.
+
+             image_preprocessing (callable, optional): transform applied to
+                 each image when it is loaded.
+
+             tex_preprocessing (callable, optional): transform applied to
+                 each tex string when it is loaded.
+         """
+
+         torch.multiprocessing.set_sharing_strategy('file_system')
+         self.root_dir = root_dir
+         filenames = sorted(
+             set(os.path.splitext(filename)[0] for filename in os.listdir(root_dir) if filename.endswith('png'))
+         )
+         self.data = []
+
+         for filename in tqdm.tqdm(filenames):
+             tex_path = self.root_dir + filename + '.tex'
+             image_path = self.root_dir + filename + '.png'
+
+             with open(tex_path) as file:
+                 tex = file.read()
+             if tex_preprocessing:
+                 tex = tex_preprocessing(tex)
+
+             image = torchvision.io.read_image(image_path)
+             if image_preprocessing:
+                 image = image_preprocessing(image)
+
+             self.data.append((image, tex))
+
+     def __len__(self):
+         return len(self.data)
+
+     def __getitem__(self, idx):
+         image, tex = self.data[idx]
+         return {"image": image, "tex": tex}
+
+
+ class StandardizeImage(object):
+     """Resize, grayscale, invert and center-crop image to a given size"""
+
+     def __init__(self, width=1024, height=128):
+         self.transform = T.Compose((
+             T.Resize(height),
+             T.Grayscale(),
+             T.functional.invert,
+             T.CenterCrop((height, width))
+         ))
+
+     def __call__(self, image):
+         image = self.transform(image)
+         return image
+
+
+ class RandomTransformImage(object):
+     """Standardize image and randomly augment"""
+
+     def __init__(self, standardize, random_magnitude=5):
+         self.brighten = T.ColorJitter(brightness=(1/random_magnitude, 1 + 1/random_magnitude))
+         self.standardize = standardize
+         self.rand_aug = T.RandAugment(magnitude=random_magnitude)
+
+     def __call__(self, image):
+         image = self.brighten(image)
+         image = self.standardize(image)
+         image = image.contiguous()
+         image = self.rand_aug(image)
+         return image
+
+
+ def generate_tex_tokenizer(dataset):
+     """Returns a tokenizer trained on tex strings from dataset"""
+
+     tokenizer = tokenizers.Tokenizer(tokenizers.models.BPE(unk_token="[UNK]"))
+     tokenizer_trainer = tokenizers.trainers.BpeTrainer(
+         vocab_size=300,
+         special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
+     )
+     tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.Whitespace()
+     tokenizer.train_from_iterator((item['tex'] for item in dataset), trainer=tokenizer_trainer)
+     tokenizer.post_processor = tokenizers.processors.TemplateProcessing(
+         single="$A [SEP]",
+         special_tokens=[("[SEP]", tokenizer.token_to_id("[SEP]"))]
+     )
+     tokenizer.enable_padding(pad_id=tokenizer.token_to_id("[PAD]"), pad_token="[PAD]")
+
+     return tokenizer
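For context, a hedged sketch of how the pieces defined in this file could fit together. The directory path is the one hard-coded above; the batch size and the decision to skip tex preprocessing are arbitrary choices for illustration, not part of the commit.

from torch.utils.data import DataLoader

# Build the dataset with the augmentation pipeline defined above.
dataset = TexImageDataset(
    root_dir="/external2/dkkoshman/repos/ML2TransformerApp/data/",
    image_preprocessing=RandomTransformImage(StandardizeImage()),
    tex_preprocessing=None,
)

# Train a BPE tokenizer on the tex strings and encode one batch as a smoke test.
tokenizer = generate_tex_tokenizer(dataset)
loader = DataLoader(dataset, batch_size=8, shuffle=True)

batch = next(iter(loader))                        # {"image": stacked tensor, "tex": list of strings}
encodings = tokenizer.encode_batch(batch["tex"])  # padded, each sequence ending in [SEP]
token_ids = [encoding.ids for encoding in encodings]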
resources/latex.json ADDED
@@ -0,0 +1 @@
+ {"special": {"dollar": "$", "underscore": "_", "caret": "^", "left_bracket": "{", "right_bracket": "}", "ampersand": "&"}, "chars": "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"'()*+,-./:;<=>?@[]`|~", "greek": ["\\alpha", "\\beta", "\\gamma", "\\delta", "\\epsilon", "\\varepsilon", "\\zeta", "\\eta", "\\theta", "\\vartheta", "\\iota", "\\kappa", "\\lambda", "\\mu", "\\nu", "\\xi", "\\pi", "\\varpi", "\\rho", "\\varrho", "\\sigma", "\\varsigma", "\\tau", "\\upsilon", "\\phi", "\\varphi", "\\chi", "\\psi", "\\omega", "\\Gamma", "\\Delta", "\\Theta", "\\Lambda", "\\Xi", "\\Pi", "\\Sigma", "\\Upsilon", "\\Phi", "\\Psi", "\\Omega"], "functions": ["\\forall", "\\exists", "\\arccos", "\\arcsin", "\\arctan", "\\cos", "\\cosh", "\\cot", "\\coth", "\\csc", "\\deg", "\\det", "\\dim", "\\exp", "\\gcd", "\\hom", "\\inf", "\\ker", "\\lg", "\\lim", "\\liminf", "\\limsup", "\\ln", "\\log", "\\max", "\\min", "\\sec", "\\sin", "\\sinh", "\\sup", "\\tan", "\\tanh"], "operators": ["--", "---", "\\pm", "\\mp", "\\times", "\\div", "\\ast", "\\star", "\\bullet", "\\circ", "\\cdot", "\\leq", "\\ll", "\\subset", "\\geq", "\\gg", "\\equiv", "\\sim", "\\simeq", "\\approx", "\\neq", "\\propto", "\\not", "\\mid", "\\leftarrow", "\\Leftarrow", "\\longleftarrow", "\\Longleftarrow", "\\rightarrow", "\\Rightarrow", "\\longrightarrow", "\\Longrightarrow", "\\leftrightarrow", "\\Leftrightarrow", "\\longleftrightarrow", "\\uparrow", "\\downarrow", "\\Uparrow", "\\cdots", "\\ddots", "\\ldots", "\\vdots"], "pairs": [["\\left(", "\\right)"], ["\\left[", "\\right]"], ["\\left\\{", "\\right\\}"], ["\\langle", "\\rangle"]], "spaces": ["\\;", "\\:", "\\,", "\\!"], "fonts": [["sfmath", []], ["lmodern", []], ["eulervm", []], ["euler", []], ["beton", []], ["drm", []], ["boisik", []], ["gfsartemisia-euler", []], ["gfsartemisia", []], ["arev", []], ["anttor", ["math", "light,math", "condensed,math", "light,condensed,math"]]], "fontsizes": [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20], "template": "\\documentclass[preview]{standalone}\n\\usepackage[$font_option]{$font}\n\\usepackage[T1]{fontenc}\n\\begin{document}\n{\\fontsize{$fontsize pt}{12 pt}\\selectfont \n\\[\n$equation\n\\]\n}\n\\end{document}", "scopes": {"single": ["^", "_", "\\sqrt", "\\underbrace", "\\underline", "\\boldmath", "\\hat", "\\widehat", "\\check", "\\tilde", "\\widetilde", "\\acute", "\\grave", "\\dot", "\\ddot", "\\breve", "\\bar", "\\vec"], "double_with_delimiters": ["\\sum", "\\prod", "\\int", "\\bigcup", "\\bigcap"], "double_no_delimiters": ["\\frac", "\\stackrel"]}}
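The template field uses $-style placeholders ($font_option, $font, $fontsize, $equation), so it can plausibly be filled with Python's string.Template. A sketch under that assumption; the concrete font, size, and equation below are arbitrary examples, not values taken from the repo's generation code.

import json
from string import Template

# Load the LaTeX generation config added in this commit.
with open("resources/latex.json") as file:
    config = json.load(file)

# Pick a font and size from the config; these particular choices are illustrative only.
font, font_options = config["fonts"][1]             # ["lmodern", []]
source = Template(config["template"]).substitute(
    font=font,
    font_option=font_options[0] if font_options else "",
    fontsize=config["fontsizes"][4],                 # 10
    equation=r"\frac{a}{b} + \sum_{i=1}^{n} x_i",
)
print(source)  # a standalone LaTeX document that could be compiled into a formula image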