vosatorp committed on
Commit 2ce3f0c
1 Parent(s): 264499a

Update app.py

Files changed (1)
  1. app.py +68 -192
app.py CHANGED
@@ -1,207 +1,83 @@
  import streamlit as st
- from nltk.tokenize import sent_tokenize, word_tokenize
- import warnings
- import gensim
- from gensim.models.word2vec import Word2Vec
- import gensim.downloader
- import pandas as pd
- from random import choice, shuffle
- from mnemonic import Mnemonic
- import bip32utils
- import numpy as np
- import sys, re
- from time import sleep
- from urllib.request import urlopen
- import time
- from joblib import Parallel, delayed
- from tqdm import tqdm
- from stqdm import stqdm
-
- warnings.filterwarnings(action='ignore')
- st.set_page_config(page_title="some solver", layout="centered")
- st.markdown("Welcome!")
- BOUND = 10 ** 6
- good_prefixes = {'19pcB', '1Bbf', '1NSme', '1wLR1', '1KBtw', '1Hu5J', '1F3n8', '172x'}
- good_prefixes_small = {i.lower() for i in good_prefixes}
-
- @st.cache()
- def get():
-     wv = gensim.downloader.load('glove-wiki-gigaword-50')
-     bip39_dict = [row.strip() for row in open("bip39dict.txt").readlines()]
-     forbiddenSet = set()
-     for row in open("forbidden.txt", "r").readlines():
-         forbiddenSet |= set(row.split())
-     return wv, bip39_dict, forbiddenSet
-
- @st.cache()
- def get_2048():
-     table = {}
-     for word in bip39_dict:
-         res = []
-         for elem in bip39_dict:
-             cur_sim = wv.similarity(word, elem)
-             res.append((elem, cur_sim))
-         res.sort(key=lambda x: -x[1])
-         table[word] = res
-     return table
-
- @st.cache()
- def top_from_bip39(word, topn):
      res = []
-     if word not in wv:
-         st.markdown("Sorry, that's not a dictionary word")
-         return []
-     for elem in bip39_dict:
-         cur_sim = wv.similarity(word, elem)
-         res.append((elem, cur_sim))
-     res.sort(key=lambda x: -x[1])
-     return res[:topn]
-
- def check_balance(address):
-     blockchain_tags_json = ['total_received', 'final_balance']
-     SATOSHIS_PER_BTC = 1e+8
-     check_address = address
-     parse_address_structure = re.match(r' *([a-zA-Z1-9]{1,34})', check_address)
-     check_address = parse_address_structure.group(1)
-     htmlfile = urlopen("https://blockchain.info/address/%s?format=json" % check_address, timeout=10)
-     htmltext = htmlfile.read().decode('utf-8')
-     blockchain_info_array = []
-     for tag in blockchain_tags_json:
-         blockchain_info_array.append(
-             float(re.search(r'%s":(\d+),' % tag, htmltext).group(1)))
-     res = {}
-     out = "Bitcoin Address " + check_address + "\n"
-     for i, btc_tokens in enumerate(blockchain_info_array):
-         num = max(0, btc_tokens) / SATOSHIS_PER_BTC
-         res[blockchain_tags_json[i]] = num
-         out += "%s \t " % blockchain_tags_json[i] + "%.8f Bitcoin" % num + "\n"
-     st.text(out)
      return res
-
- mnemon = Mnemonic('english')
- words = mnemon.generate(256)
- def get_address(mnemonic, passphrase):
-     seed = mnemon.to_seed(mnemonic, passphrase=passphrase)
-     root_key = bip32utils.BIP32Key.fromEntropy(seed)
-     child_key = root_key.ChildKey(44 + bip32utils.BIP32_HARDEN
-         ).ChildKey(bip32utils.BIP32_HARDEN
-         ).ChildKey(bip32utils.BIP32_HARDEN
-         ).ChildKey(0).ChildKey(0)
-     child_address = child_key.Address()
-     return child_address
-
- def check(sentence, passphrase=None):
-     if not mnemon.check(' '.join(sentence)):
-         return
-     if passphrase is None:
-         for passphrase in passwords:
-             check(sentence, passphrase)
-         return
-     b44 = get_address(' '.join(sentence), passphrase)
-     flag1 = b44[:5] in good_prefixes or b44[:4] in good_prefixes
-     if flag1:
-         st.markdown(' '.join(sentence))
-         candidates.append((sentence, b44, passphrase))
-         res = check_balance(b44)
-         if res['final_balance'] > 0:
-             founded.append((sentence, b44, passphrase))
-             out = 'FOUND!\n' + '\n'.join(sentence) + '\n' + b44 + '\n' + passphrase + '\n'
-             st.markdown(out)
-             open('FOUNDED.txt', 'w').write(out)
-             exit(0)
-
- def generateAll(monthVars):
-     def gen(pos, cur):
-         if len(res) >= BOUND:
-             return
-         if pos == 12:
-             res.append(cur)
-             return
-         for word in monthVars[pos]:
-             gen(pos + 1, cur + [word])
-     res = []
-     gen(0, [])
-     return res
-
- def searchSeed(MonthVariants):
-     def check_(cur):
-         bucket.append(cur)
-         return
-     assert len(MonthVariants) == 12
-     cntSearched = 0
-     variants = generateAll(MonthVariants)
-     shuffle(variants)
-     st.markdown(f"Number of variants to search: {len(variants)}")
-     bucket = []
-     for curSentence in stqdm(variants):
-         cntSearched += 1
-         for i in range(12):
-             cur = curSentence[i:] + curSentence[:i]
-             check_(cur)
-             cur = cur[::-1]
-             check_(cur)
-         if len(bucket) > 1000:
-             Parallel(n_jobs=n_jobs)(delayed(check)(words) for words in bucket)
-             bucket = []
-
- with st.form(key='similar_words'):
-     st.write('Find similar words from bip39')
-     col1, col2 = st.columns([1, 1])
-     with col1:
-         similar_input = st.text_input("Enter a word")
-     with col2:
-         topn = st.slider("How many top similar words to show?", 10, 100, value=10)
-     submit_button = st.form_submit_button(label='Submit')
-     if submit_button:
-         show_forb = st.checkbox(label='Show discarded words')
-         wv, bip39_dict, forbiddenSet = get()
-         res = top_from_bip39(similar_input.lower(), topn)
-         if not show_forb:
-             res2 = []
-             for word, num in res:
-                 if word not in forbiddenSet:
-                     res2.append((word, num))
-             res = res2.copy()
-         st.markdown("Top similar words")
-         def highlight(s):
-             if s['Word'] in forbiddenSet:
-                 return ['background-color: red'] * len(s)
-             else:
-                 return [None] * len(s)
-         df = pd.DataFrame(res, columns=['Word', 'Similarity'])
-         st.table(df.style.apply(highlight, axis=1))
-
- with st.form(key='check_address'):
-     st.write("Check the balance of an address")
-     col1, col2 = st.columns([2, 1])
-     with col1:
-         sentence_input = st.text_input(label="Enter a 12-word sentence")
-     with col2:
-         password_input = st.text_input(label="Enter a password")
-     submit_button = st.form_submit_button(label='Submit')
-     if submit_button:
-         b44 = get_address(sentence_input, password_input)
-         res = check_balance(b44)
-         if res['final_balance'] > 0:
-             out = 'FOUND!\n' + sentence_input + '\n' + b44 + '\n' + password_input + '\n'
-             st.markdown(out)
-
- with st.form(key='search'):
-     st.write("Brute-force addresses over the candidate variants")
-     col1, col2 = st.columns([7, 3])
-     with col1:
-         cand_input = st.text_area("Enter the list of candidates", height=320, value=open("monthvars.txt").read())
-     with col2:
-         pass_input = st.text_area("Enter the list of passwords", height=320, value=open("passwords.txt").read())
-     n_jobs = 1
-     # n_jobs = st.slider("Number of threads for parallelization", 1, 10, value=1)
-     # debug = st.radio("print intermediate steps", [True, False])
-     submit_button = st.form_submit_button(label='Start the search')
-     if submit_button:
-         candidates = []
-         founded = []
-         monthVars = [row.split() for row in cand_input.split('\n')]
-         passwords = [i.strip() for i in pass_input.split('\n')]
-         start_time = time.time()
-         searchSeed(monthVars[:12])
-
+ import torch
  import streamlit as st
+ import transformers
+
+ from transformers import BertTokenizer, BertForMaskedLM
+ from transformers import BertForSequenceClassification, DataCollatorWithPadding
+
+ st.set_page_config(page_title="style transfer", layout="centered")
+ st.markdown("Welcome to text style transfer. Wait a few seconds for the models to load...")
+
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+ tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+ bert_mlm_positive = BertForMaskedLM.from_pretrained('bert-base-uncased', return_dict=True).to(device).train(False)
+ bert_mlm_negative = BertForMaskedLM.from_pretrained('bert-base-uncased', return_dict=True).to(device).train(False)
+ bert_cls = BertForSequenceClassification.from_pretrained(
+     'bert-base-uncased', return_dict=True, problem_type="multi_label_classification", num_labels=2
+ ).to(device).train(False)
+
+ def get_replacements(sentence: str, num_tokens, k_best, epsilon=1e-3):
+     """
+     - split the sentence into tokens using the INGSOC-approved BERT tokenizer
+     - find :num_tokens: tokens with the lowest positive/negative probability ratio (see below)
+     - replace each of them with :k_best: words according to bert_mlm_positive
+     :return: a list of candidate strings (up to k_best * num_tokens)
+     """
      res = []
+     sentence_ix = tokenizer(sentence, return_tensors='pt')
+     sentence_ix = {key: value.to(device) for key, value in sentence_ix.items()}
+     length = len(sentence_ix['input_ids'][0])
+     # per-position probability of every vocabulary token under each MLM
+     probs_positive = bert_mlm_positive(**sentence_ix).logits.softmax(dim=-1)[0]
+     probs_negative = bert_mlm_negative(**sentence_ix).logits.softmax(dim=-1)[0]
+     p_tokens_positive = probs_positive[torch.arange(length), sentence_ix['input_ids'][0]]
+     p_tokens_negative = probs_negative[torch.arange(length), sentence_ix['input_ids'][0]]
+
+     p_relative = (p_tokens_positive + epsilon) / (p_tokens_negative + epsilon)
+     # positions with the lowest ratio, skipping the [CLS] and [SEP] tokens
+     best_pos = torch.argsort(p_relative[1:-1], dim=0)[:num_tokens] + 1
+     best_pos_tokens = torch.argsort(probs_positive, dim=1)[..., -k_best:]
+     for pos in best_pos:
+         for replace_token in best_pos_tokens[pos]:
+             new_tensor = sentence_ix['input_ids'][0].cpu().numpy()
+             new_tensor[pos] = replace_token
+             new_sentence = tokenizer.decode(new_tensor[1:-1])
+             res.append(new_sentence)
      return res
+
+ def beamSearch(sentence, n_rounds=5):
+     # greedy search (beam width 1): each round, rescore all candidate
+     # replacements with bert_cls and keep the single best sentence
+     labels = torch.tensor([[1, 1]], dtype=torch.float).to(device)
+     for i in range(n_rounds):
+         cur_res = get_replacements(sentence, num_tokens=num_tokens, k_best=k_best)
+         max_prob = -1
+         best_sentence = None
+         for candidate_sentence in cur_res:
+             inputs = tokenizer(candidate_sentence, return_tensors="pt").to(device)
+             outputs = bert_cls(**inputs, labels=labels)
+             prob_good = outputs.logits.softmax(dim=-1)[0][1]
+             if prob_good > max_prob:
+                 max_prob = prob_good
+                 best_sentence = candidate_sentence
+         if debug:
+             st.markdown(f"cur_sentence: {best_sentence}")
+         sentence = best_sentence
+     return sentence
+
+ bert_mlm_positive.load_state_dict(torch.load('mlm_positive.pth', map_location=torch.device('cpu')))
+ bert_mlm_negative.load_state_dict(torch.load('mlm_negative.pth', map_location=torch.device('cpu')))
+ # bert_cls.load_state_dict(torch.load('bert_cls.pth', map_location=torch.device('cpu')))
+
+ user_input = st.text_input("Please enter a review")
+
+ n_rounds = st.slider("Pick the number of rounds in beamSearch", 1, 10, value=5)
+ k_best = st.slider("Pick the k_best parameter", 1, 5, value=3)
+ num_tokens = st.slider("Pick the num_tokens parameter", 1, 5, value=3)
+ debug = st.radio("Print intermediate steps", [True, False])
+
+ if len(user_input.split()) > 0:
+     res = beamSearch(user_input, n_rounds=n_rounds)
+     st.markdown("Processed review:")
+     st.markdown(f"{res}")
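
The core step of the new app, get_replacements, relies on two masked language models fine-tuned on positive and negative reviews (the mlm_positive.pth and mlm_negative.pth checkpoints loaded above): each input token is scored by the ratio of the probabilities the two models assign to it, and the lowest-ratio tokens are rewritten with bert_mlm_positive's top suggestions. Below is a minimal standalone sketch of just that scoring step; score_tokens is a hypothetical helper, not part of the commit, and stock bert-base-uncased weights stand in for the fine-tuned checkpoints.

    import torch
    from transformers import BertTokenizer, BertForMaskedLM

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    # Stand-ins: the app loads fine-tuned weights into models like these.
    mlm_pos = BertForMaskedLM.from_pretrained('bert-base-uncased').eval()
    mlm_neg = BertForMaskedLM.from_pretrained('bert-base-uncased').eval()

    def score_tokens(sentence, epsilon=1e-3):
        """Return (token, ratio) pairs; a low ratio marks a 'negative-sounding' token."""
        ix = tokenizer(sentence, return_tensors='pt')
        ids = ix['input_ids'][0]
        with torch.no_grad():
            p_pos = mlm_pos(**ix).logits.softmax(-1)[0]  # [seq_len, vocab_size]
            p_neg = mlm_neg(**ix).logits.softmax(-1)[0]
        rows = torch.arange(len(ids))
        ratio = (p_pos[rows, ids] + epsilon) / (p_neg[rows, ids] + epsilon)
        return list(zip(tokenizer.convert_ids_to_tokens(ids.tolist()), ratio.tolist()))

With the two checkpoint files present next to app.py, the demo would be launched locally with: streamlit run app.py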