vosatorp committed on
Commit 2ce3f0c
1 Parent(s): 264499a

Update app.py

Files changed (1)
  1. app.py +68 -192
app.py CHANGED
@@ -1,207 +1,83 @@
  import streamlit as st
- from nltk.tokenize import sent_tokenize, word_tokenize
- import warnings
- import gensim
- from gensim.models.word2vec import Word2Vec
- import gensim.downloader
- import pandas as pd
- from random import choice, shuffle
- from mnemonic import Mnemonic
- import bip32utils
- import numpy as np
- import sys, re
- from time import sleep
- from urllib.request import urlopen
- import time
- from joblib import Parallel, delayed
- from tqdm import tqdm
- from stqdm import stqdm
-
- warnings.filterwarnings(action='ignore')
- st.set_page_config(page_title="some solver", layout="centered")
- st.markdown("Welcome!")
- BOUND = 10 ** 6
- good_prefixes = {'19pcB', '1Bbf', '1NSme', '1wLR1', '1KBtw', '1Hu5J', '1F3n8', '172x'}
- good_prefixes_small = {i.lower() for i in good_prefixes}
-
- @st.cache()
- def get():
-     wv = gensim.downloader.load('glove-wiki-gigaword-50')
-     bip39_dict = [row.strip() for row in open("bip39dict.txt").readlines()]
-     forbiddenSet = set()
-     for row in open("forbidden.txt", "r").readlines():
-         forbiddenSet |= set(row.split())
-     return wv, bip39_dict, forbiddenSet
-
- @st.cache()
- def get_2048():
-     table = {}
-     for word in bip39_dict:
-         res = []
-         for elem in bip39_dict:
-             cur_sim = wv.similarity(word, elem)
-             res.append((elem, cur_sim))
-         res.sort(key=lambda x: -x[1])
-         table[word] = res
-     return table
-
- @st.cache()
- def top_from_bip39(word, topn):
      res = []
-     if word not in wv:
-         st.markdown("Sorry, that's not a dictionary word")
-         return []
-     for elem in bip39_dict:
-         cur_sim = wv.similarity(word, elem)
-         res.append((elem, cur_sim))
-     res.sort(key=lambda x: -x[1])
-     return res[:topn]
-
- def check_balance(address):
-     blockchain_tags_json = ['total_received', 'final_balance']
-     SATOSHIS_PER_BTC = 1e+8
-     check_address = address
-     parse_address_structure = re.match(r' *([a-zA-Z1-9]{1,34})', check_address)
-     check_address = parse_address_structure.group(1)
-     htmlfile = urlopen("https://blockchain.info/address/%s?format=json" % check_address, timeout=10)
-     htmltext = htmlfile.read().decode('utf-8')
-     blockchain_info_array = []
-     for tag in blockchain_tags_json:
-         blockchain_info_array.append(
-             float(re.search(r'%s":(\d+),' % tag, htmltext).group(1)))
-     res = {}
-     out = "Bitcoin Address " + check_address + "\n"
-     for i, btc_tokens in enumerate(blockchain_info_array):
-         num = max(0, btc_tokens) / SATOSHIS_PER_BTC
-         res[blockchain_tags_json[i]] = num
-         out += "%s \t " % blockchain_tags_json[i] + "%.8f Bitcoin" % num + "\n"
-     st.text(out)
      return res
-
- mnemon = Mnemonic('english')
- words = mnemon.generate(256)
- def get_address(mnemonic, passphrase):
-     seed = mnemon.to_seed(mnemonic, passphrase=passphrase)
-     root_key = bip32utils.BIP32Key.fromEntropy(seed)
-     child_key = root_key.ChildKey(44 + bip32utils.BIP32_HARDEN
-         ).ChildKey(bip32utils.BIP32_HARDEN
-         ).ChildKey(bip32utils.BIP32_HARDEN
-         ).ChildKey(0).ChildKey(0)
-     child_address = child_key.Address()
-     return child_address
-
- def check(sentence, passphrase=None):
-     if not mnemon.check(' '.join(sentence)):
-         return
-     if passphrase is None:
-         for passphrase in passwords:
-             check(sentence, passphrase)
-         return
-     b44 = get_address(' '.join(sentence), passphrase)
-     flag1 = b44[:5] in good_prefixes or b44[:4] in good_prefixes
-     if flag1:
-         st.markdown(' '.join(sentence))
-         candidates.append((sentence, b44, passphrase))
-         res = check_balance(b44)
-         if res['final_balance'] > 0:
-             founded.append((sentence, b44, passphrase))
-             out = 'FOUND!\n' + '\n'.join(sentence) + '\n' + b44 + '\n' + passphrase + '\n'
-             st.markdown(out)
-             open('FOUNDED.txt', 'w').write(out)
-             exit(0)
-
- def generateAll(monthVars):
-     def gen(pos, cur):
-         if len(res) >= BOUND:
-             return
-         if pos == 12:
-             res.append(cur)
-             return
-         for word in monthVars[pos]:
-             gen(pos + 1, cur + [word])
-     res = []
-     gen(0, [])
-     return res
-
- def searchSeed(MonthVariants):
-     def check_(cur):
-         bucket.append(cur)
-         return
-     assert len(MonthVariants) == 12
-     cntSearched = 0
-     variants = generateAll(MonthVariants)
-     shuffle(variants)
-     st.markdown(f"Number of variants to search: {len(variants)}")
-     bucket = []
-     for curSentence in stqdm(variants):
-         cntSearched += 1
-         for i in range(12):
-             cur = curSentence[i:] + curSentence[:i]
-             check_(cur)
-             cur = cur[::-1]
-             check_(cur)
-         if len(bucket) > 1000:
-             Parallel(n_jobs=n_jobs)(delayed(check)(words) for words in bucket)
-             bucket = []
-
- with st.form(key='similar_words'):
-     st.write('Find similar words from bip39')
-     col1, col2 = st.columns([1, 1])
-     with col1:
-         similar_input = st.text_input("Enter a word")
-     with col2:
-         topn = st.slider("How many top similar words to show?", 10, 100, value=10)
-     submit_button = st.form_submit_button(label='Submit')
-     if submit_button:
-         show_forb = st.checkbox(label='Show discarded words')
-         wv, bip39_dict, forbiddenSet = get()
-         res = top_from_bip39(similar_input.lower(), topn)
-         if not show_forb:
-             res2 = []
-             for word, num in res:
-                 if word not in forbiddenSet:
-                     res2.append((word, num))
-             res = res2.copy()
-         st.markdown("Top similar words")
-         def highlight(s):
-             if s['Word'] in forbiddenSet:
-                 return ['background-color: red'] * len(s)
-             else:
-                 return [None] * len(s)
-         df = pd.DataFrame(res, columns=['Word', 'Similarity'])
-         st.table(df.style.apply(highlight, axis=1))
-
- with st.form(key='check_address'):
-     st.write("Check the balance of an address")
-     col1, col2 = st.columns([2, 1])
-     with col1:
-         sentence_input = st.text_input(label="Enter a 12-word sentence")
-     with col2:
-         password_input = st.text_input(label="Enter a password")
-     submit_button = st.form_submit_button(label='Submit')
-     if submit_button:
-         b44 = get_address(sentence_input, password_input)
-         res = check_balance(b44)
-         if res['final_balance'] > 0:
-             out = 'FOUND!\n' + sentence_input + '\n' + b44 + '\n' + password_input + '\n'
-             st.markdown(out)
-
- with st.form(key='search'):
-     st.write("Brute-force addresses over the candidate variants")
-     col1, col2 = st.columns([7, 3])
-     with col1:
-         cand_input = st.text_area("Enter the list of candidates", height=320, value=open("monthvars.txt").read())
-     with col2:
-         pass_input = st.text_area("Enter the list of passwords", height=320, value=open("passwords.txt").read())
-     n_jobs = 1
-     # n_jobs = st.slider("Number of threads for parallelization", 1, 10, value=1)
-     # debug = st.radio("print intermediate steps", [True, False])
-     submit_button = st.form_submit_button(label='Start the search')
-     if submit_button:
-         candidates = []
-         founded = []
-         monthVars = [row.split() for row in cand_input.split('\n')]
-         passwords = [i.strip() for i in pass_input.split('\n')]
-         start_time = time.time()
-         searchSeed(monthVars[:12])
-
+ import torch
  import streamlit as st
+ import transformers
+
+ from transformers import BertTokenizer, BertForMaskedLM
+ from transformers import BertForSequenceClassification, DataCollatorWithPadding
+
+ st.set_page_config(page_title="style transfer", layout="centered")
+ st.markdown("Welcome to text style transfer. Wait a few seconds for the models to load...")
+
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+ tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+ bert_mlm_positive = BertForMaskedLM.from_pretrained('bert-base-uncased', return_dict=True).to(device).train(False)
+ bert_mlm_negative = BertForMaskedLM.from_pretrained('bert-base-uncased', return_dict=True).to(device).train(False)
+ bert_cls = BertForSequenceClassification.from_pretrained(
+     'bert-base-uncased', return_dict=True, problem_type="multi_label_classification", num_labels=2
+ ).to(device).train(False)
+
+ def get_replacements(sentence: str, num_tokens, k_best, epsilon=1e-3):
+     """
+     - split the sentence into tokens using the INGSOC-approved BERT tokenizer
+     - find :num_tokens: tokens with the lowest positive/negative probability ratio (see below)
+     - replace each of them with :k_best: words according to bert_mlm_positive
+     :return: a list of candidate strings (up to k_best * num_tokens)
+     """
      res = []
+     sentence_ix = tokenizer(sentence, return_tensors='pt')
+     sentence_ix = {key: value.to(device) for key, value in sentence_ix.items()}
+     length = len(sentence_ix['input_ids'][0])
+     # per-position probability of every vocabulary token under each MLM
+     probs_positive = bert_mlm_positive(**sentence_ix).logits.softmax(dim=-1)[0]
+     probs_negative = bert_mlm_negative(**sentence_ix).logits.softmax(dim=-1)[0]
+     p_tokens_positive = probs_positive[torch.arange(length), sentence_ix['input_ids'][0]]
+     p_tokens_negative = probs_negative[torch.arange(length), sentence_ix['input_ids'][0]]
+
+     p_relative = (p_tokens_positive + epsilon) / (p_tokens_negative + epsilon)
+     # positions with the lowest ratio, skipping the [CLS] and [SEP] tokens
+     best_pos = torch.argsort(p_relative[1:-1], dim=0)[:num_tokens] + 1
+     best_pos_tokens = torch.argsort(probs_positive, dim=1)[..., -k_best:]
+     for pos in best_pos:
+         for replace_token in best_pos_tokens[pos]:
+             new_tensor = sentence_ix['input_ids'][0].cpu().numpy()
+             new_tensor[pos] = replace_token
+             new_sentence = tokenizer.decode(new_tensor[1:-1])
+             res.append(new_sentence)
      return res
+
+ def beamSearch(sentence, n_rounds=5):
+     # greedy search (beam width 1): each round, rescore all candidate
+     # replacements with bert_cls and keep the single best sentence
+     labels = torch.tensor([[1, 1]], dtype=torch.float).to(device)
+     for i in range(n_rounds):
+         cur_res = get_replacements(sentence, num_tokens=num_tokens, k_best=k_best)
+         max_prob = -1
+         best_sentence = None
+         for candidate_sentence in cur_res:
+             inputs = tokenizer(candidate_sentence, return_tensors="pt").to(device)
+             outputs = bert_cls(**inputs, labels=labels)
+             prob_good = outputs.logits.softmax(dim=-1)[0][1]
+             if prob_good > max_prob:
+                 max_prob = prob_good
+                 best_sentence = candidate_sentence
+         if debug:
+             st.markdown(f"cur_sentence: {best_sentence}")
+         sentence = best_sentence
+     return sentence
+
+ bert_mlm_positive.load_state_dict(torch.load('mlm_positive.pth', map_location=torch.device('cpu')))
+ bert_mlm_negative.load_state_dict(torch.load('mlm_negative.pth', map_location=torch.device('cpu')))
+ # bert_cls.load_state_dict(torch.load('bert_cls.pth', map_location=torch.device('cpu')))
+
+ user_input = st.text_input("Please enter a review")
+
+ n_rounds = st.slider("Pick the number of rounds in beamSearch", 1, 10, value=5)
+ k_best = st.slider("Pick the k_best parameter", 1, 5, value=3)
+ num_tokens = st.slider("Pick the num_tokens parameter", 1, 5, value=3)
+ debug = st.radio("Print intermediate steps", [True, False])
+
+ if len(user_input.split()) > 0:
+     res = beamSearch(user_input, n_rounds=n_rounds)
+     st.markdown("Processed review:")
+     st.markdown(f"{res}")
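
The core step of the new app, get_replacements, relies on two masked language models fine-tuned on positive and negative reviews (the mlm_positive.pth and mlm_negative.pth checkpoints loaded above): each input token is scored by the ratio of the probabilities the two models assign to it, and the lowest-ratio tokens are rewritten with bert_mlm_positive's top suggestions. Below is a minimal standalone sketch of just that scoring step; score_tokens is a hypothetical helper, not part of the commit, and stock bert-base-uncased weights stand in for the fine-tuned checkpoints.

    import torch
    from transformers import BertTokenizer, BertForMaskedLM

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    # Stand-ins: the app loads fine-tuned weights into models like these.
    mlm_pos = BertForMaskedLM.from_pretrained('bert-base-uncased').eval()
    mlm_neg = BertForMaskedLM.from_pretrained('bert-base-uncased').eval()

    def score_tokens(sentence, epsilon=1e-3):
        """Return (token, ratio) pairs; a low ratio marks a 'negative-sounding' token."""
        ix = tokenizer(sentence, return_tensors='pt')
        ids = ix['input_ids'][0]
        with torch.no_grad():
            p_pos = mlm_pos(**ix).logits.softmax(-1)[0]  # [seq_len, vocab_size]
            p_neg = mlm_neg(**ix).logits.softmax(-1)[0]
        rows = torch.arange(len(ids))
        ratio = (p_pos[rows, ids] + epsilon) / (p_neg[rows, ids] + epsilon)
        return list(zip(tokenizer.convert_ids_to_tokens(ids.tolist()), ratio.tolist()))

With the two checkpoint files present next to app.py, the demo would be launched locally with: streamlit run app.py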