salexashenko committed on
Commit
3a555af
1 Parent(s): 7cd979d
Files changed (2)
  1. app.py +290 -0
  2. requirements.txt +4 -0
app.py ADDED
@@ -0,0 +1,290 @@
import nltk

from spacy.cli import download
download("en_core_web_sm")
nltk.download('stopwords')

from nltk.corpus import stopwords

# NLTK's English stopwords, extended with high-frequency conversational words
# so that keyword/entity extraction skips dialogue filler.
en_stopwords = set(list(stopwords.words('english')) +
['summary', 'synopsis', 'overview', 'list', 'good', 'will', 'why', 'talk', 'long', 'above', 'looks', 'face', 'men', 'years', 'can', 'both', 'have', 'keep', 'yeah', 'said', 'bring', 'done', 'was', 'when', 'ask', 'now', 'very', 'kind', 'they', 'told', 'tell', 'ever', 'kill', 'hold', 'that', 'below', 'bit', 'knew', 'haven', 'few', 'place', 'could', 'says', 'huh', 'job', 'also', 'ain', 'may', 'heart', 'boy', 'with', 'over', 'son', 'else', 'found', 'see', 'any', 'phone', 'hasn', 'saw', 'these', 'maybe', 'into', 'thing', 'mom', 'god', 'old', 'aren', 'mustn', 'out', 'about', 'guy', 'each', 'most', 'like', 'then', 'wasn', 'being', 'all', 'door', 'look', 'run', 'sorry', 'again', 'won', 'man', 'gone', 'them', 'ago', 'doesn', 'gonna', 'girl', 'feel', 'work', 'much', 'hope', 'never', 'woman', 'went', 'lot', 'what', 'start', 'only', 'play', 'too', 'dad', 'going', 'yours', 'wrong', 'fine', 'made', 'one', 'want', 'isn', 'our', 'true', 'room', 'wanna', 'are', 'idea', 'sure', 'find', 'same', 'doing', 'off', 'put', 'turn', 'come', 'house', 'think', 'meet', 'hers', 'gotta', 'nor', 'away', 'leave', 'car', 'used', 'happy', 'the', 'care', 'seen', 'she', 'not', 'were', 'ours', 'their', 'first', 'world', 'lost', 'make', 'big', 'left', 'miss', 'shan', 'did', 'thank', 'ready', 'those', 'give', 'next', 'came', 'who', 'mind', 'does', 'right', 'her', 'let', 'didn', 'open', 'has', 'show', 'wife', 'yet', 'got', 'know', 'whole', 'some', 'such', 'alone', 'baby', 'him', 'nice', 'bad', 'move', 'new', 'dead', 'three', 'weren', 'whom', 'well', 'get', 'which', 'end', 'you', 'than', 'while', 'last', 'once', 'sir', 'from', 'need', 'wait', 'days', 'how', 'don', 'heard', 'own', 'hear', 'where', 'hey', 'okay', 'just', 'until', 'your', 'there', 'this', 'more', 'been', 'his', 'under', 'mean', 'might', 'here', 'its', 'but', 'stay', 'yes', 'guess', 'even', 'guys', 'hard', 'hadn', 'live', 'stop', 'took', 'still', 'other', 'since', 'every', 'needn', 'way', 'name', 'two', 'back', 'and', 'hello', 'head', 'use', 'must', 'for', 'life', 'die', 'day', 'down', 'wants', 'after', 'say', 'try', 'had', 'night']
)

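# Illustrative sketch (hypothetical helper, not used elsewhere in this file):
# how the extended stopword set might be applied to pull keywords out of text.
def _example_keywords(text):
    return [w for w in text.lower().split() if w.strip(".,!?") not in en_stopwords]

# _example_keywords("Tell me how the BM25 index is created")
# -> ['bm25', 'index', 'created']
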
import multiprocessing
import os
from whoosh.analysis import StemmingAnalyzer
from whoosh.index import create_in
from whoosh.fields import Schema, ID, TEXT
import whoosh.index as whoosh_index
import tqdm

def get_content_ext(content, bm25_field):
    # Hook for field-specific preprocessing; currently a pass-through.
    return content

def yield_line_by_line(file):
    with open(file) as infile:
        for line in infile:
            yield line

def recreate_bm25_idx(content_data_store, bm25_field="search", idx_dir=".", auto_create_bm25_idx=False, idxs=None, use_tqdm=True):
    if type(content_data_store) is str:
        content_data_store = yield_line_by_line(content_data_store)
    schema = Schema(id=ID(stored=True), content=TEXT(analyzer=StemmingAnalyzer()))
    # TODO: determine how to clear out the whoosh index besides rm -rf _M* MAIN*
    os.makedirs(f"{idx_dir}/bm25_{bm25_field}", exist_ok=True)
    need_reindex = auto_create_bm25_idx or not os.path.exists(f"{idx_dir}/bm25_{bm25_field}/_MAIN_1.toc")  # CHECK IF THIS IS RIGHT
    if not need_reindex:
        whoosh_ix = whoosh_index.open_dir(f"{idx_dir}/bm25_{bm25_field}")
    else:
        whoosh_ix = create_in(f"{idx_dir}/bm25_{bm25_field}", schema)
    writer = whoosh_ix.writer(multisegment=True, limitmb=1024, procs=multiprocessing.cpu_count())
    # writer = self.whoosh_ix.writer(multisegment=True, procs=multiprocessing.cpu_count())
    if hasattr(content_data_store, 'tell'):
        pos = content_data_store.tell()
        content_data_store.seek(0, 0)
    if idxs is not None:
        idx_text_pairs = [(idx, content_data_store[idx]) for idx in idxs]
        data_iterator = tqdm.tqdm(idx_text_pairs) if use_tqdm else idx_text_pairs
    else:
        data_iterator = tqdm.tqdm(enumerate(content_data_store)) if use_tqdm else enumerate(content_data_store)
    # TODO:
    # self.indexer.reset_bm25_idx(0)
    # data_iterator = self.indexer.process_bm25_field(content_data_store, **kwargs)
    for idx, content in data_iterator:
        content = get_content_ext(content, bm25_field)
        if not content:
            continue
        writer.add_document(id=str(idx), content=content)
    writer.commit()
    return whoosh_ix  # was `return whoosh_index`, which is the module, not the index

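# Illustrative sketch (not called anywhere): querying the index built by
# recreate_bm25_idx. The file name "docs.txt" is hypothetical; Whoosh scores
# with BM25F by default.
def _example_bm25_query():
    from whoosh.qparser import QueryParser
    ix = recreate_bm25_idx("docs.txt", bm25_field="search", auto_create_bm25_idx=True)
    with ix.searcher() as searcher:
        query = QueryParser("content", ix.schema).parse("castor beans")
        for hit in searcher.search(query, limit=5):
            print(hit["id"], hit.score)
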
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoModelForCausalLM

# All three models are loaded in fp16 onto a single CUDA device.
# Safety model: a fine-tuned T5-Base that produces a "rule of thumb" style
# safety statement for a prompt, used below to prefix risky answers.
safety_tokenizer = AutoTokenizer.from_pretrained("salexashenko/T5-Base-ROT-epoch-2-train-loss-1.3495-val-loss-1.4164", use_auth_token=True)
safety_model = AutoModelForSeq2SeqLM.from_pretrained("salexashenko/T5-Base-ROT-epoch-2-train-loss-1.3495-val-loss-1.4164", use_auth_token=True).half().cuda().eval()

# Main chat model: Galactica 1.3B fine-tuned for conversation.
blackcat_tokenizer = AutoTokenizer.from_pretrained("theblackcat102/galactica-1.3b-conversation-finetuned")
blackcat_model = AutoModelForCausalLM.from_pretrained("theblackcat102/galactica-1.3b-conversation-finetuned").half().cuda().eval()

# t5-small summarizes web search snippets into background text.
t5_tokenizer = AutoTokenizer.from_pretrained("t5-small")
t5_model = AutoModelForSeq2SeqLM.from_pretrained("t5-small", torch_dtype=torch.half).eval().cuda()

def run_model(input_string, model, tokenizer, device='cuda', **generator_args):
    with torch.no_grad():
        input_ids = tokenizer(input_string, padding=True, return_tensors="pt")
        input_ids = input_ids.to(device)
        # BatchEncoding behaves like a dict, so generation kwargs can be folded
        # into it and the whole thing unpacked into generate().
        input_ids['no_repeat_ngram_size'] = 4
        for key, val in generator_args.items():
            input_ids[key] = val
        res = model.generate(**input_ids)
        # Collapse doubled punctuation artifacts in the decoded text.
        return [ret.replace("..", ".").replace(".-", ".").replace("..", ".").replace("--", "-").replace("--", "-")
                for ret in tokenizer.batch_decode(res, skip_special_tokens=True)]

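# Illustrative sketch: run_model with the T5 pair loaded above; "summarize:"
# is the standard t5-small task prefix. Any generate() kwarg can be passed
# through generator_args, as done with max_length here.
def _example_summarize():
    return run_model("summarize: " + "Whoosh is a pure-Python search library. " * 5,
                     t5_model, t5_tokenizer, max_length=32)
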
def run_python_and_return(s):
    # WARNING: exec() of model-generated code is unsandboxed; only acceptable
    # in a trusted demo environment.
    try:
        ret = {'__ret': None}
        exec(s, ret)
        return ret['__ret']
    except Exception:
        return ''

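# Illustrative sketch: the executed snippet is expected to assign its result
# to __ret, which is exactly what the <work> rewrite in generate_with_safety
# below produces.
def _example_exec():
    return run_python_and_return("__ret = 2 + 2")  # -> 4
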
import wikipedia
import spacy
from wikipedia import DisambiguationError
from duckduckgo_search import ddg
from collections import Counter

nlp = spacy.load('en_core_web_sm')

def duck_duck_and_wikipedia_search(query, num_terms=4, max_docs=10):
    ret = []
    # Search DuckDuckGo first, then expand to Wikipedia pages for the most
    # common named entities found in the snippets.
    data = ddg(
        query,
        region="us-en",
        safesearch="moderate",)
    data2 = [(a['title'] + ". " + a['body']).replace("?", ".").strip("?!.") for a in data]
    ret.append(data2)
    doc = nlp(" ".join(data2))
    query0 = [a[0].strip("!.,;") for a in Counter([e.text for e in doc.ents if e.label_ != 'CARDINAL']).most_common(num_terms)]
    print(query0)
    for query2 in query0:
        search = wikipedia.search(query2)
        for s in search[:max(1, int(max_docs / num_terms))]:
            try:
                page = wikipedia.WikipediaPage(s)
            except Exception:
                continue
            x = ["=" + x1 if "==" in x1 else x1 for x1 in page.content.split("\n=")]
            ret.append(x)
            if len(ret) > max_docs:
                return ret

    return ret

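# Illustrative sketch (network access required; not used by the Gradio app below).
def _example_search():
    docs = duck_duck_and_wikipedia_search("castor bean toxicity", num_terms=2, max_docs=4)
    # docs[0] is the list of DuckDuckGo snippets; later elements are
    # section-split Wikipedia pages for entities found in those snippets.
    return docs
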
def generate_with_safety(para, model, tokenizer, do_safety=True, do_execute_work=False, backtrack_on_mismatched_work_answers=False, return_answer_only=True, do_search=False, max_length=512, do_self_contrastive=True, contrastive_guidance_embedding=None, max_return_sequences=4, ret=None, do_sample=True, do_beam=False, device="cuda", target_lang=None):
    global safety_model, safety_tokenizer, t5_model, t5_tokenizer
    if backtrack_on_mismatched_work_answers: do_execute_work = True  # TODO: the backtracking inference
    background = ""
    para = para.strip()

    if do_search:
        data = ddg(
            para,
            region="us-en",
            safesearch="moderate",)
        data2 = [a['body'].replace("?", ".").strip("?!., ") for a in data]
        # There is a Google paper suggesting that summaries of the search
        # results work better than the raw results; need to find that paper.
        # Also need a simple ngram filter to drop bad summaries and fall back
        # to the raw search results.
        # TODO: store the reference URL so generated text can cite it; use
        # ngram overlap (ROUGE score).
        background = ". ".join([s.replace("?", ".").lstrip(" ?,!.").rstrip(" ,") for s in run_model(data2[:5], t5_model, t5_tokenizer, max_length=512)])
    # TODO: inject background knowledge into the instruction,
    # e.g. "give me instructions on how to eat castor beans".
    background_lower = background.lower()
    is_wrong = is_dangerous = False
    # TODO: replace with a multi-task classifier using the safety pipeline.
    if "immoral" in background_lower or "illegal" in background_lower:
        if "not immoral" not in background_lower and "not illegal" not in background_lower: is_wrong = True
    if "lethal" in background_lower or "dangerous" in background_lower or " poison" in background_lower:
        if "not lethal" not in background_lower and "not dangerous" not in background_lower and "not poison" not in background_lower: is_dangerous = True
    # print(is_wrong, is_dangerous)
    safety_prefix = ""
    if do_safety:
        para2 = para.strip(".?:-")
        if is_dangerous:
            para2 += " which is dangerous"
        elif is_wrong:
            para2 += " which is wrong"
        safety_prefix = run_model(para2, safety_model, safety_tokenizer)[0].strip('"\' ')
        if "wrong" in safety_prefix or "not right" in safety_prefix:
            safety_prefix = f"As a chatbot, I cannot recommend this. {safety_prefix}"
    if background:
        # Could probably do a rankgen match instead of keyword tests on "who", "what", "where", etc.
        if para.split()[0].lower() not in {"who", "what", "when", "where", "how", "why", "does", "do", "can", "could", "would", "is", "are", "will", "might", "find", "write", "give"} and not para.endswith("?"):
            para = f"Background: {background}. <question> Complete this sentence: {para} <answer> "
        else:
            para = f"Background: {background}. <question> {para} <answer> "
    if safety_prefix:
        if "<answer>" not in para:
            para += "<answer> " + safety_prefix + " "
        else:
            para += safety_prefix + " "
    # Track the visible prompt length so it can be stripped from decoded text.
    len_para = len(para)
    if "<question>" in para:
        len_para -= len("<question>")
    if "<answer>" in para:
        len_para -= len("<answer>")
    if safety_prefix:  # was `if safety_model:`, which subtracted even when no prefix was added
        len_para -= len(safety_prefix + " ")
    if "<answer>" not in para:
        para += "<answer>"
    print(para)
    input_ids = tokenizer.encode(para, return_tensors='pt')
    input_ids = input_ids.to(device)
    if ret is None: ret = {}
    with torch.no_grad():
        if do_sample:
            # Top-p / top-k random sampling: more diverse generations, but of lower quality.
            # Note: penalty_alpha targets contrastive search and, depending on the
            # transformers version, may be ignored when do_sample=True.
            outputs = model.generate(
                input_ids=input_ids,
                max_length=max_length,
                no_repeat_ngram_size=4,
                do_sample=True,
                top_p=0.95,
                penalty_alpha=0.6 if do_self_contrastive else None,
                top_k=10,
                num_return_sequences=max(1, int(max_return_sequences / 2)) if do_beam else max_return_sequences
            )

            for i in range(len(outputs)):  # could use batch_decode, unless we want to do something special here
                query = tokenizer.decode(outputs[i], skip_special_tokens=True)
                if return_answer_only:
                    query = query[len_para:].lstrip(".? \n\t")
                ret[query] = 1  # dict used as an ordered set

        if do_beam:
            # Beam search: better-quality generations, but with less diversity.
            outputs = model.generate(
                input_ids=input_ids,
                max_length=max_length,
                num_beams=max(int(max_return_sequences / 2) if do_sample else max_return_sequences, 5),
                no_repeat_ngram_size=4,
                penalty_alpha=0.6 if do_self_contrastive else None,
                num_return_sequences=max(1, int(max_return_sequences / 2)) if do_sample else max_return_sequences,
                early_stopping=True
            )

            for i in range(len(outputs)):
                query = tokenizer.decode(outputs[i], skip_special_tokens=True)
                if return_answer_only:
                    query = query[len_para:].lstrip(".? \n\t")
                ret[query] = 1

    # Take care of the <work> tokens: execute the generated code.
    # TODO: backtrack when the code doesn't return the same answer as the
    # answer in the generated text.
    if do_execute_work:  # Galactica-specific
        for query in list(ret.keys()):
            if "<work>" in query:
                query2 = ""
                for query_split in query.split("<work>"):
                    if "```" in query_split:
                        # Rewrite the file-writing idiom into a __ret assignment
                        # so run_python_and_return can capture the result.
                        query_split = query_split.replace("""with open("output.txt", "w") as file:\n file.write""", "__ret=")
                        code = query_split.split("</work>")[0].split("```")[1].split("```")[0]
                        query_split1, query_split2 = query_split.split("""<<read: "output.txt">>\n\n""")
                        old_answer2 = old_answer = query_split2.split("\n")[0]
                        work_answer = run_python_and_return(code)
                        if work_answer is not None:
                            try:
                                old_answer2 = float(old_answer)
                                work_answer = float(work_answer)
                            except Exception:
                                pass
                            if old_answer2 != work_answer:
                                query_split2 = query_split2.replace(old_answer, str(work_answer))
                            query_split = query_split1 + "Computed Answer:" + query_split2
                    if query2:
                        query2 = query2 + "<work>" + query_split
                    else:
                        query2 = query_split
                if query2 != query:
                    del ret[query]
                    ret[query2] = 1

    return list(ret.keys())

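# Illustrative constant (hypothetical output, not produced by the app) showing
# the Galactica <work> block shape that do_execute_work rewrites above: the
# code inside the ``` fences is executed, and its result is checked against
# the answer following the <<read: "output.txt">> marker.
_EXAMPLE_WORK_OUTPUT = (
    'The total is computed below. <work>```'
    'with open("output.txt", "w") as file:\n file.write(str(2 + 2))'
    '```</work><<read: "output.txt">>\n\n4'
)
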
import gradio as gr


def query_model(do_safety, do_search, text):
    results = generate_with_safety(text, blackcat_model, blackcat_tokenizer, do_safety=do_safety, do_search=do_search)
    # The interface declares four text outputs, so pad/truncate to exactly four.
    return (results + ["", "", "", ""])[:4]


demo = gr.Interface(
    query_model,
    [
        gr.Checkbox(label="Safety"),
        gr.Checkbox(label="Search"),
        gr.Textbox(
            label="Prompt",
            lines=5,
            value="Teach me how to take over the world.",
        ),
    ],
    ["text", "text", "text", "text"],
)

if __name__ == "__main__":
    demo.launch(auth=('user', 'supersecurepassword'), auth_message="Enter your username and password", share=True)
requirements.txt ADDED
@@ -0,0 +1,4 @@
transformers
wikipedia
duckduckgo_search
whoosh
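# Also imported by app.py, so likely needed here as well:
nltk
spacy
torch
gradio
tqdm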