bhoov committed on
Commit
34b8a50
1 Parent(s): f0d89ef

Gut more server things

Browse files
server/config.py DELETED
@@ -1,20 +0,0 @@
1
- """
2
- This file stores the main configuration variables to run a server.
3
- """
4
- from pathlib import Path
5
- import utils.path_fixes as pf
6
- import os
7
-
8
- ROOT = Path(os.path.abspath(__file__)).parent
9
- CORPORA = ROOT / "corpora"
10
-
11
- # Change this to indicate what data is loaded for searching
12
- RESOURCE_DIR = CORPORA / "gpt2" / "woz"
13
- MODEL_VERSION = "gpt2"
14
- # RESOURCE_DIR = CORPORA / "woz_bert-base-cased"
15
- # MODEL_VERSION = "bert-base-cased"
16
-
17
- # Below are DEFAULTS. Change only if you changed the way embeddings and contexts are stored and created
18
- CORPUS = RESOURCE_DIR / "data.hdf5"
19
- EMBEDDING_FAISS = RESOURCE_DIR / "embedding_faiss"
20
- CONTEXT_FAISS = RESOURCE_DIR / "context_faiss"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
server/conftest.py DELETED
File without changes
server/data_processing/README.md DELETED
@@ -1,35 +0,0 @@
1
- # Creating an annotated corpus
2
- This module contains the code necessary for extracting and labeling a corpus with semantic data.
3
-
4
- ## Known limitations
5
- Please note the following:
6
-
7
- - There are many cases in which BPE tokenization and spaCy's built-in tokenization do not align. To remedy this, contractions that would break the BPE tokenization (defined by spaCy's hard-coded exceptions in `spacy.lang.en.TOKENIZER_EXCEPTIONS` and `spacy.lang.tokenizer_exceptions.BASE_EXCEPTIONS`) are instead decomposed into the full words the contractions represent.
8
- - Large corpus files require a LOT of hard drive space to store all the attentions and representations at every layer for every head. When tackling a corpus the size of the Wizard of Oz (207kb), make sure you have at least 9GB of free space. For the validation set of WikiText-2 (1.1MB), you will need 47GB.
9
-
10
- ## Getting Started
11
- The raw Wizard of Oz text used to create the annotated corpus can be found [here](http://www.gutenberg.org/ebooks/55). A small version of Wikipedia (WikiText-2) can be found [here](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/).
12
-
13
- ### Environment
14
- Because this module depends on code written in other parts of this repo, we will need to make those files available to the PYTHONPATH. There are several ways to do this, but the easiest way is to do the following:
15
-
16
- 1. `conda activate exbert` (Assuming you have taken the time to sort out the conda dependencies)
17
- 2. `cd server`
18
- 3. `pip install -e .`
19
-
20
- This essentially makes this repository a local pip package, allowing you access to all packages inside of `server/` whenever the conda environment is active. For instance, if writing your own scripts or running a jupyter notebook, the top level `utils/token_processing` module will be available as `import utils.token_processing as tp`.
21
-
22
- ### Overview
23
- To create your own dataset from scratch, you will need a large text file whose contents are in English. This repo currently does not support other languages.
24
-
25
- 1. Run `python create_corpus.py -f <FNAME>.txt -o <OUTDIR>`. This will create, in `<OUTDIR>`, the following files:
26
- - `embeddings/` - A folder containing the `<FNAME>.hdf5` file and all the `<layer_**>.faiss` files needed to index into the embeddings. NOTE: These files can be quite large
27
- - `headContext/` - A folder containing the `<FNAME>.hdf5` file and all the `<layer_**>.faiss` files needed to index into the head embeddings/context. NOTE: These files can be quite large
28
-
29
- If you want to overwrite existing files in the output directory, add the `--force` flag onto the `create_corpus.py` command above.
30
-
31
- ### Running the individual scripts
32
- 2. Run `python create_hdf5.py -f <FNAME>.txt -o <OUTDIR>`
33
- 3. Run `python create_faiss.py -d <OUTDIR>`. This assumes that the `embeddings` and `headContext` folders have already been created inside of `<OUTDIR>`
34
-
35
- You will then need to link these corpora into your application. In the `config.py` file, change the `RESOURCE_DIR` to point at `<OUTDIR>`.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
server/data_processing/__init__.py DELETED
@@ -1,12 +0,0 @@
1
- from .corpus_data_wrapper import CorpusDataWrapper
2
- from .convenience_corpus import ConvenienceCorpus, from_model
3
- from .index_wrapper import Indexes, ContextIndexes
4
- from .sentence_data_wrapper import TokenH5Data, SentenceH5Data
5
-
6
- __all__ = [
7
- 'CorpusDataWrapper',
8
- 'Indexes',
9
- 'ContextIndexes',
10
- 'TokenH5Data',
11
- 'SentenceH5Data'
12
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
server/data_processing/convenience_corpus.py DELETED
@@ -1,71 +0,0 @@
1
- from pathlib import Path
2
- from .corpus_data_wrapper import CorpusDataWrapper
3
- from .index_wrapper import Indexes, ContextIndexes
4
- from config import CORPORA
5
- from utils.f import memoize, delegates, GetAttr
6
- from typing import List
7
-
8
- def get_dir_names(path: Path) -> List[str]:
9
- available = [g.name for g in filter(lambda g: g.is_dir(), path.glob("*"))]
10
- return available
11
-
12
-
13
- @memoize
14
- def from_model(model_name, corpus_name):
15
- """Get the convenience corpus wrapper for a model and a corpus"""
16
- model_dir = Path(CORPORA) / model_name
17
- available = get_dir_names(model_dir)
18
- if not model_dir.exists() or len(available) == 0:
19
- raise FileNotFoundError("There are no corpora present for this model")
20
-
21
- base_dir = model_dir / corpus_name
22
-
23
- if not base_dir.exists():
24
- raise FileNotFoundError(f"Desired corpus '{corpus_name}' not available")
25
-
26
- return ConvenienceCorpus(base_dir)
27
-
28
- def files_available(base_dir, glob_pattern="*.faiss"):
29
- """Determine whether the base_dir contains indexed files"""
30
- if not base_dir.exists() or len(list(base_dir.glob(glob_pattern))) == 0:
31
- return False
32
-
33
- return True
34
- class ConvenienceCorpus(GetAttr):
35
- def __init__(self, base_dir):
36
- bd = Path(base_dir)
37
- self.base_dir = bd
38
- self.model_dir = bd.parent
39
- self.available_corpora = get_dir_names(self.model_dir)
40
-
41
- self.model_name = self.model_dir.name
42
- self.corpus_name = bd.name
43
- self.name = f"{self.model_name}_{self.corpus_name}"
44
-
45
- self.corpus_f = bd / 'data.hdf5'
46
- self.embedding_dir = bd / 'embedding_faiss'
47
- self.context_dir = bd / 'context_faiss'
48
-
49
- # Define whether these different files exist or not
50
- if not self.corpus_f.exists():
51
- raise FileNotFoundError("Main HDF5 file does not exist")
52
-
53
- self.embeddings_available = files_available(self.embedding_dir)
54
- self.contexts_available = files_available(self.context_dir)
55
-
56
- self.corpus = CorpusDataWrapper(self.corpus_f, self.name)
57
- self.embedding_faiss = Indexes(self.embedding_dir)
58
- self.context_faiss = ContextIndexes(self.context_dir)
59
-
60
- self.default = self.corpus # Almost acts like an inherited class, but is rather a composed class
61
-
62
- def search_embeddings(self, layer, query, k):
63
- D, I = self.embedding_faiss.search(layer, query, k)
64
- return self.find2d(I)[0]
65
-
66
- def search_contexts(self, layer, heads, query, k):
67
- D, I = self.context_faiss.search(layer, heads, query, k)
68
- return self.find2d(I)[0]
69
-
70
- def __repr__(self):
71
- return f"ConvenienceCorpus({self.name})"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
server/data_processing/corpus_data_wrapper.py DELETED
@@ -1,147 +0,0 @@
1
- import h5py
2
- import numpy as np
3
- from functools import partial
4
- from utils.gen_utils import map_nlist, vround
5
- import regex as re
6
- from spacyface.simple_spacy_token import SimpleSpacyToken
7
- from data_processing.sentence_data_wrapper import SentenceH5Data, TokenH5Data
8
- from utils.f import ifnone
9
-
10
- ZERO_BUFFER = 12 # Number of decimal places each index takes
11
- main_key = r"{:0" + str(ZERO_BUFFER) + r"}"
12
-
13
- def to_idx(idx:int):
14
- return main_key.format(idx)
15
-
16
- def zip_len_check(*iters):
17
- """Zip iterables with a check that they are all the same length"""
18
- if len(iters) < 2:
19
- raise ValueError(f"Expected at least 2 iterables to combine. Got {len(iters)} iterables")
20
- n = len(iters[0])
21
- for i in iters:
22
- n_ = len(i)
23
- if n_ != n:
24
- raise ValueError(f"Expected all iterations to have len {n} but found {n_}")
25
-
26
- return zip(*iters)
27
-
28
- class CorpusDataWrapper:
29
- """A wrapper for both the token embeddings and the head context.
30
-
31
- This class allows access into an HDF5 file designed according to the data/processing module's contents as if it were
32
- and in memory dictionary.
33
- """
34
-
35
- def __init__(self, fname, name=None):
36
- """Open an hdf5 file of the format designed and provide easy access to its contents"""
37
-
38
- # For iterating through the dataset
39
- self.__curr = 0
40
-
41
- self.__name = ifnone(name, "CorpusData")
42
- self.fname = fname
43
- self.data = h5py.File(fname, 'r')
44
-
45
- main_keys = self.data.keys()
46
- self.__len = len(main_keys)
47
-
48
- assert self.__len > 0, "Cannot process an empty file"
49
-
50
- embeds = self[0].embeddings
51
- self.embedding_dim = embeds.shape[-1]
52
- self.n_layers = embeds.shape[0] - 1 # 1 was added for the input layer
53
- self.refmap, self.total_vectors = self._init_vector_map()
54
-
55
- def __del__(self):
56
- try: self.data.close()
57
-
58
- # If run as a script, won't be able to close because of an import error
59
- except ImportError: pass
60
-
61
- except AttributeError:
62
- print(f"Never successfully loaded {self.fname}")
63
-
64
- def __iter__(self):
65
- return self
66
-
67
- def __len__(self):
68
- return self.__len
69
-
70
- def __next__(self):
71
- if self.__curr >= self.__len:
72
- self.__curr = 0
73
- raise StopIteration
74
-
75
- out = self[self.__curr]
76
- self.__curr += 1
77
- return out
78
-
79
- def __getitem__(self, idx):
80
- """Index into the embeddings"""
81
- if isinstance(idx, slice):
82
-
83
- start = idx.start or 0
84
- step = idx.step or 1
85
- stop = idx.stop or (self.__len - 1)
86
- stop = min(stop, self.__len)
87
-
88
- i = start
89
- out = []
90
- while i < stop:
91
- out.append(self[i])
92
- i += step
93
-
94
- return out
95
-
96
- elif isinstance(idx, int):
97
- if idx < 0: i = self.__len + idx
98
- else: i = idx
99
-
100
- key = to_idx(i)
101
- return SentenceH5Data(self.data[key])
102
-
103
- else:
104
- raise NotImplementedError
105
-
106
- def __repr__(self):
107
- return f"{self.__name}: containing {self.__len} items"
108
-
109
- def _init_vector_map(self):
110
- """Create main hashmap for all vectors to get their metadata.
111
-
112
- TODO Initialization is a little slow... Should this be stored in a separate hdf5 file?
113
-
114
- This doesn't change. Check for special hdf5 file and see if it exists already. If it does, open it.
115
- If not, create it
116
- """
117
- refmap = {}
118
- print("Initializing reference map for embedding vector...")
119
- n_vec = 0
120
- for z, sentence in enumerate(self):
121
- for i in range(len(sentence)):
122
- refs = TokenH5Data(sentence, i)
123
- refmap[n_vec] = refs
124
- n_vec += 1
125
-
126
- return refmap, n_vec
127
-
128
- def extract(self, layer):
129
- """Extract embeddings from a particular layer from the dataset
130
-
131
- For all examples
132
- """
133
- embeddings = []
134
- for i, embeds in enumerate(self):
135
- embeddings.append(embeds[layer])
136
-
137
- out = np.vstack(embeddings)
138
- return out
139
-
140
- def find(self, vec_num):
141
- """Find a vector's metadata (by id) in the hdf5 file. Needed to find sentence info and other attr"""
142
- return self.refmap[vec_num]
143
-
144
- def find2d(self, idxs):
145
- """Find a vector's metadata in the hdf5 file. Needed to find sentence info and other attr"""
146
- out = [[self.refmap[i] for i in idx] for idx in idxs]
147
- return out
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
server/data_processing/create_corpus.py DELETED
@@ -1,28 +0,0 @@
1
- import argparse
2
- from pathlib import Path
3
-
4
- def parse_args():
5
- parser = argparse.ArgumentParser()
6
- parser.add_argument('-f', '--file', help="Path to .txt file to analyze and annotate")
7
- parser.add_argument("-o", "--outdir", help="Path of output directory inside of which to place <model>/<corpus>/ directory containing hdf5 and faiss files")
8
- parser.add_argument("-n", "--name", default=None, help="Name the corpus with a code name. If not given, default to the name of the provided .txt file")
9
- parser.add_argument("--force", action="store_true", help="If given, overwrite existing hdf5 and faiss files.")
10
- parser.add_argument("-m", "--model", help="Specify the huggingface model to use for attentions")
11
- parser.add_argument("--nomask", action='store_false', help="INCLUDE attentions from special tokens like [CLS] and [SEP]. By default, ignore these attentions")
12
-
13
- return parser.parse_args()
14
-
15
- if __name__ == "__main__":
16
- from utils.f import ifnone
17
- import create_hdf5
18
- import create_faiss
19
-
20
- args = parse_args()
21
-
22
- f = Path(args.file)
23
- corpus_name = ifnone(args.name, f.stem)
24
- output_dir = Path(args.outdir) / args.model / corpus_name
25
- output_dir.mkdir(parents=True, exist_ok=True)
26
-
27
- create_hdf5.main(args.file, output_dir, args.force, args.model, args.nomask)
28
- create_faiss.main(output_dir)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
server/data_processing/create_faiss.py DELETED
@@ -1,78 +0,0 @@
1
- from pathlib import Path
2
- import faiss
3
- import numpy as np
4
- from data_processing.corpus_data_wrapper import CorpusDataWrapper
5
- from data_processing.index_wrapper import LAYER_TEMPLATE
6
- import argparse
7
-
8
- # Get model from base_dir
9
- # Use that information to get the model's configuration
10
- # From this, get the special tokens associated with that model
11
- # Have flag to allow model's special tokens to be ignored
12
- # Test what items match 'bert-base-cased'
13
-
14
- def parse_args():
15
- parser = argparse.ArgumentParser()
16
- parser.add_argument("-d", "--directory", help="Path to the directory that contains the 'embeddings' and 'headContext' folders")
17
-
18
- args = parser.parse_args()
19
- return args
20
-
21
- def train_indexes(ce:CorpusDataWrapper, stepsize=100, drop_null=True):
22
- """
23
-
24
- Parameters:
25
- ===========
26
- - corpus_embedding: Wrapper around HDF5 file for easy access to data
27
- - stepsize: How many sentences to train with at once
28
- - drop_null: Don't index the embeddings of special tokens (e.g., [CLS] and [SEP]) whose spacy POS are null
29
- """
30
- NUM_LAYERS = ce.n_layers # want to account for the input layer, which for attentions + contexts is all value 0
31
-
32
- embedding_indexes = [faiss.IndexFlatIP(ce.embedding_dim) for i in range(NUM_LAYERS)]
33
- context_indexes = [faiss.IndexFlatIP(ce.embedding_dim) for i in range(NUM_LAYERS)]
34
-
35
- for ix in range(0, len(ce), stepsize):
36
- cdata = ce[ix:ix+stepsize]
37
-
38
- if drop_null:
39
- embeddings = np.concatenate([c.zero_special_embeddings for c in cdata], axis=1)
40
- contexts = np.concatenate([c.zero_special_contexts for c in cdata], axis=1)
41
- else:
42
- embeddings = np.concatenate([c.embeddings for c in cdata], axis=1)
43
- contexts = np.concatenate([c.contexts for c in cdata], axis=1)
44
-
45
- for i in range(NUM_LAYERS):
46
- embedding_indexes[i].add(embeddings[i])
47
- context_indexes[i].add(contexts[i])
48
-
49
- return embedding_indexes, context_indexes
50
-
51
- def save_indexes(idxs, outdir, base_name=LAYER_TEMPLATE):
52
- """Save the faiss index into a file for each index in idxs"""
53
-
54
- base_dir = Path(outdir)
55
- if not base_dir.exists(): base_dir.mkdir(exist_ok=True, parents=True)
56
-
57
- out_name = str(base_dir / base_name)
58
- for i, idx in enumerate(idxs):
59
- name = out_name.format(i)
60
- print(f"Saving to {name}")
61
- faiss.write_index(idx, name)
62
-
63
- def main(basedir):
64
- base = Path(basedir)
65
- h5_fname = base / 'data.hdf5'
66
- corpus = CorpusDataWrapper(h5_fname)
67
- embedding_faiss, context_faiss = train_indexes(corpus)
68
-
69
- context_faiss_dir = base / "context_faiss"
70
- embedding_faiss_dir = base / "embedding_faiss"
71
- save_indexes(embedding_faiss, embedding_faiss_dir)
72
- save_indexes(context_faiss, context_faiss_dir)
73
-
74
- if __name__ == "__main__":
75
- # Creating the indices for both the context and embeddings
76
- args = parse_args()
77
-
78
- main(args.directory)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
server/data_processing/create_hdf5.py DELETED
@@ -1,71 +0,0 @@
1
- import numpy as np
2
- import torch
3
- import h5py
4
- import pickle
5
- import argparse
6
- from pathlib import Path
7
-
8
- from data_processing.sentence_extracting import extract_chars, extract_lines
9
- from data_processing.corpus_data_wrapper import CorpusDataWrapper, to_idx
10
- from transformer_details import from_pretrained
11
-
12
- MIN_SENTENCE_CHARLEN = 24
13
-
14
- def parse_args():
15
- parser = argparse.ArgumentParser()
16
- parser.add_argument("-f", "--file", help="Path to .pckl file of unique sentences from a corpus.")
17
- parser.add_argument("-o", "--outdir", help="Path of directory in which to store the analyzed sentences as a .hdf5")
18
- parser.add_argument("-m", "--model", default="bert-base-cased", help="Which pretrained transformer model to use. See 'transformer_details.py' for supported models")
19
- parser.add_argument("--nomask", action='store_false', help="By default, ignore attentions to special tokens like '[CLS]' and '[SEP]'. If given, include these attentions")
20
- parser.add_argument("--force", action="store_true", help="If given, overwrite existing hdf5 files.")
21
-
22
- args = parser.parse_args()
23
- return args
24
-
25
- def main(infile, outdir, force, model_name, mask_attentions):
26
- outdir = Path(outdir)
27
- outdir.mkdir(parents=True, exist_ok=True)
28
- data_outfile = outdir / "data.hdf5"
29
- f = h5py.File(data_outfile, 'a')
30
- if force: f.clear()
31
-
32
- extractor = from_pretrained(model_name)
33
-
34
- # if "gpt" in model_name:
35
- # mask_attentions = False
36
-
37
- print_every = 50
38
- long_strings = extract_chars(infile, 10000)
39
- cutoff_sent = ""
40
- i = 0
41
- for strip in long_strings:
42
- sentences = [sent.text for sent in extractor.aligner.spacy_nlp(strip).sents]
43
- fixed_sentences = [cutoff_sent + sentences[0]] + sentences[1:-1]
44
-
45
- # This leads to the possibility that there will be an input that is two sentences long. This is ok.
46
- cutoff_sent = sentences[-1]
47
- for s in fixed_sentences:
48
- if len(s) < MIN_SENTENCE_CHARLEN: continue
49
- if ((i + 1) % print_every) == 0: print(f"Starting sentence {i+1}: \n", s)
50
-
51
- try:
52
- out = extractor.att_from_sentence(s, mask_attentions=mask_attentions)
53
-
54
- except Exception as e:
55
- print(f"Error {e} occured at sentence {i}:\n{s}\n\n Skipping, not creating hdf5 grp")
56
- continue
57
-
58
- content = out.to_hdf5_content()
59
- meta = out.to_hdf5_meta()
60
- grp = f.create_group(to_idx(i))
61
- for k,v in content.items(): grp.create_dataset(k, data=v)
62
- for k, v in meta.items(): grp.attrs[k] = v
63
-
64
- i += 1 # Increment to mark the next sentence
65
-
66
- print("FINISHED CORPUS PROCESSING SUCCESSFULLY")
67
-
68
- if __name__ == "__main__":
69
- args = parse_args()
70
-
71
- main(args.file, args.outdir, args.force, args.model, args.nomask)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
server/data_processing/index_wrapper.py DELETED
@@ -1,88 +0,0 @@
1
- from functools import partial
2
- import faiss
3
- import numpy as np
4
- from pathlib import Path
5
- from typing import Iterable
6
- from utils.f import memoize
7
- from transformers import AutoConfig
8
-
9
- @memoize
10
- def get_config(model_name):
11
- return AutoConfig.from_pretrained(model_name)
12
-
13
- FAISS_LAYER_PATTERN = 'layer_*.faiss'
14
- LAYER_TEMPLATE = 'layer_{:02d}.faiss'
15
-
16
- def create_mask(head_size:int , n_heads:int, selected_heads:Iterable[int]):
17
- """Create a masked vector of size (head_size * n_heads), where 0 indicates we don't care about the contribution of that head 1 indicates that we do care
18
-
19
- Parameters:
20
- -----------
21
- head_size: Hidden dimension of the heads
22
- n_heads: Number of heads the model has
23
- selected_heads: Which heads we don't want to zero out
24
- """
25
-
26
- mask = np.zeros(n_heads)
27
- for h in selected_heads:
28
- mask[int(h)] = 1
29
-
30
- return np.repeat(mask, head_size)
31
-
32
- class Indexes:
33
- """Wrapper around the faiss indices to make searching for a vector simpler and faster.
34
-
35
- Assumes there are files in the folder matching the pattern input
36
- """
37
- def __init__(self, folder, pattern=FAISS_LAYER_PATTERN):
38
- self.base_dir = Path(folder)
39
- self.n_layers = len(list(self.base_dir.glob(pattern))) - 1 # Subtract final output
40
- self.indexes = [None] * (self.n_layers + 1) # Initialize empty list, adding 1 for input
41
- self.pattern = pattern
42
- self.__init_indexes()
43
-
44
- # Extract model name from folder hierarchy
45
- self.model_name = self.base_dir.parent.parent.stem
46
- self.config = get_config(self.model_name)
47
- self.nheads = self.config.num_attention_heads
48
- self.hidden_size = self.config.hidden_size
49
- assert (self.hidden_size % self.nheads) == 0, "Number of heads does not divide cleanly into the hidden size. Aborting"
50
- self.head_size = int(self.config.hidden_size / self.nheads)
51
-
52
-
53
- def __getitem__(self, v):
54
- """Slices not allowed, but index only"""
55
- return self.indexes[v]
56
-
57
- def __init_indexes(self):
58
- for fname in self.base_dir.glob(self.pattern):
59
- print(fname)
60
- idx = fname.stem.split('_')[-1]
61
- self.indexes[int(idx)] = faiss.read_index(str(fname))
62
-
63
- def search(self, layer, query, k):
64
- """Search a given layer for the query vector. Return k results"""
65
- return self[layer].search(query, k)
66
-
67
-
68
- class ContextIndexes(Indexes):
69
- """Special index enabling masking of particular heads before searching"""
70
-
71
- def __init__(self, folder, pattern=FAISS_LAYER_PATTERN):
72
- super().__init__(folder, pattern)
73
-
74
- self.head_mask = partial(create_mask, self.head_size, self.nheads)
75
-
76
- # Int -> [Int] -> np.Array -> Int -> (np.Array(), )
77
- def search(self, layer:int, heads:list, query:np.ndarray, k:int):
78
- """Search the embeddings for the context layer, masking by selected heads"""
79
- assert max(heads) < self.nheads, "max of selected heads must be lest than nheads. Are you indexing by 1 instead of 0?"
80
- assert min(heads) >= 0, "What is a negative head?"
81
-
82
- unique_heads = list(set(heads))
83
- mask_vector = self.head_mask(unique_heads)
84
- mask_vector = mask_vector.reshape(query.shape)
85
-
86
- new_query = (query * mask_vector).astype(np.float32)
87
-
88
- return self[layer].search(new_query, k)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
server/data_processing/sentence_data_wrapper.py DELETED
@@ -1,331 +0,0 @@
1
- import h5py
2
- import numpy as np
3
- from functools import partial
4
- from utils.gen_utils import map_nlist, vround
5
- import regex as re
6
- from spacyface.simple_spacy_token import SimpleSpacyToken
7
-
8
- ZERO_BUFFER = 12 # Number of decimal places each index takes
9
- main_key = r"{:0" + str(ZERO_BUFFER) + r"}"
10
- suppl_attn_key = r"{:0" + str(ZERO_BUFFER) + r"}_attn"
11
-
12
- def zip_len_check(*iters):
13
- """Zip iterables with a check that they are all the same length"""
14
- if len(iters) < 2:
15
- raise ValueError(f"Expected at least 2 iterables to combine. Got {len(iters)} iterables")
16
- n = len(iters[0])
17
- for i in iters:
18
- n_ = len(i)
19
- if n_ != n:
20
- raise ValueError(f"Expected all iterations to have len {n} but found {n_}")
21
-
22
- return zip(*iters)
23
-
24
- class SentenceH5Data:
25
- def __init__(self, grp):
26
- self.grp = grp
27
-
28
- @property
29
- def n_layers(self):
30
- return self.embeddings.shape[0] - 1 # 1 was added at the input, not a hidden layer
31
-
32
- @property
33
- def sentence(self):
34
- return self.grp.attrs['sentence']
35
-
36
- @property
37
- def embeddings(self):
38
- return self.grp['embeddings'][:]
39
-
40
- @property
41
- def zero_special_embeddings(self):
42
- out = self.embeddings.copy()
43
- out[:, self.mask_is_special] = np.zeros(out[:, self.mask_is_special].shape)
44
- return out
45
-
46
- @property
47
- def contexts(self):
48
- return self.grp['contexts'][:]
49
-
50
- @property
51
- def zero_special_contexts(self):
52
- out = self.contexts.copy()
53
- out[:, self.mask_is_special] = np.zeros(out[:, self.mask_is_special].shape)
54
- return out
55
-
56
- @property
57
- def attentions(self):
58
- """Return all attentions, including [CLS] and [SEP]
59
-
60
- Note that if the hdf5 is created with CLS and SEP attentions, it will have CLS and SEP attentions"""
61
- return self.grp['attentions'][:] # Converts to numpy array
62
-
63
- @property
64
- def mask_is_special(self):
65
- return np.logical_or(self.deps == '', self.poss == '')
66
-
67
- @property
68
- def tokens(self):
69
- return self.grp.attrs['token']
70
-
71
- @property
72
- def poss(self):
73
- return self.grp.attrs['pos']
74
-
75
- @property
76
- def deps(self):
77
- return self.grp.attrs['dep']
78
-
79
- @property
80
- def is_ents(self):
81
- return self.grp.attrs['is_ent']
82
-
83
- @property
84
- def heads(self):
85
- """Not the attention heads, but rather the head word of the orig sentence"""
86
- return self.grp.attrs['head']
87
-
88
- @property
89
- def norms(self):
90
- return self.grp.attrs['norm']
91
-
92
- @property
93
- def tags(self):
94
- return self.grp.attrs['tag']
95
-
96
- @property
97
- def lemmas(self):
98
- return self.grp.attrs['lemma']
99
-
100
- def __len__(self):
101
- return len(self.tokens)
102
-
103
- def __repr__(self):
104
- sent_len = 40
105
- if len(self.sentence) > sent_len: s = self.sentence[:(sent_len - 3)] + '...'
106
- else: s = self.sentence
107
- return f"SentenceH5Data({s})"
108
-
109
- class TokenH5Data(SentenceH5Data):
110
- """A wrapper around the HDF5 file storage information allowing easy access to information about each
111
- processed sentence.
112
-
113
- Sometimes, and index of -1 is used to represent the entire object in memory
114
- """
115
- def __init__(self, grp, index):
116
- """Represents returned from the refmap of the CorpusEmbedding class"""
117
- if type(grp) == SentenceH5Data: super().__init__(grp.grp)
118
- elif type(grp) == h5py._hl.group.Group: super().__init__(grp)
119
- self.index = index
120
-
121
- @property
122
- def embedding(self):
123
- return self.embeddings[:, self.index, :]
124
-
125
- @property
126
- def context(self):
127
- return self.contexts[:, self.index, :]
128
-
129
- @property
130
- def attentions_out(self):
131
- """Access all attention OUT of this token"""
132
- output = self.attentions[:,:, self.index, :]
133
- return output
134
-
135
- @property
136
- def attentions_in(self):
137
- """Access all attention INTO this token"""
138
- new_attention = self.attentions.transpose((0,1,3,2))
139
- return new_attention[:,:, self.index, :]
140
-
141
- def _select_from_attention(self, layer, heads):
142
- if type(heads) is int:
143
- heads = [heads]
144
-
145
- # Select layer and heads
146
- modified_attentions = self.attentions[layer, heads].mean(0)
147
- attentions_out = modified_attentions
148
- attentions_in = modified_attentions.transpose()
149
- return attentions_out, attentions_in
150
-
151
- def _calc_offset_single(self, attention):
152
- """Get offset to location of max attention"""
153
- curr_idx = self.index
154
- max_atts = np.argmax(attention)
155
- return max_atts - curr_idx
156
-
157
- # Define metadata properties.
158
- # Right now, needs manual curation of fields from SimpleSpacyToken. Ideally, this is automated
159
-
160
- @property
161
- def token(self):
162
- return self.tokens[self.index]
163
-
164
- @property
165
- def pos(self):
166
- return self.poss[self.index]
167
-
168
- @property
169
- def dep(self):
170
- return self.deps[self.index]
171
-
172
- @property
173
- def is_ent(self):
174
- return bool(self.is_ents[self.index])
175
-
176
- @property
177
- def norm(self):
178
- return self.norms[self.index]
179
-
180
- @property
181
- def head(self):
182
- return self.heads[self.index]
183
-
184
- @property
185
- def lemma(self):
186
- return self.lemmas[self.index]
187
-
188
- @property
189
- def tag(self):
190
- return self.tags[self.index]
191
-
192
- def to_json(self, layer, heads, top_k=5, ndigits=4):
193
- """
194
- Convert token information and attention to return to frontend
195
-
196
- Require layer, heads, and top_k to convert the attention into value to return to frontend.
197
-
198
- Output:
199
- {
200
- sentence: str
201
- index: number
202
- match: str
203
- is_match: bool
204
- is_next_word: bool
205
- matched_att: {
206
- in: { att: number[]
207
- , offset_to_max: number
208
- , loc_of_max: float
209
- }
210
- out: { att: number[]
211
- , offset_to_max: number
212
- , loc_of_max: float
213
- }
214
- },
215
- matched_att_plus_1: {
216
- in: { att: number[]
217
- , offset_to_max: number
218
- }
219
- out: { att: number[]
220
- , offset_to_max: number
221
- }
222
- }
223
- tokens: List[
224
- { token: string
225
- , pos: string
226
- , dep: string
227
- , is_ent: boolean
228
- , inward: number[]
229
- , outward: number[]
230
- }
231
- ]
232
- }
233
- """
234
- keys = [
235
- "token",
236
- "pos",
237
- "dep",
238
- "is_ent",
239
- "inward",
240
- "outward",
241
- ]
242
-
243
- token_arr = []
244
- matched_attentions = {}
245
- N = len(self)
246
-
247
- # Iterate through the following
248
- tokens = self.tokens.tolist()
249
- poss = [p.lower() for p in self.poss.tolist()]
250
- deps = [d.lower() for d in self.deps.tolist()]
251
- ents = self.is_ents.tolist()
252
- attentions_out, attentions_in = self._select_from_attention(layer, heads)
253
-
254
- matched_att_plus_1 = None
255
- next_index = None
256
-
257
- for i, tok_info in enumerate(zip_len_check(
258
- tokens
259
- , poss
260
- , deps
261
- , ents
262
- , attentions_out.tolist()
263
- , attentions_in.tolist())):
264
-
265
- def get_interesting_attentions():
266
- return {
267
- "in": {
268
- "att": att_in,
269
- "offset_to_max": self._calc_offset_single(att_in).item(),
270
- # "loc_of_max": np.argmax(att_in), # Broken
271
- },
272
- "out": {
273
- "att": att_out,
274
- "offset_to_max": self._calc_offset_single(att_out).item(),
275
- # "loc_of_max": np.argmax(att_out), # Broken
276
- }
277
- }
278
-
279
-
280
- # Perform rounding of attentions
281
- rounder = partial(round, ndigits=ndigits)
282
- att_out = map_nlist(rounder, tok_info[-2])
283
- att_in = map_nlist(rounder, tok_info[-1])
284
-
285
- obj = {k: v for (k, v) in zip_len_check(keys, tok_info)}
286
-
287
- IS_LAST_TOKEN = i == (N-1)
288
-
289
- if (i == self.index) or ((i - 1) == self.index):
290
- interesting_attentions = get_interesting_attentions()
291
-
292
- if i == self.index:
293
- obj['is_match'] = True
294
- matched_attentions = interesting_attentions
295
-
296
- elif (i-1) == self.index:
297
- matched_att_plus_1 = interesting_attentions
298
- obj['is_next_word'] = True
299
- next_index = i
300
-
301
- # Edge case for final iteration through sentence
302
-
303
- else:
304
- obj['is_match'] = False
305
- obj['is_next_word'] = False
306
-
307
- if (IS_LAST_TOKEN and (matched_att_plus_1 is None)):
308
- print("Saving matched_att_plus_1 to: ", interesting_attentions)
309
- obj['is_next_word'] = True
310
- matched_att_plus_1 = get_interesting_attentions()
311
- next_index = i
312
-
313
- token_arr.append(obj)
314
-
315
- next_token = self.tokens[next_index]
316
-
317
- obj = {
318
- "sentence": self.sentence,
319
- "index": self.index,
320
- "match": self.token,
321
- "next_index": next_index,
322
- "match_plus_1": next_token,
323
- "matched_att": matched_attentions,
324
- "matched_att_plus_1": matched_att_plus_1,
325
- "tokens": token_arr,
326
- }
327
-
328
- return obj
329
-
330
- def __repr__(self):
331
- return f"{self.token}: [{self.pos}, {self.dep}, {self.is_ent}]"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
server/data_processing/sentence_extracting.py DELETED
@@ -1,181 +0,0 @@
1
- """Extractor functions to retrieve sentences by character chunks from a file
2
-
3
- This script contains the logic that allows the user to process and filter
4
- sentences of the original corpus. By default, this considers a minimum sentence
5
- length, and removes newlines and multiple consecutive spaces.
6
-
7
- Configuration for existing functionality is at the top of the file. Feel free to
8
- add new processing and/or filter functions. The "process_line" and "filter_line"
9
- functions contain the pipeline for processing the scripts as needed.
10
-
11
- """
12
- import regex as re
13
- import argparse
14
- from pathlib import Path
15
- from functools import partial
16
- from typing import Union
17
-
18
- MIN_LINE_LENGTH = 8 # words
19
-
20
def parse_args():
    """Parse the command-line arguments of the sentence extraction script.

    Both options are required: without them `main` would receive `None` and
    fail with an opaque error instead of a usage message.

    Returns:
        An `argparse.Namespace` with the attributes `file` (text file to read)
        and `outdir` (directory that receives the extracted sentences).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-f", "--file", required=True, help="Path to .txt file to analyze and annotate")
    # `main` names its output "<input stem>.txt", so the help text must not promise a pickle.
    parser.add_argument("-o", "--outdir", required=True, help="Path of directory in which to store the extracted sentences as a .txt")

    return parser.parse_args()
28
-
29
- # ============================================================
30
- # Helper functions
31
- # ============================================================
32
# String -> String
def replace_newlines(s:str) -> str:
    """Turn every run of consecutive newline characters in `s` into one space."""
    pieces = re.split(r"\n+", s)
    return " ".join(pieces)
35
-
36
# String -> String
def replace_multispace(s:str) -> str:
    """Collapse every run of whitespace characters in `s` into one space."""
    whitespace_run = re.compile(r"\s+")
    return whitespace_run.sub(r" ", s)
39
-
40
def is_short_sentence(s:str, min_len=8) -> bool:
    """Returns True if the sentence has less than `min_len` number of words

    Words are counted by splitting on runs of whitespace, so leading, trailing
    or repeated spaces are not counted as (empty) words.

    Args:
        s: Sentence to measure
        min_len: Minimum number of words a sentence needs to not be "short"
    """
    return len(s.split()) < min_len
43
-
44
def contains_char(char:str, s:str) -> bool:
    """Return True if `char` occurs anywhere in `s`."""
    return char in s
46
-
47
- # ============================================================
48
- # Compilation functions
49
- # ============================================================
50
-
51
def process_line(line:str) -> str:
    """Normalize the whitespace of a chunk of text.

    Args:
        line: Chunk of text

    Returns:
        The chunk after `replace_newlines` and then `replace_multispace` have
        been applied to it, in that order.
    """
    without_newlines = replace_newlines(line)
    return replace_multispace(without_newlines)
62
-
63
def filter_line(line:str) -> bool:
    """Decide whether a processed line should be kept.

    The only check applied here is the MIN_LINE_LENGTH word count. Redefine
    this function with other helper checks as needed; return True to keep the
    line and False to drop it.
    """
    if is_short_sentence(line, MIN_LINE_LENGTH):
        return False
    return True
71
-
72
- # ============================================================
73
- # Main Logic
74
- # ============================================================
75
-
76
def read_outcomes(chars:str) -> Union[str, None]:
    """Turn a raw chunk of characters into a processed line, or reject it.

    Args:
        chars: Chunk of text to process

    Returns:
        The chunk after `process_line` if it passes `filter_line`, otherwise None

    Raises:
        StopIteration: If `chars` is the empty string ""; `read_on` treats
            this as the signal to stop reading.
    """
    if chars == '':
        raise StopIteration

    processed = process_line(chars)
    return processed if filter_line(processed) else None
95
-
96
def get_chars(n:int, f) -> Union[str, None]:
    """Read up to `n` characters from the opened file `f` and process them.

    Args:
        n: Number of characters to request from the opened file
        f: Opened file, e.g. the return value of `open(fname)`

    Returns:
        Whatever `read_outcomes` returns for the characters that were read:
        the processed text, or None when it is filtered out.

    Raises:
        Nothing of its own; a StopIteration raised by `read_outcomes`
        propagates to the caller.
    """
    return read_outcomes(f.read(n))
112
-
113
def get_line(f):
    """Read the next line of the open file `f` and hand it to `read_outcomes`.

    Three things can happen:

    1. `read_outcomes` raises StopIteration because `readline` returned ""
    2. The processed line is returned because it passes the filter
    3. None is returned because the line does not pass the filter
    """
    return read_outcomes(f.readline())
122
-
123
def read_on(reader, f):
    """Repeatedly apply `reader` to the open file `f` and yield what it produces.

    Args:
        reader: A unary function called as `reader(f)`. It returns a string to
            emit, returns None to skip, or raises StopIteration to finish.
        f: An opened file, as returned by `open(fname)`

    Yields:
        Every non-None value returned by `reader`, until it raises StopIteration.
    """
    while True:
        try:
            chunk = reader(f)
        except StopIteration:
            # The reader signalled that there is nothing left to read.
            return

        if chunk is None:
            continue
        yield chunk
141
-
142
-
143
def extract_chars(infile, n=10000):
    """Lazily yield processed chunks of up to `n` characters from a file.

    Args:
        infile: Path of the text file to read
        n: Number of characters requested per chunk

    Returns:
        A generator over the chunks produced by `read_on` with a `get_chars`
        reader. The file is closed when the generator is exhausted or discarded.
    """
    reader = partial(get_chars, n)
    # Open eagerly so that a missing file is still reported at call time.
    src = open(infile, 'r')

    def chunks():
        # A `close()` placed after `return` would never run; the `with` block
        # guarantees the handle is released once iteration ends.
        with src:
            yield from read_on(reader, src)

    return chunks()
149
-
150
-
151
def extract_lines(infile):
    """Given a file, lazily yield the processed lines from that file.

    Args:
        infile: Path of the text file to read

    Returns:
        A generator over the lines produced by `read_on` with the `get_line`
        reader. The file is closed when the generator is exhausted or discarded.
    """
    # Open eagerly so that a missing file is still reported at call time.
    src = open(infile, 'r')

    def lines():
        # A `close()` placed after `return` would never run; the `with` block
        # guarantees the handle is released once iteration ends.
        with src:
            yield from read_on(get_line, src)

    return lines()
156
-
157
-
158
def extract_sentences_to_file(infile, outfname:str):
    """Extract sentences from a file into a new file indicated by `outfname`.

    Args:
        infile: Path of the text file to read
        outfname: Path of the file to create; one extracted line is written per row

    Returns:
        `outfname`, so the caller knows where the sentences were written.

    Raises:
        FileExistsError: If `outfname` already exists (it is opened with mode 'x').
    """
    # The context manager closes the output even if reading or writing fails.
    with open(outfname, 'x') as out:
        for line in extract_lines(infile):
            out.write(line + "\n")

    return outfname
168
-
169
def main(infile, outdir):
    """Create `outdir` if needed and save the processed sentences of `infile` inside it.

    Args:
        infile: Path of the text file to process
        outdir: Directory in which the output file "<stem of infile>.txt" is created

    Returns:
        The path of the written output file.
    """
    outfname = Path(infile).stem + '.txt'
    outdir = Path(outdir)
    outdir.mkdir(parents=True, exist_ok=True)
    outfile = outdir / outfname
    extract_sentences_to_file(infile, outfile)

    # Return the path computed here rather than the helper's return value, so
    # callers always receive the output location.
    return outfile
178
-
179
if __name__ == "__main__":
    # Script entry point: forward the parsed command-line options to `main`.
    cli_args = parse_args()
    main(infile=cli_args.file, outdir=cli_args.outdir)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
server/main.py CHANGED
@@ -5,7 +5,6 @@ from flask_cors import CORS
5
  from flask import render_template, redirect, send_from_directory
6
 
7
  import utils.path_fixes as pf
8
- import config
9
  from utils.f import ifnone
10
 
11
  from data_processing import from_model
 
5
  from flask import render_template, redirect, send_from_directory
6
 
7
  import utils.path_fixes as pf
 
8
  from utils.f import ifnone
9
 
10
  from data_processing import from_model
server/utils/path_fixes.py CHANGED
@@ -5,6 +5,7 @@ FAISS_LAYER_PATTERN = 'layer_*.faiss'
5
  LAYER_TEMPLATE = 'layer_{:02d}.faiss'
6
 
7
  ROOT_DIR = Path(os.path.abspath(__file__)).parent.parent.parent
 
8
  DATA_DIR = ROOT_DIR / 'server' / 'data'
9
  DATASET_DIR = Path.home() / 'Datasets'
10
  ROOT_DIR = Path(os.path.abspath(__file__)).parent.parent.parent
 
5
  LAYER_TEMPLATE = 'layer_{:02d}.faiss'
6
 
7
  ROOT_DIR = Path(os.path.abspath(__file__)).parent.parent.parent
8
# NOTE(review): the visible part of this module defines ROOT_DIR (the repository
# root), not ROOT. The deleted server/config.py kept corpora next to itself, i.e.
# under "server/" - confirm the intended location.
CORPORA = ROOT_DIR / "server" / "corpora"
9
  DATA_DIR = ROOT_DIR / 'server' / 'data'
10
  DATASET_DIR = Path.home() / 'Datasets'
11
  ROOT_DIR = Path(os.path.abspath(__file__)).parent.parent.parent
server/utils/token_processing.py CHANGED
@@ -5,7 +5,6 @@ If adding more metadata, modify the definitions in `to_spacy_meta` and `meta_to_
5
  import h5py
6
  import numpy as np
7
  import spacy
8
- import config
9
  from transformers.tokenization_bert import BertTokenizer
10
  from .f import flatten_, assoc, memoize, GetAttr
11
 
 
5
  import h5py
6
  import numpy as np
7
  import spacy
 
8
  from transformers.tokenization_bert import BertTokenizer
9
  from .f import flatten_, assoc, memoize, GetAttr
10