GabMartino committed on
Commit
e3c0725
·
1 Parent(s): 425a478

First Commit.

README.md CHANGED
@@ -1,12 +1,16 @@
  ---
- title: Serica Intelligent Search Fork
- emoji: 🌍
- colorFrom: green
- colorTo: purple
+ title: Serica Intelligent Search
+ emoji: 🔍
+ colorFrom: indigo
+ colorTo: pink
  sdk: streamlit
- sdk_version: 1.17.0
+ sdk_version: 1.10.0
  app_file: app.py
  pinned: false
+ license: agpl-3.0
  ---
 
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+ **This is a fork from https://huggingface.co/spaces/galatolo/serica-intelligent-search**
+
+ _Check **latin-bert** https://github.com/dbamman/latin-bert_
app.py ADDED
@@ -0,0 +1,162 @@
+
+
+ import os
+
+ import numpy as np
+ import streamlit as st
+ from elasticsearch import Elasticsearch
+
+ from embedders.LatinBERT import LatinBERT
+ from embedders.labse import LaBSE
+
+ models = dict(
+     LaBSE=LaBSE(),
+     LatinBERT=LatinBERT(bertPath="./embedders/latin_bert/latin_bert", tokenizerPath="./embedders/tokenizer/latin.subword.encoder")
+ )
+ verify_certs=False
+ es = Elasticsearch(os.environ["ELASTIC_HOST"], basic_auth=os.environ["ELASTIC_AUTH"].split(":"), verify_certs=verify_certs)
+
+
+ def searchCloseSentence(document, startNumber, numCloseSentence=3):
+
+     queryPrevious = {
+         "bool": {
+             "must": [{
+                 "term": {
+                     "document": document
+                 }
+             }, {
+                 "range": {
+                     "number": {
+                         "gte": startNumber - numCloseSentence,
+                         "lt": startNumber,
+                     }
+                 }
+             }
+             ]
+         }
+     }
+
+     queryNext = {
+         "bool": {
+             "must": [{
+                 "term": {
+                     "document": document
+                 }
+             }, {
+                 "range": {
+                     "number": {
+                         "lte": startNumber + numCloseSentence,
+                         "gt": startNumber,
+                     }
+                 }
+             }
+             ]
+         }
+     }
+
+     previous = es.search(
+         index="sentences",
+         query=queryPrevious
+     )
+     nexts = es.search(
+         index="sentences",
+         query=queryNext
+     )
+     previous_hits = sorted(previous["hits"]["hits"], key=lambda e: e["_source"]["number"])
+     previous_context = "".join([r["_source"]["sentence"] for r in previous_hits])
+
+     subsequent_hits = sorted(nexts["hits"]["hits"], key=lambda e: e["_source"]["number"])
+     subsequent_context = "".join([r["_source"]["sentence"] for r in subsequent_hits])
+
+     document_name_results = es.search(
+         index="documents",
+         query={
+             "bool": {
+                 "must": [{
+                     "term": {
+                         "id": document
+                     }
+                 }
+                 ]
+             }
+         }
+     )
+
+     document_name_data = document_name_results["hits"]["hits"][0]["_source"]
+     document_name = f"{document_name_data['title']} - {document_name_data['author']}"
+
+     return document_name, previous_context, subsequent_context
+
+ def prepareResults(results):
+     results = results['hits']['hits']
+
+     string_results = []
+     for sentence in results:
+         text = sentence['_source']['sentence']
+         score = sentence['_score']
+         document = sentence['_source']['document']
+         number = sentence['_source']['number']
+         document_name, previous_context, subsequent_context = searchCloseSentence(document, number, 3)
+         string_result = f"#### {document_name} (score: {score:.2f})\n{previous_context} **{text}** {subsequent_context}"
+         string_results.append(string_result)
+
+     return string_results
+ def search():
+     if query == "":
+         return
+     status_indicator.write(f"Computing query embeddings...")
+
+     query_vector = None
+     embeddingType = None
+     if model_name in ["LaBSE", "LatinBERT"]:
+         query_vector = models[model_name](query)[0, :].numpy().tolist()
+         embeddingType = "labse_embedding" if model_name == "LaBSE" else "latinBERT_embedding"
+     elif model_name in ["LaBSE-LatinBERT-Mean", "LaBSE-LatinBERT-CONCAT"]:
+         query_vector_labse = models['LaBSE'](query)[0, :].numpy().tolist()
+         query_vector_latinBERT = models['LatinBERT'](query)[0, :].numpy().tolist()
+
+         if model_name == "LaBSE-LatinBERT-Mean":
+             query_vector = np.mean([query_vector_labse, query_vector_latinBERT], axis=0).tolist()
+             embeddingType = "mean_embedding"
+         elif model_name == "LaBSE-LatinBERT-CONCAT":
+             query_vector = query_vector_latinBERT + query_vector_labse
+             embeddingType = "concat_embedding"
+
+
+     script = {
+         "source": f"cosineSimilarity(params.query_vector, '{embeddingType}') + 1.0",
+         "params": {"query_vector": query_vector}
+     }
+     results = es.search(
+         index='sentences',
+         query={
+             "script_score": {
+                 "query": {"match_all": {}},
+                 "script": script
+
+             }
+
+         },
+         size=limit
+     )
+
+     pretty_results = prepareResults(results)
+     for res in pretty_results:
+         results_placeholder.markdown(res)
+
+
+
+ st.header("Serica Intelligent Search")
+ st.write("Perform an intelligent search using a Sentence Embedding Transformer model on the SERICA database")
+ model_name = st.selectbox("Model", ["LaBSE", "LatinBERT", "LaBSE-LatinBERT-Mean", "LaBSE-LatinBERT-CONCAT"])
+ limit = st.number_input("Number of results (sentences)", 25)
+ query = st.text_input("Query", value="")
+ status_indicator = st.empty()
+ do_search = st.button("Search", on_click=search)
+ results_placeholder = st.container()
+
+ if do_search:
+     search()
+     #do_search(model_name, query, limit, results_placeholder, status_indicator)
+
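
The `script_score` queries above call `cosineSimilarity` on the fields `labse_embedding`, `latinBERT_embedding`, `mean_embedding` and `concat_embedding`, which in Elasticsearch only works against `dense_vector` fields. The commit does not include the index setup, so the following is only a sketch of the mappings the app appears to expect: the index and field names come from app.py, and the dimensions follow the embedders' `dim` property (768 each, hence 1536 for the concatenation).

```python
# Sketch only: assumed mappings for the indices queried by app.py.
import os
from elasticsearch import Elasticsearch

es = Elasticsearch(os.environ["ELASTIC_HOST"],
                   basic_auth=os.environ["ELASTIC_AUTH"].split(":"),
                   verify_certs=False)

# "sentences" holds one document per sentence, with all four embedding variants.
es.indices.create(index="sentences", mappings={"properties": {
    "document": {"type": "keyword"},                              # id of the source work
    "number": {"type": "integer"},                                # sentence position
    "sentence": {"type": "text"},
    "labse_embedding": {"type": "dense_vector", "dims": 768},
    "latinBERT_embedding": {"type": "dense_vector", "dims": 768},
    "mean_embedding": {"type": "dense_vector", "dims": 768},
    "concat_embedding": {"type": "dense_vector", "dims": 1536},
}})

# "documents" maps a document id to its title and author, as read in searchCloseSentence.
es.indices.create(index="documents", mappings={"properties": {
    "id": {"type": "keyword"},
    "title": {"type": "text"},
    "author": {"type": "text"},
}})
```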
embedders/LatinBERT.py ADDED
@@ -0,0 +1,247 @@
+ from types import SimpleNamespace
+
+ import numpy as np
+ import torch
+ from torch import nn
+ from transformers import BertTokenizerFast, BertForMaskedLM, BertTokenizer, BertModel
+ from tensor2tensor.data_generators import text_encoder
+ import torch.nn.functional as F
+
+
+ class LatinBERT(nn.Module):
+
+     def __init__(self, bertPath, tokenizerPath):
+         super().__init__()
+         self.tokenizer = LatinTokenizer(tokenizerPath) #BertTokenizer.from_pretrained("bert-base-cased")
+         self.model = BertModel.from_pretrained(bertPath)#.to("cuda")
+         self.model.eval()
+
+     @torch.no_grad()
+     def __call__(self, sentences):
+         if not isinstance(sentences, list):
+             sentences = [sentences]
+
+         tokens_ids, masks, transforms = self.tokenizer.tokenize(sentences, 512)
+         #tokens_ids = tokens_ids.to("cuda")
+         #tokens_ids = tokens_ids.squeeze()
+         if tokens_ids.shape[-1] > 512:
+             print(tokens_ids.shape)
+             tokens_ids = torch.narrow(tokens_ids, -1, 0, 512)
+
+         tokens_ids = tokens_ids.reshape((-1, tokens_ids.shape[-1]))
+         outputs = self.model.forward(tokens_ids)
+         embeddings = outputs.pooler_output
+         embeddings = F.normalize(embeddings, p=2).cpu()
+         return embeddings
+
+     @property
+     def dim(self):
+         return 768
+
+
+ class LatinTokenizer:
+     def __init__(self, model):
+         self.vocab = dict()
+         self.reverseVocab = dict()
+         self.encoder = text_encoder.SubwordTextEncoder(model)
+
+         self.vocab["[PAD]"] = 0
+         self.vocab["[UNK]"] = 1
+         self.vocab["[CLS]"] = 2
+         self.vocab["[SEP]"] = 3
+         self.vocab["[MASK]"] = 4
+
+         for key in self.encoder._subtoken_string_to_id:
+             self.vocab[key] = self.encoder._subtoken_string_to_id[key] + 5
+             self.reverseVocab[self.encoder._subtoken_string_to_id[key] + 5] = key
+
+     def convert_tokens_to_ids(self, tokens):
+         wp_tokens = list()
+         for token in tokens:
+             if token == "[PAD]":
+                 wp_tokens.append(0)
+             elif token == "[UNK]":
+                 wp_tokens.append(1)
+             elif token == "[CLS]":
+                 wp_tokens.append(2)
+             elif token == "[SEP]":
+                 wp_tokens.append(3)
+             elif token == "[MASK]":
+                 wp_tokens.append(4)
+             else:
+                 wp_tokens.append(self.vocab[token])
+
+         return wp_tokens
+
+     def tokenize(self, sentences, max_batch):
+         #print(len(sentences))
+         maxLen=0
+         for sentence in sentences:
+             length=0
+             for word in sentence:
+                 toks=self._tokenize(word)
+                 length+=len(toks)
+
+             if length> maxLen:
+                 maxLen=length
+         #print(maxLen)
+         all_data=[]
+         all_masks=[]
+         all_labels=[]
+         all_transforms=[]
+
+         for sentence in sentences:
+             tok_ids=[]
+             input_mask=[]
+             labels=[]
+             transform=[]
+
+             all_toks=[]
+             n=0
+             for idx, word in enumerate(sentence):
+                 toks=self._tokenize(word)
+                 all_toks.append(toks)
+                 n+=len(toks)
+
+             cur=0
+             for idx, word in enumerate(sentence):
+                 toks=all_toks[idx]
+                 ind=list(np.zeros(n))
+                 for j in range(cur,cur+len(toks)):
+                     ind[j]=1./len(toks)
+                 cur+=len(toks)
+                 transform.append(ind)
+
+                 tok_ids.extend(self.convert_tokens_to_ids(toks))
+
+                 input_mask.extend(np.ones(len(toks)))
+                 labels.append(1)
+
+             all_data.append(tok_ids)
+             all_masks.append(input_mask)
+             all_labels.append(labels)
+             all_transforms.append(transform)
+
+         lengths = np.array([len(l) for l in all_data])
+
+         # Note sequence must be ordered from shortest to longest so current_batch will work
+         ordering = np.argsort(lengths)
+
+         ordered_data = [None for i in range(len(all_data))]
+         ordered_masks = [None for i in range(len(all_data))]
+         ordered_labels = [None for i in range(len(all_data))]
+         ordered_transforms = [None for i in range(len(all_data))]
+
+
+         for i, ind in enumerate(ordering):
+             ordered_data[i] = all_data[ind]
+             ordered_masks[i] = all_masks[ind]
+             ordered_labels[i] = all_labels[ind]
+             ordered_transforms[i] = all_transforms[ind]
+
+         batched_data=[]
+         batched_mask=[]
+         batched_labels=[]
+         batched_transforms=[]
+
+         i=0
+         current_batch=max_batch
+
+         while i < len(ordered_data):
+
+             batch_data=ordered_data[i:i+current_batch]
+             batch_mask=ordered_masks[i:i+current_batch]
+             batch_labels=ordered_labels[i:i+current_batch]
+             batch_transforms=ordered_transforms[i:i+current_batch]
+
+             max_len = max([len(sent) for sent in batch_data])
+             max_label = max([len(label) for label in batch_labels])
+
+             for j in range(len(batch_data)):
+
+                 blen=len(batch_data[j])
+                 blab=len(batch_labels[j])
+
+                 for k in range(blen, max_len):
+                     batch_data[j].append(0)
+                     batch_mask[j].append(0)
+                     for z in range(len(batch_transforms[j])):
+                         batch_transforms[j][z].append(0)
+
+                 for k in range(blab, max_label):
+                     batch_labels[j].append(-100)
+
+                 for k in range(len(batch_transforms[j]), max_label):
+                     batch_transforms[j].append(np.zeros(max_len))
+
+             batched_data.append(batch_data)
+             batched_mask.append(batch_mask)
+             batched_labels.append(batch_labels)
+             batched_transforms.append(batch_transforms)
+
+             #bsize=torch.FloatTensor(batch_transforms).shape
+
+             i+=current_batch
+
+             # adjust batch size; sentences are ordered from shortest to longest so decrease as they get longer
+             if max_len > 100:
+                 current_batch=12
+             if max_len > 200:
+                 current_batch=6
+
+         #print(len(batch_data), len(batch_mask), len(batch_transforms))
+         return torch.LongTensor(batched_data).squeeze(), torch.FloatTensor(batched_mask).squeeze(), torch.FloatTensor(batched_transforms).squeeze()
+
+     '''
+
+     def _tokenize(self, text):
+         if not isinstance(text, list):
+             text = [text]
+
+         outputs = []
+         for sentence in text:
+             tokens = sentence.split(" ")
+             wp_tokens = []
+             for token in tokens:
+                 if token in ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]:
+                     wp_tokens.append(token)
+                 else:
+                     wp_toks = self.encoder.encode(token)
+                     for wp in wp_toks:
+                         wp_tokens.append(self.reverseVocab[wp + 5])
+
+             outputs.append(SimpleNamespace(
+                 tokens=wp_tokens,
+                 ids=torch.Tensor(self.convert_tokens_to_ids(wp_tokens))
+             ))
+         return outputs
+
+     '''
+
+     def _tokenize(self, text):
+         tokens = text.split(" ")
+         wp_tokens = []
+         for token in tokens:
+
+             if token in {"[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"}:
+                 wp_tokens.append(token)
+             else:
+
+                 wp_toks = self.encoder.encode(token)
+
+                 for wp in wp_toks:
+                     wp_tokens.append(self.reverseVocab[wp + 5])
+         #print(wp_tokens)
+         return wp_tokens
+
+ def main():
+     model = LatinBERT("../../latinBert/latin_bert/models/latin_bert", tokenizerPath="./tokenizer/latin.subword.encoder")
+
+     sents = ["arma virumque cano", "arma gravi numero violentaque bella parabam"]
+
+
+     output = model(sents)
+     print("end", output.shape)
+
+ if __name__ == "__main__":
+     main()
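
For reference, a minimal usage sketch (not part of the commit): it mirrors `main()` above but uses the model and tokenizer paths that app.py passes, together with the `similarity` helper defined in embedders/labse.py. It assumes those paths exist in the checked-out Space.

```python
# Sketch only: embed two Latin sentences with LatinBERT and compare them.
from embedders.LatinBERT import LatinBERT
from embedders.labse import similarity

bert = LatinBERT(bertPath="./embedders/latin_bert/latin_bert",
                 tokenizerPath="./embedders/tokenizer/latin.subword.encoder")
embeddings = bert(["arma virumque cano",
                   "arma gravi numero violentaque bella parabam"])  # shape (2, 768)

# The embeddings are already L2-normalised, so this prints a 2x2 cosine-similarity matrix.
print(similarity(embeddings, embeddings))
```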
embedders/__pycache__/LatinBERT.cpython-39.pyc ADDED
Binary file (5.92 kB).
 
embedders/__pycache__/labse.cpython-39.pyc ADDED
Binary file (1.65 kB).
 
embedders/labse.py ADDED
@@ -0,0 +1,39 @@
+ import torch
+ from transformers import BertModel, BertTokenizerFast
+ import torch.nn.functional as F
+
+ def similarity(embeddings_1, embeddings_2):
+     normalized_embeddings_1 = F.normalize(embeddings_1, p=2)
+     normalized_embeddings_2 = F.normalize(embeddings_2, p=2)
+     return torch.matmul(
+         normalized_embeddings_1, normalized_embeddings_2.transpose(0, 1)
+     )
+
+ class LaBSE:
+     def __init__(self):
+         self.tokenizer = BertTokenizerFast.from_pretrained("setu4993/LaBSE")
+         self.model = BertModel.from_pretrained("setu4993/LaBSE")
+         #self.model = self.model.to('cuda')
+         self.model.eval()
+
+     @torch.no_grad()
+     def __call__(self, sentences):
+         if not isinstance(sentences, list):
+             sentences = [sentences]
+         tokens = self.tokenizer(sentences, return_tensors="pt", padding=True)
+         #print(tokens.input_ids.shape, tokens.token_type_ids.shape, tokens.attention_mask.shape)
+         #tokens = tokens.to("cuda")
+         outputs = self.model(**tokens)
+         embeddings = outputs.pooler_output
+         return F.normalize(embeddings, p=2).cpu()#.numpy()
+
+     @property
+     def dim(self):
+         return 768
+
+ if __name__ == "__main__":
+     model = LaBSE()
+     sents = ["arma virumque cano", "arma gravi numero violentaque bella parabam"]
+
+     output = model(sents)
+     print("end", output.shape)
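
The commit also omits the indexing side. Purely as an illustration of how the four embedding variants queried by app.py could be produced and stored, here is a hedged sketch: the index and field names come from app.py, while the document id, sentence number and client setup are placeholders.

```python
# Sketch only: index one sentence with the four embedding fields app.py searches.
import os
import numpy as np
from elasticsearch import Elasticsearch

from embedders.LatinBERT import LatinBERT
from embedders.labse import LaBSE

labse = LaBSE()
latin_bert = LatinBERT(bertPath="./embedders/latin_bert/latin_bert",
                       tokenizerPath="./embedders/tokenizer/latin.subword.encoder")
es = Elasticsearch(os.environ["ELASTIC_HOST"],
                   basic_auth=os.environ["ELASTIC_AUTH"].split(":"),
                   verify_certs=False)

sentence = "arma virumque cano"
v_labse = labse(sentence)[0, :].numpy()        # 768-d LaBSE embedding
v_latin = latin_bert(sentence)[0, :].numpy()   # 768-d LatinBERT embedding

es.index(index="sentences", document={
    "document": "doc-id",   # hypothetical document id
    "number": 1,            # position of the sentence in the document
    "sentence": sentence,
    "labse_embedding": v_labse.tolist(),
    "latinBERT_embedding": v_latin.tolist(),
    "mean_embedding": np.mean([v_labse, v_latin], axis=0).tolist(),
    # app.py concatenates LatinBERT first, then LaBSE.
    "concat_embedding": v_latin.tolist() + v_labse.tolist(),
})
```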
embedders/latin_bert/latin_bert/config.json ADDED
@@ -0,0 +1,19 @@
+ {
+   "attention_probs_dropout_prob": 0.1,
+   "directionality": "bidi",
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "max_position_embeddings": 512,
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pooler_fc_size": 768,
+   "pooler_num_attention_heads": 12,
+   "pooler_num_fc_layers": 3,
+   "pooler_size_per_head": 128,
+   "pooler_type": "first_token_transform",
+   "type_vocab_size": 2,
+   "vocab_size": 32900
+ }
embedders/latin_bert/latin_bert/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ad931a030dffd79a6cc13eaf5108352418285d3a04b510cc70b41fdd609635b5
+ size 447786794
embedders/latin_bert/latin_bert/vocab.txt ADDED
The diff for this file is too large to render.
 
embedders/tokenizer/latin.subword.encoder ADDED
The diff for this file is too large to render.