Nuno Machado committed on
Commit 8d8e1b1
1 Parent(s): 37deedc

Add embedding generator

.gitignore CHANGED
@@ -109,6 +109,7 @@ venv/
 ENV/
 env.bak/
 venv.bak/
+lex-semantic-search/
 
 # Spyder project settings
 .spyderproject
@@ -127,3 +128,10 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+
+# IDE
+.idea
+*.iml
+
+# Custom files
+data/
README.md CHANGED
@@ -1,2 +1,14 @@
 # lex-semantic-search
 Semantic search for Lex Fridman podcast
+
+
+## Dataset
+
+## Usage
+
+```bash
+python -m venv lex-semantic-search
+source lex-semantic-search/bin/activate
+pip install -r requirements_cpu.txt # for CPU
+pip install -r requirements_gpu.txt # for GPU
+```
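A quick smoke test after setup can confirm the environment works (a hedged sketch, not part of the commit; it assumes `torch` and `transformers` appear in the requirements files, which the new modules below import):

```python
# Hypothetical smoke test: verifies the libraries imported by
# embeddings/huggingface.py are installed and importable.
import torch
import transformers

print(torch.__version__, transformers.__version__)
```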
embeddings/__init__.py ADDED
File without changes
embeddings/encoder.py ADDED
@@ -0,0 +1,9 @@
+from abc import ABC, abstractmethod
+from typing import List
+import numpy as np
+
+
+class EmbeddingEncoder(ABC):
+    @abstractmethod
+    def generate_embeddings(self, texts: List[str]) -> List[np.ndarray]:
+        pass
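For context, `embeddings/encoder.py` defines the abstract interface the rest of the commit builds on: any concrete encoder must override `generate_embeddings`, and instantiating `EmbeddingEncoder` directly raises a `TypeError`. A minimal sketch of that contract (the `DummyEncoder` class and its zero vectors are illustrative, not part of the commit):

```python
import numpy as np
from typing import List

from embeddings.encoder import EmbeddingEncoder


class DummyEncoder(EmbeddingEncoder):
    # Hypothetical stand-in that returns a fixed 3-dimensional zero vector per text
    def generate_embeddings(self, texts: List[str]) -> List[np.ndarray]:
        return [np.zeros(3) for _ in texts]


# EmbeddingEncoder()  # would raise TypeError: abstract method not implemented
print(DummyEncoder().generate_embeddings(["hello"]))  # [array([0., 0., 0.])]
```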
embeddings/huggingface.py ADDED
@@ -0,0 +1,32 @@
+import torch
+import numpy as np
+
+from typing import List
+from transformers import AutoTokenizer, AutoModel
+from embeddings.encoder import EmbeddingEncoder
+
+
+def cls_pooling(model_output):
+    # Use the final hidden state of the [CLS] token as the sentence embedding
+    return model_output.last_hidden_state[:, 0]
+
+
+class HuggingFaceEncoder(EmbeddingEncoder):
+    def __init__(self, model_name: str):
+        self.model_name = model_name
+        self.model = AutoModel.from_pretrained(model_name)
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+    def generate_embeddings(self, sentences: List[str]) -> List[np.ndarray]:
+        # Tokenize sentences
+        encoded_input = self.tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
+
+        # Compute token embeddings
+        with torch.no_grad():
+            model_output = self.model(**encoded_input, return_dict=True)
+
+        # Perform CLS pooling
+        embeddings = cls_pooling(model_output)
+
+        # Convert to numpy arrays to match the declared return type
+        return [embedding.numpy() for embedding in embeddings]
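A short usage sketch of the new encoder (the model name below is an assumption for illustration; the commit does not pin a checkpoint, and any BERT-style model with a [CLS] token works with `cls_pooling`):

```python
from embeddings.huggingface import HuggingFaceEncoder

# Assumed checkpoint, not from the commit; swap in any Hugging Face model id
encoder = HuggingFaceEncoder("sentence-transformers/all-MiniLM-L6-v2")

embeddings = encoder.generate_embeddings([
    "Lex talks with a guest about artificial intelligence.",
    "A conversation about consciousness and free will.",
])
print(len(embeddings), embeddings[0].shape)  # 2 vectors, each of shape (384,)
```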