PetrovDE committed on
Commit
701d7dd
1 Parent(s): d4d8e3e

add data files

Browse files
Dockerfile ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
2
+ # you will also find guides on how best to write your Dockerfile
3
+
4
+ FROM python:3.9
5
+
6
+ WORKDIR /code
7
+
8
+ COPY ./requirements.txt /code/requirements.txt
9
+
10
+ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
11
+
12
+ COPY . .
13
+
14
+ CMD ["gunicorn", "-b", "0.0.0.0:7860", "main:app"]
bi_encoder.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import pickle
3
+ import torch
4
+ from transformers import AutoTokenizer, AutoModel
5
+ from sklearn.metrics.pairwise import cosine_similarity
6
+
7
# Bi-encoder backbone: the stock DistilBERT tokenizer and encoder, loaded once
# at import time and used by get_question() for every incoming query.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="distilbert-base-uncased"
)
bert_model = AutoModel.from_pretrained(
    pretrained_model_name_or_path="distilbert-base-uncased"
)
9
+
10
+
11
def mean_pool(token_embeds: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    """Average token embeddings along dimension 1, skipping masked-out tokens.

    Args:
        token_embeds: Token-level embeddings; dimension 1 is the one that is
            reduced (the token axis).
        attention_mask: Mask with one entry per token. It is broadcast over the
            last dimension of ``token_embeds``; entries equal to 0 remove the
            corresponding token from the average.

    Returns:
        The masked mean of ``token_embeds`` over dimension 1. A row whose mask
        is entirely zero yields a zero vector rather than NaN.
    """
    in_mask = attention_mask.unsqueeze(-1).expand(token_embeds.size()).float()
    summed = torch.sum(token_embeds * in_mask, 1)
    # Clamp the token count so an all-zero mask does not divide by zero.
    token_counts = torch.clamp(in_mask.sum(1), min=1e-9)
    return summed / token_counts
15
+
16
+
17
def encode(input_texts: list[str], tokenizer: AutoTokenizer, model: AutoModel, device: str = "cpu"
           ) -> torch.Tensor:
    """Embed a batch of texts as mean-pooled encoder outputs (inference only).

    Args:
        input_texts: Texts to embed.
        tokenizer: Tokenizer called on ``input_texts``; inputs longer than
            512 tokens are truncated.
        model: Encoder whose output exposes ``last_hidden_state``. It is put
            into eval mode here and is expected to already live on ``device``
            (this function moves only the inputs).
        device: Device the tokenized inputs are moved to.

    Returns:
        One mean-pooled embedding row per input text, detached from autograd.
    """
    model.eval()
    # Pad only to the longest text in the batch: mean_pool() ignores padded
    # positions through the attention mask, so padding every input to 512
    # tokens only adds encoder work.
    tokenized_texts = tokenizer(input_texts, max_length=512,
                                padding=True, truncation=True, return_tensors="pt")
    input_ids = tokenized_texts["input_ids"].to(device)
    attention_mask = tokenized_texts["attention_mask"].to(device)
    # No gradients are needed for inference; skipping the autograd graph saves
    # memory and time.
    with torch.no_grad():
        token_embeds = model(input_ids, attention_mask).last_hidden_state
    return mean_pool(token_embeds, attention_mask)
27
+
28
+
29
# Load the pre-computed retrieval data shipped with the app.
# NOTE: pickle can execute arbitrary code while loading, so these files must
# come from a trusted source.
with open('data/sentences.pkl', 'rb') as sentences_file, \
        open('data/corpus.pkl', 'rb') as corpus_file:
    sentences = pickle.load(sentences_file)
    corpus = pickle.load(corpus_file)

# One row per stored sentence plus its candidate answer text. get_corpus()
# reads the 'embeds' and 'corpus' columns, so `sentences` is expected to
# provide 'embeds'.
df = pd.DataFrame.from_dict(sentences).assign(corpus=corpus)
37
+
38
+
39
def get_question(context: str, question: str):
    """Embed a context/question pair with the module-level bi-encoder.

    Args:
        context: Preceding dialogue context.
        question: The question to answer.

    Returns:
        A NumPy array holding the mean-pooled embedding of the single
        ``"{context} [Cont_token] {question}"`` text (a batch of one).
    """
    # NOTE(review): "[Cont_token]" is registered as a special token only on the
    # cross-encoder tokenizer; the tokenizer used here is the stock one, so the
    # marker is tokenized as ordinary text. Confirm this matches how the stored
    # 'embeds' were produced.
    cont_quest = f"{context} [Cont_token] {question}"
    # encode() is declared to take a list of texts, so pass a one-element batch.
    pooled_embeds = encode([cont_quest], tokenizer, bert_model, "cpu")
    return pooled_embeds.detach().cpu().numpy()
44
+
45
+
46
def cosine_sim(question, embed):
    """Return the cosine similarity of the first row of `question` and the first row of `embed`."""
    similarity_matrix = cosine_similarity(question, embed)
    first_row = similarity_matrix[0]
    return first_row[0]
48
+
49
+
50
def get_corpus(context: str, question: str, top_k: int = 10):
    """Retrieve the candidate answers most similar to a context/question pair.

    Args:
        context: Preceding dialogue context.
        question: The question to answer.
        top_k: Maximum number of candidates to return (defaults to 10).

    Returns:
        A list of up to ``top_k`` entries from the ``corpus`` column, ordered
        from highest to lowest cosine similarity between the query embedding
        and each row's ``embeds`` value.
    """
    question_embed = get_question(context, question)
    # Score into a local frame instead of writing a column onto the shared
    # module-level `df`, so calls never mutate shared module state.
    similarities = df['embeds'].apply(lambda embed: cosine_sim(question_embed, embed))
    ranked = pd.DataFrame({'corpus': df['corpus'], 'cosine_similarity': similarities})
    ranked = ranked.sort_values(by=['cosine_similarity'], ascending=False)
    return ranked.head(top_k)['corpus'].tolist()
corssencode_inference.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Tuple, Any
2
+
3
+ import numpy as np
4
+ from transformers import AutoTokenizer
5
+ from bi_encoder import get_corpus, get_question
6
+
7
+ import torch
8
+
9
+ from model import CrossEncoderBert
10
+
11
+
12
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Build the architecture, resize the embedding matrix to the tokenizer length
# (the tokenizer carries the extra "[Cont_token]" entry) -- presumably so the
# checkpoint's embedding shape matches -- and then load the fine-tuned weights.
model = CrossEncoderBert()
model.model.resize_token_embeddings(len(model.tokenizer))
model.load_state_dict(torch.load('model/torch_model', map_location=torch.device(device)))
# Swap in the tokenizer saved next to the checkpoint.
model.tokenizer = AutoTokenizer.from_pretrained('model/tokenizer')
model.to(device)
# This module only serves predictions: make inference mode explicit so
# train-time layers such as dropout stay disabled.
model.eval()
19
+
20
+
21
def get_range_answers(
        context: str,
        question: str,
        num_answers: int = 5) -> list[str]:
    """Return the best candidate answers for a context/question pair.

    Candidates are first retrieved with ``get_corpus`` and then re-scored by
    the cross-encoder, which sees each candidate paired with
    ``"{context} [Cont_token] {question}"``.

    Args:
        context: Preceding dialogue context.
        question: The question to answer.
        num_answers: Maximum number of answers to return.

    Returns:
        Up to ``num_answers`` candidate texts as strings, best score first.
        An empty list when retrieval yields no candidates.
    """
    corpus = get_corpus(context, question)
    if not corpus:
        # Nothing to re-rank; avoid handing the tokenizer an empty batch.
        return []

    context_question = f'{context} [Cont_token] {question}'
    tokenized_texts = model.tokenizer(
        [context_question] * len(corpus),
        corpus,
        max_length=512,
        padding=True,
        truncation=True,
        return_tensors='pt'
    ).to(device)

    with torch.no_grad():
        ce_scores = model(tokenized_texts['input_ids'],
                          tokenized_texts['attention_mask']).squeeze(-1)
        ce_scores = torch.sigmoid(ce_scores)

    scores = ce_scores.cpu().numpy()
    # argsort is ascending; reverse it to get best-first and keep the top slice.
    best_ix = np.argsort(scores)[::-1][:num_answers]
    return [str(corpus[idx]) for idx in best_ix]
51
+
52
+
53
def get_best_answer(
        context: str,
        question: str
) -> str:
    """Return the single best answer for a context/question pair.

    Args:
        context: Preceding dialogue context.
        question: The question to answer.

    Returns:
        The top-ranked answer text, or an empty string when no answer is
        available.
    """
    answers = get_range_answers(context, question, 1)
    # get_range_answers returns a flat list of strings, so the best answer is
    # answers[0]; indexing one level deeper would select a single character.
    return answers[0] if answers else ""
data/BBT_sheldon_all.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/corpus.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c6dc57f8da9666e2889503c73a6ab21d85f38a9fcd1650a289468ca2a06841c8
3
+ size 1070772
data/dataset.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef2ff6bc63bafa936eb743eb3bbfdd9ebd3192e8ba9e1bbe212cf53093a478a7
3
+ size 3360049
data/sentences.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9fb9226181cc8dacaac4ea03baf363be24a5df81d1f8ce70fa85b7b71016c4ef
3
+ size 37335519
main.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from flask import Flask, render_template, request, jsonify
2
+ from corssencode_inference import get_range_answers, get_best_answer
3
+
4
app = Flask(__name__)


@app.route('/', methods=['GET', 'POST'])
def index():
    """Serve the chat form and answer submitted questions.

    Non-POST requests render ``index.html``. A POST reads the ``context`` and
    ``question`` form fields and, depending on which submit button was
    pressed, returns JSON with either the single best answer
    (``get_answer == 'One answer'``) or a list of answers
    (``get_answer_corpus == 'Five answer'``). A POST that names neither button
    gets a JSON error with HTTP status 400.
    """
    # Flask adds HEAD automatically to routes that accept GET, so treat every
    # non-POST request as a page view instead of falling through to None.
    if request.method != 'POST':
        return render_template('index.html')

    context = request.form.get('context', '')
    question = request.form.get('question', '')

    if request.form.get('get_answer') == 'One answer':
        answer = get_best_answer(context, question)
    elif request.form.get('get_answer_corpus') == 'Five answer':
        answer = get_range_answers(context, question)
    else:
        # A view must not return None; report the malformed submission instead.
        return jsonify(
            {
                "response_code": "400",
                "error": "Unknown action: submit with 'One answer' or 'Five answer'."
            }
        ), 400

    return jsonify(
        {
            "response_code": "200",
            "request": f"{context} [Cont_token] {question}",
            "response": answer
        }
    )
32
+
33
+
34
if __name__ == '__main__':
    # Direct execution starts Flask's built-in server on localhost:5000.
    # The Dockerfile in this commit launches the app through gunicorn on
    # port 7860 instead, which does not go through this branch.
    app.run(host='localhost', port=5000)
model.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoTokenizer, AutoModel
2
+ import torch
3
+
4
+
5
class CrossEncoderBert(torch.nn.Module):
    """Cross-encoder: a pretrained transformer encoder plus a one-output linear head.

    Attributes (names are part of the saved state-dict layout and must not change):
        model: The pretrained encoder.
        tokenizer: The matching tokenizer, extended with the "[Cont_token]"
            special token.
        linear: Linear layer mapping the encoder's hidden size to one score.
    """

    def __init__(self, model_name: str = 'distilbert-base-uncased'):
        """Load the encoder and tokenizer from ``model_name`` and build the head.

        Args:
            model_name: Checkpoint name or path passed to ``from_pretrained``
                for both the encoder and the tokenizer.
        """
        super().__init__()
        self.model = AutoModel.from_pretrained(model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Adding a token grows the tokenizer only; callers must resize the
        # encoder's embedding matrix themselves before using the new token id.
        self.tokenizer.add_tokens(["[Cont_token]"], special_tokens=True)
        self.linear = torch.nn.Linear(self.model.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        """Score each input sequence.

        Args:
            input_ids: Token ids, forwarded to the encoder.
            attention_mask: Attention mask, forwarded to the encoder.

        Returns:
            The linear head applied to the hidden state at position 0 of each
            sequence: one raw (pre-sigmoid) score per sequence.
        """
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        first_token_state = outputs.last_hidden_state[:, 0]
        return self.linear(first_token_state)
requirements.txt ADDED
Binary file (5.45 kB). View file
 
templates/index.html ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>Chat with Sheldon</title>
7
+ </head>
8
+ <body>
9
+ <h2 style="font-style:italic; text-align:center">Chat bot Sheldon</h2>
10
+
11
+ <p style="text-align:center">Home work 1</p>
12
+
13
+ <p style="text-align:center">by Petrov DE</p>
14
+
15
+ <hr />
16
+ <div>
17
+ <p style="text-align:center">Answer: {{ answer }}</p>
18
+ </div>
19
+ <form method="post" action="/" style="text-align:center">
20
+ <label>
21
+ Context:
22
+ </label>
23
+ <label>
24
+ <input type="text" name="context"/>
25
+ </label>
26
+ <label>
27
+ Question:
28
+ </label>
29
+ <label>
30
+ <input type="text" name="question"/>
31
+ </label>
32
+ <input type="submit" value="One answer" name="get_answer"/>
33
+ <input type="submit" value="Five answer" name="get_answer_corpus" />
34
+ </form>
35
+ </body>
36
+ </html>