UNIST-Eunchan committed on
Commit
e0b483f
1 Parent(s): 56b6b47

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +6 -1
app.py CHANGED
@@ -4,6 +4,11 @@ import nltk
4
  from nltk import sent_tokenize
5
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
6
  import json
 
 
 
 
 
7
  nltk.download('punkt')
8
  with open('testbook.json') as f:
9
  test_book = json.load(f)
@@ -63,7 +68,7 @@ def chunking(book_text):
63
  next_token_len += token_lens[i+t]
64
  next_pseudo_segment += sentences[i+t]
65
 
66
- embs = model.encode([current_segment, next_pseudo_segment, sentences[i]]) # current, next, sent
67
  if cos_similarity(embs[1],embs[2]) > cos_similarity(embs[0],embs[2]):
68
  segments.append(current_segment)
69
  current_segment = sentences[i]
 
4
  from nltk import sent_tokenize
5
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
6
  import json
7
+
8
+ from sentence_transformers import SentenceTransformer
9
+
10
+ sentence_transformer_model = SentenceTransformer("sentence-transformers/all-roberta-large-v1")
11
+
12
  nltk.download('punkt')
13
  with open('testbook.json') as f:
14
  test_book = json.load(f)
 
68
  next_token_len += token_lens[i+t]
69
  next_pseudo_segment += sentences[i+t]
70
 
71
+ embs = sentence_transformer_model.encode([current_segment, next_pseudo_segment, sentences[i]]) # current, next, sent
72
  if cos_similarity(embs[1],embs[2]) > cos_similarity(embs[0],embs[2]):
73
  segments.append(current_segment)
74
  current_segment = sentences[i]