UNIST-Eunchan committed
Commit: e0b483f
Parent(s): 56b6b47

Update app.py

app.py CHANGED
@@ -4,6 +4,11 @@ import nltk
 from nltk import sent_tokenize
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 import json
+
+from sentence_transformers import SentenceTransformer
+
+sentence_transformer_model = SentenceTransformer("sentence-transformers/all-roberta-large-v1")
+
 nltk.download('punkt')
 with open('testbook.json') as f:
     test_book = json.load(f)
@@ -63,7 +68,7 @@ def chunking(book_text):
             next_token_len += token_lens[i+t]
             next_pseudo_segment += sentences[i+t]
 
-        embs = 
+        embs = sentence_transformer_model.encode([current_segment, next_pseudo_segment, sentences[i]]) # current, next, sent
         if cos_similarity(embs[1],embs[2]) > cos_similarity(embs[0],embs[2]):
             segments.append(current_segment)
             current_segment = sentences[i]
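For context, the added `encode` call produces the three embeddings consumed by the existing boundary check: the current segment, the tentative next segment, and the sentence under consideration. Below is a minimal standalone sketch of that decision, assuming `cos_similarity` is a plain cosine similarity over the numpy vectors returned by `encode`; its actual implementation in app.py is not shown in this diff, and the example inputs are hypothetical.

import numpy as np
from sentence_transformers import SentenceTransformer

# Same model as in the commit.
sentence_transformer_model = SentenceTransformer("sentence-transformers/all-roberta-large-v1")

def cos_similarity(a, b):
    # Assumed helper: cosine similarity between two 1-D embedding vectors.
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

# Hypothetical stand-ins for the loop state inside chunking().
current_segment = "The model is trained on long documents. It uses a seq2seq backbone."
next_pseudo_segment = "Evaluation is done on the test split. ROUGE scores are reported."
sentence = "Results are measured with ROUGE."

# current, next, sent — same ordering as in the diff.
embs = sentence_transformer_model.encode([current_segment, next_pseudo_segment, sentence])

# If the sentence is more similar to the upcoming text than to the current
# segment, close the current segment and start a new one with this sentence.
if cos_similarity(embs[1], embs[2]) > cos_similarity(embs[0], embs[2]):
    print("boundary: start a new segment with this sentence")
else:
    print("no boundary: keep the sentence in the current segment")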