itdainb
/

PhoRanker

@@ -39,9 +39,9 @@ pipeline_tag: text-classification
 ## Installation
-  - Install `pyvi` to word segment:
-	- `pip install pyvi`
  -  Install `sentence-transformers` (recommended) - [Usage](#usage-with-sentence-transformers):
@@ -54,7 +54,9 @@ pipeline_tag: text-classification
 ## Pre-processing
 ```python
-from pyvi import ViTokenizer
 query = "Trường UIT là gì?"
 sentences = [
@@ -63,8 +65,8 @@ sentences = [
     "Quĩ uỷ thác đầu tư (tiếng Anh: Unit Investment Trusts; viết tắt: UIT) là một công ty đầu tư mua hoặc nắm giữ một danh mục đầu tư cố định"
 ]
-tokenized_query = ViTokenizer.tokenize(query)
-tokenized_sentences = [ViTokenizer.tokenize(sent) for sent in sentences]
 tokenized_pairs = [[tokenized_query, sent] for sent in tokenized_sentences]

 ## Installation
+  - Install `py_vncorenlp` (the Python wrapper for VnCoreNLP) for word segmentation:
+	- `pip install py_vncorenlp`
  -  Install `sentence-transformers` (recommended) - [Usage](#usage-with-sentence-transformers):
 ## Pre-processing
 ```python
+import py_vncorenlp
+py_vncorenlp.download_model(save_dir='/absolute/path/to/vncorenlp')
+rdrsegmenter = py_vncorenlp.VnCoreNLP(annotators=["wseg"], save_dir='/absolute/path/to/vncorenlp')
 query = "Trường UIT là gì?"
 sentences = [
     "Quĩ uỷ thác đầu tư (tiếng Anh: Unit Investment Trusts; viết tắt: UIT) là một công ty đầu tư mua hoặc nắm giữ một danh mục đầu tư cố định"
 ]
+tokenized_query = rdrsegmenter.word_segment(query)
+tokenized_sentences = [rdrsegmenter.word_segment(sent) for sent in sentences]
 tokenized_pairs = [[tokenized_query, sent] for sent in tokenized_sentences]