new5558 committed
Commit c64c467
Parent: f176525

docs: update readme

Files changed (1)
  README.md +29 -27
README.md CHANGED
@@ -1,10 +1,10 @@
  ---
  license: mit
  datasets:
+ - best2009
  - scb_mt_enth_2020
  - oscar
  - wikipedia
- - best2009
  language:
  - th
  library_name: transformers
@@ -27,7 +27,8 @@ pip install attacut
  To initialize the model from the hub, use the following commands
  ```python
  from transformers import AutoTokenizer, AutoModel
  from attacut import tokenize
+ import torch
 
  tokenizer = AutoTokenizer.from_pretrained("new5558/HoogBERTa")
  model = AutoModel.from_pretrained("new5558/HoogBERTa")
@@ -41,38 +42,39 @@ To annotate POS, NE, and clause boundary, use the following commands
  To extract token features based on the RoBERTa architecture, use the following commands
 
  ```python
+ model.eval()
+ sentence = "วันที่ 12 มีนาคมนี้ ฉันจะไปเที่ยววัดพระแก้ว ที่กรุงเทพ"
+ all_sent = []
+ sentences = sentence.split(" ")
+ for sent in sentences:
+     all_sent.append(" ".join(tokenize(sent)).replace("_","[!und:]"))
+
+ sentence = " _ ".join(all_sent)
+ tokenized_text = tokenizer(sentence, return_tensors = 'pt')
+ token_ids = tokenized_text['input_ids']
+
  with torch.no_grad():
-     model.eval()
-     sentence = "วันที่ 12 มีนาคมนี้ ฉันจะไปเที่ยววัดพระแก้ว ที่กรุงเทพ"
-     all_sent = []
-     sentences = sentence.split(" ")
-     for sent in sentences:
-         all_sent.append(" ".join(tokenize(sent)).replace("_","[!und:]"))
-
-     sentence = " _ ".join(all_sent)
-     tokenized_text = tokenizer(sentence, return_tensors = 'pt')
-     token_ids = tokenized_text['input_ids']
-     features = model(**tokenized_text)
+     features = model(**tokenized_text, output_hidden_states = True).hidden_states[-1]
  ```
 
  For batch processing,
 
  ```python
+ model.eval()
+ sentenceL = ["วันที่ 12 มีนาคมนี้","ฉันจะไปเที่ยววัดพระแก้ว ที่กรุงเทพ"]
+ inputList = []
+ for sentX in sentenceL:
+     sentences = sentX.split(" ")
+     all_sent = []
+     for sent in sentences:
+         all_sent.append(" ".join(tokenize(sent)).replace("_","[!und:]"))
+
+     sentence = " _ ".join(all_sent)
+     inputList.append(sentence)
+ tokenized_text = tokenizer(inputList, padding = True, return_tensors = 'pt')
+ token_ids = tokenized_text['input_ids']
  with torch.no_grad():
-     model.eval()
-     sentenceL = ["วันที่ 12 มีนาคมนี้","ฉันจะไปเที่ยววัดพระแก้ว ที่กรุงเทพ"]
-     inputList = []
-     for sentX in sentenceL:
-         sentences = sentX.split(" ")
-         all_sent = []
-         for sent in sentences:
-             all_sent.append(" ".join(tokenize(sent)).replace("_","[!und:]"))
-
-         sentence = " _ ".join(all_sent)
-         inputList.append(sentence)
-     tokenized_text = tokenizer(inputList, padding = True, return_tensors = 'pt')
-     token_ids = tokenized_text['input_ids']
-     features = model(**tokenized_text)
+     features = model(**tokenized_text, output_hidden_states = True).hidden_states[-1]
  ```
 
  To use HoogBERTa as an embedding layer, use
 
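For reference, both updated snippets share the same preprocessing recipe. Below is a minimal end-to-end sketch of the single-sentence path; the `preprocess` helper is illustrative and not part of the commit, and it assumes `torch`, `transformers`, and `attacut` are installed.

```python
# Illustrative sketch (not part of the commit): the shared preprocessing
# from both updated snippets, factored into one helper.
import torch
from transformers import AutoTokenizer, AutoModel
from attacut import tokenize

tokenizer = AutoTokenizer.from_pretrained("new5558/HoogBERTa")
model = AutoModel.from_pretrained("new5558/HoogBERTa")
model.eval()

def preprocess(sentence):
    # Word-segment each space-delimited chunk with attacut, escape literal
    # underscores as "[!und:]", then rejoin chunks with " _ " so the original
    # spaces survive subword tokenization.
    chunks = sentence.split(" ")
    segmented = [" ".join(tokenize(c)).replace("_", "[!und:]") for c in chunks]
    return " _ ".join(segmented)

# "On 12 March I will visit Wat Phra Kaew in Bangkok"
text = preprocess("วันที่ 12 มีนาคมนี้ ฉันจะไปเที่ยววัดพระแก้ว ที่กรุงเทพ")
inputs = tokenizer(text, return_tensors='pt')
with torch.no_grad():
    # Final-layer hidden states: one vector per subword token.
    features = model(**inputs, output_hidden_states=True).hidden_states[-1]
print(features.shape)  # (1, sequence_length, hidden_size)
```

Factoring the attacut segmentation and `[!und:]` escaping into one helper keeps the single-sentence and batch paths identical up to the final `tokenizer(...)` call, where the batch path adds `padding = True`.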