todd-cook committed
Commit 971409a
1 Parent(s): b3fa48a

updated for paper publication

Files changed (5)
  1. README.md +12 -8
  2. config.json +3 -3
  3. pytorch_model.bin +2 -2
  4. tf_model.h5 +2 -2
  5. vocab.txt +0 -0
README.md CHANGED
@@ -8,13 +8,15 @@ datasets:
 - Tesserae
 - Phi5
 - Thomas Aquinas
+- Patrologia Latina
 ---
 
 # Cicero-Similis
 
 ## Model description
 
-A Latin Language Model, trained on classical Latin texts that are reasonably close to Cicero's range of vocabulary as described in the forthcoming paper "What Would Cicero Write?".
+A Latin Language Model, trained on Latin texts and evaluated using the corpus of Cicero, as described in the paper _What Would Cicero Write? -- Examining Critical Textual Decisions with a Language Model_ by Todd Cook,
+published in Ciceroniana On Line, Vol. V, #2.
 
 ## Intended uses & limitations
 
@@ -26,7 +28,7 @@ Normalize text using JV Replacement and tokenize using CLTK to separate enclitic
 from transformers import BertForMaskedLM, AutoTokenizer, FillMaskPipeline
 tokenizer = AutoTokenizer.from_pretrained("cook/cicero-similis")
 model = BertForMaskedLM.from_pretrained("cook/cicero-similis")
-fill_mask = FillMaskPipeline(model=model, tokenizer=tokenizer)
+fill_mask = FillMaskPipeline(model=model, tokenizer=tokenizer, top_k=10_000)
 # Cicero, De Re Publica, VI, 32, 2
 # "animal" is found in A, Q, PhD manuscripts
 # 'anima' H^1 Macr. et codd. Tusc.
@@ -35,21 +37,23 @@ results = fill_mask("inanimum est enim omne quod pulsu agitatur externo; quod au
 
 #### Limitations and bias
 
-Biased towards Cicero, but that weakness is the model's strength; it's not aimed to be a one-size fits all model.
+Currently the model training data excludes modern and 19th-century texts, but that weakness is the model's strength; it's not aimed to be a one-size-fits-all model.
 
 ## Training data
 
-Trained on the corpora Phi5, Tesserae, and Thomas Aquinas--excluding documents that went outside the scope of Cicero's expected unknown vocabulary probabilities.
+Trained on the corpora Phi5, Tesserae, Thomas Aquinas, and Patrologia Latina.
 
 
 ## Training procedure
 
-5 epochs, masked language modeling .45, effective batch size 32
+5 epochs, masked language modeling with a masking probability of 0.15, effective batch size 32
 
 
 ## Eval results
-A novel evaluation metric is proposed in the forthcoming paper "What Would Cicero Write?"
+A novel evaluation metric is proposed in the paper _What Would Cicero Write? -- Examining Critical Textual Decisions with a Language Model_ by Todd Cook,
+published in Ciceroniana On Line, Vol. V, #2.
 
 ### BibTeX entry and citation info
-
-A paper will be published in Cicero Digitalis in 2021.
+TODO
+_What Would Cicero Write? -- Examining Critical Textual Decisions with a Language Model_ by Todd Cook,
+published in Ciceroniana On Line, Vol. V, #2.
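Note: the README's usage hunk above stops just before the `results = fill_mask(...)` call visible in the last hunk header. As a rough, self-contained illustration of how that pipeline output might be inspected for the two competing manuscript readings ("animal" vs. "anima"), here is a sketch. It is not part of the published example: the plain j/v character swap stands in for the CLTK JV replacement the README mentions, the full Latin sentence is restored from the standard text of the passage rather than copied from the truncated diff line, and the score lookup assumes both readings are single vocabulary tokens.

```python
from transformers import AutoTokenizer, BertForMaskedLM, FillMaskPipeline

tokenizer = AutoTokenizer.from_pretrained("cook/cicero-similis")
model = BertForMaskedLM.from_pretrained("cook/cicero-similis")
fill_mask = FillMaskPipeline(model=model, tokenizer=tokenizer, top_k=10_000)

def jv_normalize(text: str) -> str:
    """Crude stand-in for CLTK's JV replacement: j -> i, v -> u."""
    return text.translate(str.maketrans("jvJV", "iuIU"))

# Cicero, De Re Publica, VI, 32, 2 -- the disputed word is masked out.
masked = (
    jv_normalize("inanimum est enim omne quod pulsu agitatur externo; quod autem est ")
    + tokenizer.mask_token
    + jv_normalize(", id motu cietur interiore et suo.")
)

# FillMaskPipeline returns a list of {"score", "token", "token_str", "sequence"} dicts.
results = fill_mask(masked)
scores = {r["token_str"]: r["score"] for r in results}
for reading in ("animal", "anima"):
    print(reading, scores.get(reading))
```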
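The training procedure summarized in the README (5 epochs, masked language modeling with a masking probability of 0.15, effective batch size 32) maps naturally onto the transformers Trainer API. The sketch below is only one way to read that summary: the architecture numbers are copied from the config.json diff below, while the per-device batch size / gradient-accumulation split, the output directory, and the dataset handling are assumptions, not details published with the model.

```python
from transformers import (
    AutoTokenizer,
    BertConfig,
    BertForMaskedLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

# Tokenizer taken from the published repo for convenience; the original run
# would have trained its own WordPiece vocabulary over the Latin corpora.
tokenizer = AutoTokenizer.from_pretrained("cook/cicero-similis")

# Architecture mirroring the config.json diff below: 6 layers, hidden size 768,
# vocab size 25,000, dropout 0.2; everything else is a transformers default.
config = BertConfig(
    vocab_size=25_000,
    hidden_size=768,
    num_hidden_layers=6,
    hidden_dropout_prob=0.2,
    attention_probs_dropout_prob=0.2,
    type_vocab_size=1,
)
model = BertForMaskedLM(config)

# Masked language modeling with 15% of tokens masked, per the README.
collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

# Effective batch size 32, here as 8 per device x 4 accumulation steps
# (any split with the same product would satisfy the README).
args = TrainingArguments(
    output_dir="cicero-similis-train",  # hypothetical path
    num_train_epochs=5,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
)

# train_dataset would be the tokenized Phi5 / Tesserae / Thomas Aquinas /
# Patrologia Latina corpora; preparing it is outside this sketch.
# trainer = Trainer(model=model, args=args, data_collator=collator,
#                   train_dataset=train_dataset)
# trainer.train()
```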
config.json CHANGED
@@ -1,10 +1,10 @@
 {
-  "_name_or_path": "/Users/todd/PycharmProjects/cicero-similis",
+  "_name_or_path": "models/final",
   "architectures": [
     "BertForMaskedLM"
   ],
   "attention_probs_dropout_prob": 0.2,
-  "gradient_checkpointing": false,
+  "classifier_dropout": null,
   "hidden_act": "gelu",
   "hidden_dropout_prob": 0.2,
   "hidden_size": 768,
@@ -17,7 +17,7 @@
   "num_hidden_layers": 6,
   "pad_token_id": 0,
   "position_embedding_type": "absolute",
-  "transformers_version": "4.3.2",
+  "transformers_version": "4.15.0",
   "type_vocab_size": 1,
   "use_cache": true,
   "vocab_size": 25000
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0fc3843041c1a1a046c2c4bbf5d94a58ee08316a969628b974d179f80d59b0b0
-size 250989083
+oid sha256:15a9661486ed016a2ad717e37b7949d5617dca271e491359bc3ad260bb13f542
+size 253348914
tf_model.h5 CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f994bbe8e4ec14eb6dfa0d619f190b0e5a23e58d09205b7545f14385c244c47e
-size 327909792
+oid sha256:469f4cb1f094d290ffa8a66f77eabec58f22c57cd56fc536ee0a6a30a6b2aae0
+size 329499728
vocab.txt CHANGED
The diff for this file is too large to render. See raw diff