asi committed on
Commit
005bca6
1 Parent(s): ceecd11

:books: Add documentation items

Browse files
.idea/gpt-fr-cased-small.iml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <module type="PYTHON_MODULE" version="4">
3
+ <component name="NewModuleRootManager">
4
+ <content url="file://$MODULE_DIR$" />
5
+ <orderEntry type="inheritedJdk" />
6
+ <orderEntry type="sourceFolder" forTests="false" />
7
+ </component>
8
+ </module>
.idea/inspectionProfiles/Project_Default.xml ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <component name="InspectionProjectProfileManager">
2
+ <profile version="1.0">
3
+ <option name="myName" value="Project Default" />
4
+ <inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
5
+ <option name="ignoredPackages">
6
+ <value>
7
+ <list size="28">
8
+ <item index="0" class="java.lang.String" itemvalue="batcher" />
9
+ <item index="1" class="java.lang.String" itemvalue="tensorboard" />
10
+ <item index="2" class="java.lang.String" itemvalue="tqdm" />
11
+ <item index="3" class="java.lang.String" itemvalue="spacy" />
12
+ <item index="4" class="java.lang.String" itemvalue="spacy-conll" />
13
+ <item index="5" class="java.lang.String" itemvalue="pydevd-pycharm" />
14
+ <item index="6" class="java.lang.String" itemvalue="transformers" />
15
+ <item index="7" class="java.lang.String" itemvalue="scikit-learn" />
16
+ <item index="8" class="java.lang.String" itemvalue="torch" />
17
+ <item index="9" class="java.lang.String" itemvalue="torchvision" />
18
+ <item index="10" class="java.lang.String" itemvalue="m2r" />
19
+ <item index="11" class="java.lang.String" itemvalue="networkx" />
20
+ <item index="12" class="java.lang.String" itemvalue="mlflow" />
21
+ <item index="13" class="java.lang.String" itemvalue="nltk" />
22
+ <item index="14" class="java.lang.String" itemvalue="pytorch-lightning" />
23
+ <item index="15" class="java.lang.String" itemvalue="PyYAML" />
24
+ <item index="16" class="java.lang.String" itemvalue="torchtext" />
25
+ <item index="17" class="java.lang.String" itemvalue="tensorflow" />
26
+ <item index="18" class="java.lang.String" itemvalue="sentencepiece" />
27
+ <item index="19" class="java.lang.String" itemvalue="tensorflow-gpu" />
28
+ <item index="20" class="java.lang.String" itemvalue="pandas" />
29
+ <item index="21" class="java.lang.String" itemvalue="datasets" />
30
+ <item index="22" class="java.lang.String" itemvalue="tensorflow_hub" />
31
+ <item index="23" class="java.lang.String" itemvalue="cython-0.29.21" />
32
+ <item index="24" class="java.lang.String" itemvalue="sentence-transformers" />
33
+ <item index="25" class="java.lang.String" itemvalue="matplotlib" />
34
+ <item index="26" class="java.lang.String" itemvalue="gensim" />
35
+ <item index="27" class="java.lang.String" itemvalue="scikit_learn" />
36
+ </list>
37
+ </value>
38
+ </option>
39
+ </inspection_tool>
40
+ <inspection_tool class="PyPep8Inspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
41
+ <option name="ignoredErrors">
42
+ <list>
43
+ <option value="W605" />
44
+ </list>
45
+ </option>
46
+ </inspection_tool>
47
+ <inspection_tool class="PyUnreachableCodeInspection" enabled="false" level="WARNING" enabled_by_default="false" />
48
+ <inspection_tool class="PyUnresolvedReferencesInspection" enabled="false" level="WARNING" enabled_by_default="false" />
49
+ <inspection_tool class="SpellCheckingInspection" enabled="false" level="TYPO" enabled_by_default="false">
50
+ <option name="processCode" value="true" />
51
+ <option name="processLiterals" value="true" />
52
+ <option name="processComments" value="true" />
53
+ </inspection_tool>
54
+ </profile>
55
+ </component>
.idea/inspectionProfiles/profiles_settings.xml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <component name="InspectionProjectProfileManager">
2
+ <settings>
3
+ <option name="USE_PROJECT_PROFILE" value="false" />
4
+ <version value="1.0" />
5
+ </settings>
6
+ </component>
.idea/modules.xml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="ProjectModuleManager">
4
+ <modules>
5
+ <module fileurl="file://$PROJECT_DIR$/.idea/gpt-fr-cased-small.iml" filepath="$PROJECT_DIR$/.idea/gpt-fr-cased-small.iml" />
6
+ </modules>
7
+ </component>
8
+ </project>
.idea/workspace.xml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="RunDashboard">
4
+ <option name="ruleStates">
5
+ <list>
6
+ <RuleState>
7
+ <option name="name" value="ConfigurationTypeDashboardGroupingRule" />
8
+ </RuleState>
9
+ <RuleState>
10
+ <option name="name" value="StatusDashboardGroupingRule" />
11
+ </RuleState>
12
+ </list>
13
+ </option>
14
+ </component>
15
+ </project>
README.md CHANGED
@@ -17,7 +17,7 @@ license: apache-2.0
17
 
18
  <img src="imgs/logo.png" width="200">
19
 
20
- **GPT-fr** is a French GPT trained on a very large and heterogeneous French corpus. Models of different sizes are trained using the new CNRS (French National Centre for Scientific Research) [Jean Zay](http://www.idris.fr/eng/jean-zay/) supercomputer. We released the pre-trained weights for the following model sizes:
21
 
22
  | Model name | Number of layers | Attention Heads | Embedding Dimension | Total Parameters |
23
  | :------: | :---: | :---: | :---: | :---: |
@@ -26,12 +26,13 @@ license: apache-2.0
26
 
27
  ## Intended uses & limitations
28
 
29
- GPT is a generative model which can be leveraged for language generation tasks. Besides, many tasks may be formatted such that the output is directly generated in natural language. Such configuration may be used for automatic summary or question answering tasks.
30
 
31
  #### How to use
32
 
 
 
33
  ```python
34
- import torch
35
  from transformers import GPT2Tokenizer, GPT2LMHeadModel
36
 
37
  # Load pretrained model and tokenizer
@@ -53,14 +54,13 @@ beam_outputs = model.generate(
53
  num_return_sequences=1
54
  )
55
 
56
- print("Output:\\\\\\\\\\\\\\\\
57
- " + 100 * '-')
58
  print(tokenizer.decode(beam_outputs[0], skip_special_tokens=True))
59
  ```
60
 
61
  #### Limitations and bias
62
 
63
- Large pre-trained language models tend to reproduce the biases from the dataset used for pre-training, in particular gender discrimination. We sought to qualitatively assess the potential biases learned by the model. For example, we generated the following sentence sequence with the model using the top-k random sampling strategy with k=50 and stopping at the first punctuation element. "Ma femme/Mon mari vient d'obtenir un nouveau poste en tant qu'\\_\\_\\_\\_\\_\\_\\_":
64
 
65
  The position generated for the wife are:
66
 
@@ -84,11 +84,11 @@ The position generated for the husband are:
84
 
85
  ## Training data
86
 
87
- We created a dedicated corpus to train our generative model. Indeed the model uses a fixed-length context size of 1,024 and require long documents to be trained. We aggregated existing corpora: Wikipedia, OpenSubtitle (Tiedemann, 2012), Gutenberg and Common Crawl (Li et al., 2019). Corpora are filtered and separated into sentences. Successive sentences are then concatenated within the limit of 1024 tokens per document.
88
 
89
  ## Training procedure
90
 
91
- We pre-trained the model on a TPU v2-8 using the Google Colab inter-server.
92
 
93
  ## Eval results
94
 
@@ -102,4 +102,7 @@ In line with the [WikiText](https://blog.einstein.ai/the-wikitext-long-term-depe
102
  @inproceedings{...,
103
  year={2020}
104
  }
105
- ```
 
 
 
 
17
 
18
  <img src="imgs/logo.png" width="200">
19
 
20
+ **GPT-fr** is a French GPT trained on a very large and heterogeneous French corpus. We trained models of different sizes and released the pre-trained weights for the following model sizes:
21
 
22
  | Model name | Number of layers | Attention Heads | Embedding Dimension | Total Parameters |
23
  | :------: | :---: | :---: | :---: | :---: |
 
26
 
27
  ## Intended uses & limitations
28
 
29
+ GPT is a generative model which can be leveraged for language generation tasks. Besides, many tasks may be formatted such that the output is directly generated in natural language. Such a configuration may be used for tasks such as automatic summarization or question answering. We do hope our model might be used for both academic and industrial applications.
30
 
31
  #### How to use
32
 
33
+ The model might be used through the astonishing `Transformers` library:
34
+
35
  ```python
 
36
  from transformers import GPT2Tokenizer, GPT2LMHeadModel
37
 
38
  # Load pretrained model and tokenizer
 
54
  num_return_sequences=1
55
  )
56
 
57
+ print("Output:\n" + 100 * '-')
 
58
  print(tokenizer.decode(beam_outputs[0], skip_special_tokens=True))
59
  ```
60
 
61
  #### Limitations and bias
62
 
63
+ Large pre-trained language models tend to reproduce the biases from the dataset used for pre-training, in particular gender discrimination. As we do hope our model might be used for both academic and industrial applications, we try to limit such shortcomings by paying specific attention to our pre-training corpus and filtering explicit and offensive content. The corpus building steps are detailed in our paper. Nonetheless, we sought to qualitatively assess the potential biases learned by the model. We do appreciate your feedback to better assess such effects. For example, we generated the following sentence sequence with the model using the top-k random sampling strategy with k=50 and stopping at the first punctuation element. "Ma femme/Mon mari vient d'obtenir un nouveau poste en tant qu'\_\_\_\_\_\_\_":
64
 
65
  The position generated for the wife are:
66
 
 
84
 
85
  ## Training data
86
 
87
+ We created a dedicated corpus to train our generative model. Indeed the model uses a fixed-length context size of 1,024 and requires long documents to be trained. We aggregated existing corpora: [Wikipedia](https://dumps.wikimedia.org/frwiki/), [OpenSubtitle](http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2016/mono/) ([Tiedemann, 2012](#tiedemann-2012)), [Gutenberg](http://www.gutenberg.org). Corpora are filtered and separated into sentences. Successive sentences are then concatenated within the limit of 1,024 tokens per document.
88
 
89
  ## Training procedure
90
 
91
+ We pre-trained the model on a TPU v2-8 using the amazing [Google Colab](https://colab.research.google.com) infrastructure.
92
 
93
  ## Eval results
94
 
 
102
  @inproceedings{...,
103
  year={2020}
104
  }
105
+ ```
106
+ ### References
107
+
108
+ ><div name="tiedemann-2012">Jörg Tiedemann: Parallel Data, Tools and Interfaces in OPUS. LREC 2012: 2214-2218</div>