version: "0.8.0"

corpusPath: "./resources/dataset/dataseer/corpus"
templatePath: "./resources/dataset/dataseer/crfpp-templates/dataseer.template"
grobidHome: "/opt/grobid/grobid-home"
tmpPath: "/opt/grobid/grobid-home/tmp/"
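# note: grobidHome is expected to point to a valid grobid-home installation
# (native libraries and resources used for PDF processing); tmpPath is used
# for temporary files created while processing documents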

# path to Pub2TEI repository as available at https://github.com/kermitt2/Pub2TEI
pub2teiPath: "/opt/Pub2TEI/"

gluttonHost: "https://cloud.science-miner.com/glutton"
gluttonPort: 
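# note: biblio-glutton is used for bibliographical metadata matching and
# consolidation; the port can presumably stay empty when the protocol and
# path are already part of the host URL, as above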

# entity-fishing server information for performing entity disambiguation
# for https, indicate 443 as the port
entityFishingHost: cloud.science-miner.com/nerd
entityFishingPort: 443
#entityFishingHost: localhost
#entityFishingPort: 8090

# if true, use binary classifiers for the contexts, otherwise use a single multi-label classifier
# binary classifiers perform better, but are heavier to run
useBinaryContextClassifiers: false

# models used by the service (sequence labeling and classification)
models:

  # sequence labeling model for zones (identify data-related sections)
  - name: "dataseer"
    engine: "wapiti"
    #engine: "delft"
    wapiti:
      # wapiti training parameters, used at training time only
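      # epsilon and window define the stopping criterion: training stops when
      # the error improvement over the last `window` iterations falls below
      # epsilon; nbMaxIterations caps the total number of training iterations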
      epsilon: 0.00001
      window: 20
      nbMaxIterations: 2000

  # classifier model, dataset binary (dataset or no dataset in the current sentence)
  - name: "dataseer-binary"
    engine: "delft"
    delft:
      # deep learning parameters
      #architecture: "gru"
      architecture: "bert"
      #embeddings_name: "word2vec"
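      # SciBERT: a BERT model pre-trained on scientific text, well suited to
      # sentences from research articles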
      transformer: "allenai/scibert_scivocab_cased"

  # identification of the data type (first level of the hierarchy)
  - name: "dataseer-first"
    engine: "delft"
    delft:
      # deep learning parameters
      #architecture: "gru"
      architecture: "bert"
      #embeddings_name: "word2vec"
      transformer: "allenai/scibert_scivocab_cased"

  # mention context classification (re-uses the binary classifier configuration for the moment)
  - name: "dataseer-reuse"
    engine: "delft"
    delft:
      # deep learning parameters
      #architecture: "gru"
      architecture: "bert"
      #embeddings_name: "word2vec"
      transformer: "allenai/scibert_scivocab_cased"

  # model for dataset mention recognition
  - name: "datasets"
    #engine: "wapiti"
    engine: "delft"
    wapiti:
      # wapiti training parameters, used at training time only
      epsilon: 0.00001
      window: 20
      nbMaxIterations: 2000
    delft:
      # deep learning parameters
      #architecture: "BidLSTM_CRF"
      architecture: "BERT_CRF"
      #transformer: "allenai/scibert_scivocab_cased"
      transformer: "michiyasunaga/LinkBERT-basecased"
      #useELMo: true
      #embeddings_name: "glove-840B"
      runtime:
        # parameters used at runtime/prediction
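        # max_sequence_length: inputs are truncated to this number of tokens;
        # batch_size: larger batches speed up inference but need more GPU memory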
        max_sequence_length: 200
        #max_sequence_length: 300
        batch_size: 20

  - name: "context"
    engine: "delft"
    delft:
      #architecture: "gru"
      #embeddings_name: "glove-840B"
      architecture: "bert"
      transformer: "michiyasunaga/LinkBERT-basecased"

  - name: "context_used"
    engine: "delft"
    delft:
      #architecture: "gru"
      #embeddings_name: "glove-840B"
      architecture: "bert"
      transformer: "michiyasunaga/LinkBERT-basecased"

  - name: "context_creation"
    engine: "delft"
    delft:
      #architecture: "gru"
      #embeddings_name: "glove-840B"
      architecture: "bert"
      transformer: "michiyasunaga/LinkBERT-basecased"

  - name: "context_shared"
    engine: "delft"
    delft:
      #architecture: "gru"
      #embeddings_name: "glove-840B"
      architecture: "bert"
      transformer: "michiyasunaga/LinkBERT-basecased"

# Limit the maximum number of parallel requests (0 = no limit)
maxParallelRequests: 0

# CORS configuration for the web API service
corsAllowedOrigins: "*"
corsAllowedMethods: "OPTIONS,GET,PUT,POST,DELETE,HEAD"
corsAllowedHeaders: "X-Requested-With,Content-Type,Accept,Origin"

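# Dropwizard server configuration: the REST API is exposed on the application
# connector (port 8060), the admin/healthcheck endpoints on port 8061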
server:
  type: custom
  idleTimeout: 120 seconds
  applicationConnectors:
    - type: http
      port: 8060
  adminConnectors:
    - type: http
      port: 8061
  registerDefaultExceptionMappers: false
  maxThreads: 2048
  maxQueuedRequests: 2048
  acceptQueueSize: 2048
  requestLog:
    appenders: []

# these logging settings apply to the service usage mode
logging:
  level: INFO
  loggers:
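    # silence noisy third-party loggers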
    org.apache.pdfbox.pdmodel.font.PDSimpleFont: "OFF"
    org.glassfish.jersey.internal: "OFF"
    com.squarespace.jersey2.guice.JerseyGuiceUtils: "OFF"
  appenders:
    - type: console
      threshold: INFO
      timeZone: UTC
      # uncomment to have the logs in JSON format
      #layout:
      #  type: json
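
# once the service is running, the API and demo console should be reachable at
# http://localhost:8060 (adjust to the connector settings above)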