sl_core/workflow.xml at main · parallelworks/sl_core · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
<tool id='sl_core' name='sl_core'>
  <command interpreter='bash'>workflow.sh</command>
  <inputs>
    <!-- Form section for the Git repositories and branch, the compute resource, and the Miniconda environment. -->
    <section name='repos' type='section' title='Repositories, branches, resources, and environments' expanded='true'>
      <!-- ML archive repository. Per the help text, the SSH form is required so deploy keys can be used to push. -->
      <param name='ml_arch_repo'
             label='ML archive repository'
             type='text'
             value='git@github.com:parallelworks/dynamic-learning-rivers'
             width='50%_none%'
             help='Repo MUST have SSH prefix, e.g. git@github.com:org/repo, for SSH auth w/ deploy keys to push'>
      </param>
      <!-- Branch of the ML archive repository. The width token matches the other half-width
           fields in this section (it was previously misspelled as '50%_node%'). -->
      <param name='ml_arch_branch'
             label='ML archive BRANCH'
             type='text'
             value='test-auto-workflow'
             width='50%_none%'>
      </param>
      <!-- Yes/No toggle, checked by default, for pushing results to GitHub. -->
      <param name='push_to_gh'
             type='boolean'
             truevalue='Yes'
             falsevalue='No'
             checked='True'
             label='Push results to GitHub'
             help='Select yes to push results to GitHub. A deploy key for the repo is required.'
             width='25%_none%'
             float='right'>
      </param>
      <!-- ML code repository, given as an HTTPS URL. -->
      <param name='ml_code_repo'
             label='ML code repository'
             type='text'
             value='https://github.com/parallelworks/sl_core'
             width='50%_none%'>
      </param>
      <!-- ML data repository, given as an HTTPS URL. -->
      <param name='ml_data_repo'
             label='ML data repository'
             type='text'
             value='https://github.com/parallelworks/global-river-databases'
             width='50%_none%'>
      </param>
      <!-- Cluster selector. NOTE(review): hideDisconnectedResources='False' presumably keeps
           disconnected resources in the drop down; confirm against the platform documentation. -->
      <param name='whost'
             label='Workflow host (cluster name)'
             type='computeResource'
             hideDisconnectedResources='False'
             width='50%_none%'
             help='Name of cluster to run on. Select a resource from drop down menu.'>
      </param>
      <!-- Miniconda install prefix on the cluster. Per the help text, __USER__ is substituted automatically. -->
      <param name='miniconda_loc'
             label='Location (prefix) of Miniconda installation'
             type='text'
             value='/home/__USER__/.miniconda3'
             width='50%_none%'
             help='Location of miniconda on cluster.  __USER__ is auto substituted.'>
      </param>
      <!-- Name of the Miniconda environment to use. The help text warns against using the base environment. -->
      <param name='my_env'
             label='Name of Miniconda environment'
             type='text'
             value='superlearner'
             width='50%_none%'
             help='Specify name of Miniconda environment to use. There is a bug with using base!'>
      </param>
    </section>
    <!-- Form section for the SuperLearner run: data files, configuration, toggles, and parallelism. Collapsed by default. -->
    <section name='superlearner' type='section' title='SuperLearner configuration' expanded='False'>

      <!-- Data files and configuration paths -->

      <!-- Training/testing CSV; per the help text the path is relative to the ML archive repo. -->
      <param
          name='train_test_data'
          type='text'
          label='Data for training and testing [.csv] [Inputs, Outputs]'
          help='The header contains the parameter names; the inputs must occupy the first columns; path wrt ML archive repo.'
          value='scripts/prep_06_output_final_train.csv'
          width='50%'>
      </param>
      <!-- Prediction data. NOTE(review): the default has no .csv extension, unlike train_test_data,
           and the label says "[Inputs, Outputs]" while the help says "only inputs".
           Confirm both against workflow.sh before changing either. -->
      <param
          name='predict_data'
          type='text'
          label='Data for predicting [.csv] [Inputs, Outputs]'
          help='The header contains the parameter names; only inputs; path wrt ML archive repo.'
          value='scripts/prep_06_output_final_predict'
          width='50%'>
      </param>
      <!-- Number of input columns. NOTE(review): declared as free text rather than integer; confirm that is intended. -->
      <param
          name='num_inputs'
          type='text'
          label='Number of inputs'
          value='25'
          width='25%'>
      </param>
      <!-- SuperLearner configuration file; per the help text the path is relative to the ML code repo. -->
      <param
          name='superlearner_conf'
          type='text'
          label='SuperLearner configuration'
          help='SuperLearner configuration; path wrt ML code repo'
          value='sample_inputs/superlearner_conf.py'
          width='50%_none%'>
      </param>
      <!-- Prefix of the output location inside the archive repo. -->
      <param
          name='work_dir_base'
          type='text'
          label='Working directory basename wrt archive repo'
          help='Prefix/base of the location where to store all SL output in the archive repo. Test!'
          value='ml_models/sl_'
          width='50%_none%'>
      </param>

      <!-- Yes/No toggles, all unchecked by default -->

      <!-- Hyperparameter optimization. -->
      <param
          name='hpo'
          type='boolean'
          label='Run hyperparameter optimization?'
          help='Select yes for hyperparameter tuning'
          truevalue='Yes'
          falsevalue='No'
          checked='False'
          width='25%_none%'
          float='right'>
      </param>
      <!-- Cross-validation over the entire data set. -->
      <param
          name='cross_val_score'
          type='boolean'
          label='Run cross-validation?'
          help='Select yes to run cross-validation on the entire data set'
          truevalue='Yes'
          falsevalue='No'
          checked='False'
          width='25%_none%'
          float='right'>
      </param>
      <!-- Augment the training data with synthetic data built by SMOGN. -->
      <param
          name='smogn'
          type='boolean'
          label='Use SMOGN to build synthetic data'
          help='Select yes to augment training data with synthetic data.'
          truevalue='Yes'
          falsevalue='No'
          checked='False'
          width='25%_none%'
          float='right'>
      </param>
      <!-- Save the trained models in ONNX format. -->
      <param
          name='onnx'
          type='boolean'
          label='Save SuperLearner models in ONNX format'
          help='Select yes to save in ONNX format (more portable than .pkl)'
          truevalue='Yes'
          falsevalue='No'
          checked='False'
          width='25%_none%'
          float='right'>
      </param>

      <!-- Parallelism and analysis settings -->

      <!-- Single-choice Joblib backend; loky is preselected. -->
      <param
          name='backend'
          type='select'
          label='Joblib backend'
          help='Select Backend'
          multiple='false'
          width='50%'>
        <option value='loky' selected='true'>loky</option>
        <option value='multiprocessing'>multiprocessing</option>
        <option value='threading'>threading</option>
        <option value='dask'>dask</option>
      </param>
      <!-- Parallel jobs on the worker; integer from 1 to 100, default 8. -->
      <param
          name='n_jobs'
          type='integer'
          label='Number of parallel jobs on worker'
          value='8'
          min='1'
          max='100'
          width='25%'>
      </param>
      <!-- Number of SuperLearner instances to train; integer from 1 to 100, default 10. -->
      <param
          name='num_inst'
          type='integer'
          label='Number of SuperLearner instances (duplicate stacked-regressors trained)'
          value='10'
          min='1'
          max='100'
          width='25%'>
      </param>
      <!-- FPI correlation cutoff; integer from 1 to 99, default 50, on a 0 to 100 scale per the help text. -->
      <param
          name='fpi_corr_cutoff'
          type='integer'
          label='Correlation cutoff for FPI'
          help='Features correlated above this level will be grouped in FPI analysis. This is correlation on scale of 0 to 100 (instead of 0 to 1), but not R^2!'
          value='50'
          min='1'
          max='99'
          width='25%'>
      </param>
    </section>
  </inputs>
  <outputs>
  </outputs>
</tool>