line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Paws::MachineLearning::S3DataSpec; |
2
|
1
|
|
|
1
|
|
754
|
use Moose; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
11
|
|
3
|
|
|
|
|
|
|
has DataLocationS3 => (is => 'ro', isa => 'Str', required => 1); |
4
|
|
|
|
|
|
|
has DataRearrangement => (is => 'ro', isa => 'Str'); |
5
|
|
|
|
|
|
|
has DataSchema => (is => 'ro', isa => 'Str'); |
6
|
|
|
|
|
|
|
has DataSchemaLocationS3 => (is => 'ro', isa => 'Str'); |
7
|
|
|
|
|
|
|
1; |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
### main pod documentation begin ### |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
=head1 NAME |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
Paws::MachineLearning::S3DataSpec |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
=head1 USAGE |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
This class represents one of two things: |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
=head3 Arguments in a call to a service |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
Use the attributes of this class as arguments to methods. You shouldn't make instances of this class. |
22
|
|
|
|
|
|
|
Each attribute should be used as a named argument in the calls that expect this type of object. |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
As an example, if Att1 is expected to be a Paws::MachineLearning::S3DataSpec object: |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
$service_obj->Method(Att1 => { DataLocationS3 => $value, ..., DataSchemaLocationS3 => $value }); |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
=head3 Results returned from an API call |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
Use accessors for each attribute. If Att1 is expected to be a Paws::MachineLearning::S3DataSpec object: |
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
$result = $service_obj->Method(...); |
33
|
|
|
|
|
|
|
$result->Att1->DataLocationS3 |
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
=head1 DESCRIPTION |
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
Describes the data specification of a C<DataSource>. |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
=head1 ATTRIBUTES |
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
=head2 B<REQUIRED> DataLocationS3 => Str |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
The location of the data file(s) used by a C<DataSource>. The URI |
45
|
|
|
|
|
|
|
specifies a data file or an Amazon Simple Storage Service (Amazon S3) |
46
|
|
|
|
|
|
|
directory or bucket containing data files. |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
=head2 DataRearrangement => Str |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
A JSON string that represents the splitting and rearrangement |
52
|
|
|
|
|
|
|
processing to be applied to a C<DataSource>. If the |
53
|
|
|
|
|
|
|
C<DataRearrangement> parameter is not provided, all of the input data |
54
|
|
|
|
|
|
|
is used to create the C<Datasource>. |
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
There are multiple parameters that control what data is used to create |
57
|
|
|
|
|
|
|
a datasource: |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
=over |
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
=item * |
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
B<C<percentBegin>> |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
Use C<percentBegin> to indicate the beginning of the range of the data |
66
|
|
|
|
|
|
|
used to create the Datasource. If you do not include C<percentBegin> |
67
|
|
|
|
|
|
|
and C<percentEnd>, Amazon ML includes all of the data when creating the |
68
|
|
|
|
|
|
|
datasource. |
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
=item * |
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
B<C<percentEnd>> |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
Use C<percentEnd> to indicate the end of the range of the data used to |
75
|
|
|
|
|
|
|
create the Datasource. If you do not include C<percentBegin> and |
76
|
|
|
|
|
|
|
C<percentEnd>, Amazon ML includes all of the data when creating the |
77
|
|
|
|
|
|
|
datasource. |
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
=item * |
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
B<C<complement>> |
82
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
The C<complement> parameter instructs Amazon ML to use the data that is |
84
|
|
|
|
|
|
|
not included in the range of C<percentBegin> to C<percentEnd> to create |
85
|
|
|
|
|
|
|
a datasource. The C<complement> parameter is useful if you need to |
86
|
|
|
|
|
|
|
create complementary datasources for training and evaluation. To create |
87
|
|
|
|
|
|
|
a complementary datasource, use the same values for C<percentBegin> and |
88
|
|
|
|
|
|
|
C<percentEnd>, along with the C<complement> parameter. |
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
For example, the following two datasources do not share any data, and |
91
|
|
|
|
|
|
|
can be used to train and evaluate a model. The first datasource has 25 |
92
|
|
|
|
|
|
|
percent of the data, and the second one has 75 percent of the data. |
93
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
Datasource for evaluation: C<{"splitting":{"percentBegin":0, |
95
|
|
|
|
|
|
|
"percentEnd":25}}> |
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
Datasource for training: C<{"splitting":{"percentBegin":0, |
98
|
|
|
|
|
|
|
"percentEnd":25, "complement":"true"}}> |
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
=item * |
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
B<C<strategy>> |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
To change how Amazon ML splits the data for a datasource, use the |
105
|
|
|
|
|
|
|
C<strategy> parameter. |
106
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
The default value for the C<strategy> parameter is C<sequential>, |
108
|
|
|
|
|
|
|
meaning that Amazon ML takes all of the data records between the |
109
|
|
|
|
|
|
|
C<percentBegin> and C<percentEnd> parameters for the datasource, in the |
110
|
|
|
|
|
|
|
order that the records appear in the input data. |
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
The following two C<DataRearrangement> lines are examples of |
113
|
|
|
|
|
|
|
sequentially ordered training and evaluation datasources: |
114
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
Datasource for evaluation: C<{"splitting":{"percentBegin":70, |
116
|
|
|
|
|
|
|
"percentEnd":100, "strategy":"sequential"}}> |
117
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
Datasource for training: C<{"splitting":{"percentBegin":70, |
119
|
|
|
|
|
|
|
"percentEnd":100, "strategy":"sequential", "complement":"true"}}> |
120
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
To randomly split the input data into the proportions indicated by the |
122
|
|
|
|
|
|
|
C<percentBegin> and C<percentEnd> parameters, set the C<strategy> parameter |
123
|
|
|
|
|
|
|
to C<random> and provide a string that is used as the seed value for |
124
|
|
|
|
|
|
|
the random data splitting (for example, you can use the S3 path to your |
125
|
|
|
|
|
|
|
data as the random seed string). If you choose the random split |
126
|
|
|
|
|
|
|
strategy, Amazon ML assigns each row of data a pseudo-random number |
127
|
|
|
|
|
|
|
between 0 and 100, and then selects the rows that have an assigned |
128
|
|
|
|
|
|
|
number between C<percentBegin> and C<percentEnd>. Pseudo-random numbers |
129
|
|
|
|
|
|
|
are assigned using both the input seed string value and the byte offset |
130
|
|
|
|
|
|
|
as a seed, so changing the data results in a different split. Any |
131
|
|
|
|
|
|
|
existing ordering is preserved. The random splitting strategy ensures |
132
|
|
|
|
|
|
|
that variables in the training and evaluation data are distributed |
133
|
|
|
|
|
|
|
similarly. It is useful in the cases where the input data may have an |
134
|
|
|
|
|
|
|
implicit sort order, which would otherwise result in training and |
135
|
|
|
|
|
|
|
evaluation datasources containing non-similar data records. |
136
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
The following two C<DataRearrangement> lines are examples of |
138
|
|
|
|
|
|
|
non-sequentially ordered training and evaluation datasources: |
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
Datasource for evaluation: C<{"splitting":{"percentBegin":70, |
141
|
|
|
|
|
|
|
"percentEnd":100, "strategy":"random", |
142
|
|
|
|
|
|
|
"randomSeed":"s3://my_s3_path/bucket/file.csv"}}> |
143
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
Datasource for training: C<{"splitting":{"percentBegin":70, |
145
|
|
|
|
|
|
|
"percentEnd":100, "strategy":"random", |
146
|
|
|
|
|
|
|
"randomSeed":"s3://my_s3_path/bucket/file.csv", "complement":"true"}}> |
147
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
=back |
149
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
=head2 DataSchema => Str |
153
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
A JSON string that represents the schema for an Amazon S3 |
155
|
|
|
|
|
|
|
C<DataSource>. The C<DataSchema> defines the structure of the |
156
|
|
|
|
|
|
|
observation data in the data file(s) referenced in the C<DataSource>. |
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
You must provide either the C<DataSchema> or the |
159
|
|
|
|
|
|
|
C<DataSchemaLocationS3>. |
160
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
Define your C<DataSchema> as a series of key-value pairs. C<attributes> |
162
|
|
|
|
|
|
|
and C<excludedVariableNames> have an array of key-value pairs for their |
163
|
|
|
|
|
|
|
value. Use the following format to define your C<DataSchema>. |
164
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
{ "version": "1.0", |
166
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
"recordAnnotationFieldName": "F1", |
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
"recordWeightFieldName": "F2", |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
"targetFieldName": "F3", |
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
"dataFormat": "CSV", |
174
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
"dataFileContainsHeader": true, |
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
"attributes": [ |
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
{ "fieldName": "F1", "fieldType": "TEXT" }, { "fieldName": "F2", |
180
|
|
|
|
|
|
|
"fieldType": "NUMERIC" }, { "fieldName": "F3", "fieldType": |
181
|
|
|
|
|
|
|
"CATEGORICAL" }, { "fieldName": "F4", "fieldType": "NUMERIC" }, { |
182
|
|
|
|
|
|
|
"fieldName": "F5", "fieldType": "CATEGORICAL" }, { "fieldName": "F6", |
183
|
|
|
|
|
|
|
"fieldType": "TEXT" }, { "fieldName": "F7", "fieldType": |
184
|
|
|
|
|
|
|
"WEIGHTED_INT_SEQUENCE" }, { "fieldName": "F8", "fieldType": |
185
|
|
|
|
|
|
|
"WEIGHTED_STRING_SEQUENCE" } ], |
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
"excludedVariableNames": [ "F6" ] } |
188
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
=head2 DataSchemaLocationS3 => Str |
191
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
Describes the schema location in Amazon S3. You must provide either the |
193
|
|
|
|
|
|
|
C<DataSchema> or the C<DataSchemaLocationS3>. |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
=head1 SEE ALSO |
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
This class forms part of L<Paws>, describing an object used in L<Paws::MachineLearning>. |
200
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
=head1 BUGS and CONTRIBUTIONS |
202
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
The source code is located here: https://github.com/pplu/aws-sdk-perl |
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
Please report bugs to: https://github.com/pplu/aws-sdk-perl/issues |
206
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
=cut |
208
|
|
|
|
|
|
|
|